use crate::error::{MLError, Result};
use scirs2_core::ndarray::{Array1, Array2, Axis};
use scirs2_core::random::prelude::*;
use scirs2_core::random::Rng;
use super::config::{
DimensionalityReduction, FeatureSelection, MissingValueStrategy, NoiseFiltering,
NormalizationType, PreprocessingConfig,
};
/// Stateful preprocessing pipeline.
///
/// `fit` learns the parameters and `transform` applies them in a fixed order:
/// normalization, then feature selection (if fitted), then dimensionality
/// reduction (if fitted).
#[derive(Debug)]
pub struct DataPreprocessor {
/// Settings supplied to `new`; selects the normalization type and whether
/// feature selection / dimensionality reduction are fitted.
config: PreprocessingConfig,
/// Set to `true` at the end of a successful `fit`; `transform` refuses to
/// run while it is `false`.
fitted: bool,
/// Per-feature statistics stored by `fit`; `None` until then.
normalization_params: Option<NormalizationParams>,
/// Fitted feature selector; only populated by `fit` when
/// `config.feature_selection` is set.
feature_selector: Option<FeatureSelector>,
/// Fitted reducer; only populated by `fit` when
/// `config.dimensionality_reduction` is set.
dimensionality_reducer: Option<DimensionalityReducer>,
}
/// Per-feature (per-column) statistics used by the normalization step.
///
/// Every array has one entry per column of the data the statistics were
/// computed from.
#[derive(Debug, Clone)]
pub struct NormalizationParams {
/// Column means (`0.0` when the mean is unavailable).
pub means: Array1<f64>,
/// Column standard deviations computed with `ddof = 0.0`.
pub stds: Array1<f64>,
/// Column minima.
pub mins: Array1<f64>,
/// Column maxima.
pub maxs: Array1<f64>,
}
/// Result of fitting feature selection.
#[derive(Debug)]
pub struct FeatureSelector {
/// Indices of the columns that are kept, ordered from highest to lowest
/// score.
pub selected_features: Vec<usize>,
/// Score assigned to every input feature (one entry per column).
pub feature_scores: Array1<f64>,
}
/// Linear projection produced by fitting dimensionality reduction.
#[derive(Debug)]
pub struct DimensionalityReducer {
/// Projection matrix; data is multiplied by its transpose, so each row is
/// one output component and each column one input feature.
pub components: Array2<f64>,
/// One value per component, as reported by the fitting routine.
pub explained_variance: Array1<f64>,
/// Number of output dimensions.
pub target_dim: usize,
}
impl DataPreprocessor {
pub fn new(config: PreprocessingConfig) -> Self {
DataPreprocessor {
config,
fitted: false,
normalization_params: None,
feature_selector: None,
dimensionality_reducer: None,
}
}
/// Fits the preprocessor on `data` and returns the transformed `data`.
///
/// # Errors
///
/// Returns the first error produced by `fit` or, if fitting succeeded, by
/// `transform`.
pub fn fit_transform(&mut self, data: &Array2<f64>) -> Result<Array2<f64>> {
    self.fit(data).and_then(|()| self.transform(data))
}
/// Learns all preprocessing parameters from `data`.
///
/// The stages are fitted in pipeline order, each on the output of the
/// previous one:
///
/// 1. normalization statistics (always),
/// 2. feature selection, when `config.feature_selection` is set,
/// 3. dimensionality reduction, when `config.dimensionality_reduction` is set.
///
/// On success the preprocessor is marked as fitted.
///
/// # Errors
///
/// Propagates any error returned by one of the stage-fitting helpers.
pub fn fit(&mut self, data: &Array2<f64>) -> Result<()> {
    // Normalize straight from `data`; `apply_normalization` already returns a
    // fresh array, so no extra up-front clone is needed.
    let params = self.compute_normalization_params(data);
    let mut current_data = self.apply_normalization(data, &params)?;
    self.normalization_params = Some(params);

    if self.config.feature_selection.is_some() {
        let selector = self.fit_feature_selector(&current_data)?;
        // The reducer must be fitted on the already-selected columns.
        current_data = self.apply_feature_selection(&current_data, &selector)?;
        self.feature_selector = Some(selector);
    }

    if self.config.dimensionality_reduction.is_some() {
        self.dimensionality_reducer = Some(self.fit_dimensionality_reducer(&current_data)?);
    }

    self.fitted = true;
    Ok(())
}
/// Applies the fitted pipeline to `data`.
///
/// Stages run in this order, each only if it was fitted: normalization,
/// feature selection, dimensionality reduction.
///
/// # Errors
///
/// Returns `MLError::MLOperationError` when
/// * the preprocessor has not been fitted, or
/// * `data` has a different number of columns than the data the
///   normalization statistics were computed from (previously this either
///   panicked further down the pipeline or silently used mismatched
///   statistics).
///
/// Errors from the individual stages are propagated unchanged.
pub fn transform(&self, data: &Array2<f64>) -> Result<Array2<f64>> {
    if !self.fitted {
        return Err(MLError::MLOperationError(
            "Preprocessor must be fitted before transform".to_string(),
        ));
    }

    let mut transformed = match &self.normalization_params {
        Some(params) => {
            let expected = params.means.len();
            if data.ncols() != expected {
                return Err(MLError::MLOperationError(format!(
                    "Expected {} features but got {}",
                    expected,
                    data.ncols()
                )));
            }
            // `apply_normalization` returns a new array, so `data` does not
            // have to be cloned first.
            self.apply_normalization(data, params)?
        }
        None => data.clone(),
    };

    if let Some(selector) = &self.feature_selector {
        transformed = self.apply_feature_selection(&transformed, selector)?;
    }
    if let Some(reducer) = &self.dimensionality_reducer {
        transformed = self.apply_dimensionality_reduction(&transformed, reducer)?;
    }
    Ok(transformed)
}
fn compute_normalization_params(&self, data: &Array2<f64>) -> NormalizationParams {
let n_features = data.ncols();
let mut means = Array1::zeros(n_features);
let mut stds = Array1::zeros(n_features);
let mut mins = Array1::zeros(n_features);
let mut maxs = Array1::zeros(n_features);
for j in 0..n_features {
let column = data.column(j);
means[j] = column.mean().unwrap_or(0.0);
stds[j] = column.std(0.0);
mins[j] = column.fold(f64::INFINITY, |a, &b| a.min(b));
maxs[j] = column.fold(f64::NEG_INFINITY, |a, &b| a.max(b));
}
NormalizationParams {
means,
stds,
mins,
maxs,
}
}
/// Returns a copy of `data` with every column scaled according to
/// `self.config.normalization`.
///
/// * `ZScore`  – subtracts `params.means[j]`, divides by `params.stds[j]`.
/// * `MinMax`  – subtracts `params.mins[j]`, divides by `maxs[j] - mins[j]`.
/// * `Robust`  – subtracts the column median, divides by the inter-quartile
///   range (elements at sorted positions `len / 4` and `3 * len / 4`).
/// * `Quantum` – divides the column by its Euclidean (L2) norm.
///
/// A column whose scale (std, range, IQR or norm) is not above `1e-8` is
/// left untouched so that no division by (near-)zero takes place.
///
/// Input without rows is returned unchanged; this also keeps the quantile
/// indexing of the `Robust` branch in bounds (it used to panic there).
///
/// NOTE(review): `Robust` and `Quantum` derive their statistics from the
/// `data` argument itself rather than from `params`, so `transform` on new
/// data does not reuse statistics learned during `fit` for these two modes.
fn apply_normalization(
    &self,
    data: &Array2<f64>,
    params: &NormalizationParams,
) -> Result<Array2<f64>> {
    let mut normalized = data.clone();
    if data.nrows() == 0 {
        return Ok(normalized);
    }

    match self.config.normalization {
        NormalizationType::ZScore => {
            for j in 0..data.ncols() {
                let (mean, std) = (params.means[j], params.stds[j]);
                if std > 1e-8 {
                    let mut column = normalized.column_mut(j);
                    column.mapv_inplace(|x| (x - mean) / std);
                }
            }
        }
        NormalizationType::MinMax => {
            for j in 0..data.ncols() {
                let min = params.mins[j];
                let range = params.maxs[j] - min;
                if range > 1e-8 {
                    let mut column = normalized.column_mut(j);
                    column.mapv_inplace(|x| (x - min) / range);
                }
            }
        }
        NormalizationType::Robust => {
            for j in 0..data.ncols() {
                let mut sorted: Vec<f64> = data.column(j).to_vec();
                // `total_cmp` is a genuine total order even with NaN values;
                // a comparator that is not one may make the sort panic.
                sorted.sort_unstable_by(f64::total_cmp);

                // `len >= 1` is guaranteed by the empty-input guard above.
                let len = sorted.len();
                let mid = len / 2;
                let median = if len % 2 == 0 {
                    (sorted[mid - 1] + sorted[mid]) / 2.0
                } else {
                    sorted[mid]
                };
                let q1 = sorted[len / 4];
                let q3 = sorted[3 * len / 4];
                let iqr = q3 - q1;

                if iqr > 1e-8 {
                    let mut column = normalized.column_mut(j);
                    column.mapv_inplace(|x| (x - median) / iqr);
                }
            }
        }
        NormalizationType::Quantum => {
            for j in 0..data.ncols() {
                let mut column = normalized.column_mut(j);
                let norm = column.dot(&column).sqrt();
                if norm > 1e-8 {
                    column.mapv_inplace(|x| x / norm);
                }
            }
        }
    }
    Ok(normalized)
}
/// Scores every feature with the configured method and keeps the
/// best-scoring half.
///
/// The number of kept features is `max(n_features / 2, 1)` (fewer if `data`
/// has no columns). Features are ranked by descending score; ties keep
/// their original column order. NaN scores are ranked last so that they are
/// only selected when nothing better is available.
///
/// With no method configured, every feature scores `0.0`.
///
/// # Errors
///
/// Currently always returns `Ok`.
fn fit_feature_selector(&self, data: &Array2<f64>) -> Result<FeatureSelector> {
    let n_features = data.ncols();
    let feature_scores = match &self.config.feature_selection {
        Some(FeatureSelection::Variance) => self.compute_variance_scores(data),
        Some(FeatureSelection::Correlation) => self.compute_correlation_scores(data),
        Some(FeatureSelection::MutualInformation) => {
            self.compute_mutual_information_scores(data)
        }
        Some(FeatureSelection::QuantumInformation) => {
            self.compute_quantum_information_scores(data)
        }
        None => Array1::zeros(n_features),
    };

    let mut ranked: Vec<(usize, f64)> = feature_scores.iter().copied().enumerate().collect();
    // Descending by score. `partial_cmp` alone is not a total order when NaN
    // is present (the sort may then panic), so NaN is explicitly placed after
    // every real score. NaN-free input is ranked exactly as before.
    ranked.sort_by(|&(_, a), &(_, b)| {
        b.partial_cmp(&a)
            .unwrap_or_else(|| a.is_nan().cmp(&b.is_nan()))
    });

    let num_selected = (n_features / 2).max(1);
    let selected_features: Vec<usize> = ranked
        .into_iter()
        .take(num_selected)
        .map(|(idx, _)| idx)
        .collect();

    Ok(FeatureSelector {
        selected_features,
        feature_scores,
    })
}
/// Returns a new array containing only the columns of `data` listed in
/// `selector.selected_features`, in that order.
fn apply_feature_selection(
    &self,
    data: &Array2<f64>,
    selector: &FeatureSelector,
) -> Result<Array2<f64>> {
    let kept_columns = &selector.selected_features;
    Ok(data.select(Axis(1), kept_columns))
}
/// Fits the configured dimensionality-reduction method on `data`.
///
/// The target dimension is `max(n_features / 2, 1)`. When no method is
/// configured, an identity reducer (identity matrix, all-ones explained
/// variance, `target_dim == n_features`) is returned instead.
///
/// # Errors
///
/// Propagates the error of the selected fitting routine.
fn fit_dimensionality_reducer(&self, data: &Array2<f64>) -> Result<DimensionalityReducer> {
    let width = data.ncols();

    let method = match &self.config.dimensionality_reduction {
        Some(method) => method,
        None => {
            // Nothing configured: pass every feature through unchanged.
            return Ok(DimensionalityReducer {
                components: Array2::eye(width),
                explained_variance: Array1::ones(width),
                target_dim: width,
            });
        }
    };

    let target_dim = (width / 2).max(1);
    match method {
        DimensionalityReduction::PCA => self.fit_pca(data, target_dim),
        DimensionalityReduction::ICA => self.fit_ica(data, target_dim),
        DimensionalityReduction::UMAP => self.fit_umap(data, target_dim),
        DimensionalityReduction::QuantumPCA => self.fit_quantum_pca(data, target_dim),
        DimensionalityReduction::QuantumManifold => self.fit_quantum_manifold(data, target_dim),
    }
}
fn apply_dimensionality_reduction(
&self,
data: &Array2<f64>,
reducer: &DimensionalityReducer,
) -> Result<Array2<f64>> {
let reduced = data.dot(&reducer.components.t());
Ok(reduced)
}
fn compute_variance_scores(&self, data: &Array2<f64>) -> Array1<f64> {
let n_features = data.ncols();
let mut scores = Array1::zeros(n_features);
for j in 0..n_features {
let column = data.column(j);
scores[j] = column.var(0.0);
}
scores
}
/// Returns one score per column of `data`.
///
/// NOTE(review): no correlation is computed here. Every score is a random
/// `f64` obtained from `thread_rng()`, and only the column count of `data`
/// is read, so results differ between calls. This looks like a placeholder.
fn compute_correlation_scores(&self, data: &Array2<f64>) -> Array1<f64> {
    Array1::from_shape_fn(data.ncols(), |_| thread_rng().random::<f64>())
}
/// Returns one score per column of `data`.
///
/// NOTE(review): no mutual information is computed here. Every score is a
/// random `f64` obtained from `thread_rng()`, and only the column count of
/// `data` is read, so results differ between calls. This looks like a
/// placeholder.
fn compute_mutual_information_scores(&self, data: &Array2<f64>) -> Array1<f64> {
    Array1::from_shape_fn(data.ncols(), |_| thread_rng().random::<f64>())
}
/// Returns one score per column of `data`.
///
/// NOTE(review): no quantum-information measure is computed here. Every
/// score is a random `f64` obtained from `thread_rng()`, and only the column
/// count of `data` is read, so results differ between calls. This looks like
/// a placeholder.
fn compute_quantum_information_scores(&self, data: &Array2<f64>) -> Array1<f64> {
    Array1::from_shape_fn(data.ncols(), |_| thread_rng().random::<f64>())
}
/// Builds a reducer with `target_dim` components for `data`.
///
/// The component matrix has shape `(target_dim, n_features)` with ones on
/// the main diagonal and zeros elsewhere, and component `i` is assigned an
/// explained variance of `1 / (i + 1)`.
///
/// NOTE(review): no decomposition of `data` is performed; only its column
/// count is read. This looks like a placeholder implementation.
///
/// # Errors
///
/// Currently always returns `Ok`.
fn fit_pca(&self, data: &Array2<f64>, target_dim: usize) -> Result<DimensionalityReducer> {
    let width = data.ncols();

    // Start from all zeros and set the main diagonal; the matrix may be
    // rectangular, so stop at the shorter side.
    let mut components = Array2::<f64>::zeros((target_dim, width));
    for k in 0..target_dim.min(width) {
        components[[k, k]] = 1.0;
    }

    let explained_variance = Array1::from_shape_fn(target_dim, |k| 1.0 / (k + 1) as f64);

    Ok(DimensionalityReducer {
        components,
        explained_variance,
        target_dim,
    })
}
/// Fits the reducer used for the `ICA` setting.
///
/// NOTE(review): currently delegates unchanged to `fit_pca`; no ICA-specific
/// computation happens here.
fn fit_ica(&self, data: &Array2<f64>, target_dim: usize) -> Result<DimensionalityReducer> {
self.fit_pca(data, target_dim)
}
/// Fits the reducer used for the `UMAP` setting.
///
/// NOTE(review): currently delegates unchanged to `fit_pca`; no
/// UMAP-specific computation happens here.
fn fit_umap(&self, data: &Array2<f64>, target_dim: usize) -> Result<DimensionalityReducer> {
self.fit_pca(data, target_dim)
}
/// Fits the reducer used for the `QuantumPCA` setting.
///
/// NOTE(review): currently delegates unchanged to `fit_pca`; no
/// quantum-specific computation happens here.
fn fit_quantum_pca(
&self,
data: &Array2<f64>,
target_dim: usize,
) -> Result<DimensionalityReducer> {
self.fit_pca(data, target_dim)
}
/// Fits the reducer used for the `QuantumManifold` setting.
///
/// NOTE(review): currently delegates unchanged to `fit_pca`; no
/// manifold-specific computation happens here.
fn fit_quantum_manifold(
&self,
data: &Array2<f64>,
target_dim: usize,
) -> Result<DimensionalityReducer> {
self.fit_pca(data, target_dim)
}
}