quantrs2_ml/utils/preprocessing.rs

//! Data preprocessing utilities for QML

use crate::error::{MLError, Result};
use scirs2_core::ndarray::{Array2, Axis};

use super::*;

/// Normalize features to zero mean and unit variance.
pub fn standardize(data: &Array2<f64>) -> Result<Array2<f64>> {
    let mean = data
        .mean_axis(Axis(0))
        .ok_or_else(|| MLError::ComputationError("Failed to compute mean".to_string()))?;
    // Population standard deviation (ddof = 0) per column.
    let std = data.std_axis(Axis(0), 0.0);
    let mut normalized = data.clone();
    for i in 0..data.nrows() {
        for j in 0..data.ncols() {
            // Small epsilon guards against division by zero for constant columns.
            normalized[(i, j)] = (data[(i, j)] - mean[j]) / (std[j] + 1e-8);
        }
    }
    Ok(normalized)
}
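
// A minimal usage sketch (not part of the original module): checks that
// `standardize` centers each column. The `array!` import below assumes
// `scirs2_core::ndarray` re-exports ndarray's macro; adjust the path if the
// re-export differs.
#[cfg(test)]
mod standardize_example {
    use super::*;
    use scirs2_core::ndarray::array;

    #[test]
    fn columns_are_centered() {
        let data = array![[1.0, 10.0], [3.0, 30.0], [5.0, 50.0]];
        let out = standardize(&data).unwrap();
        for j in 0..out.ncols() {
            let mean: f64 = out.column(j).iter().sum::<f64>() / out.nrows() as f64;
            assert!(mean.abs() < 1e-9);
        }
    }
}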

/// Min-max normalization to the [0, 1] range.
pub fn min_max_normalize(data: &Array2<f64>) -> Result<Array2<f64>> {
    let mut normalized = data.clone();
    for j in 0..data.ncols() {
        let column = data.column(j);
        let min_val = column.iter().cloned().fold(f64::INFINITY, f64::min);
        let max_val = column.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
        let range = max_val - min_val;
        // Constant columns (range ~ 0) are left unchanged to avoid dividing by zero.
        if range > 1e-10 {
            for i in 0..data.nrows() {
                normalized[(i, j)] = (data[(i, j)] - min_val) / range;
            }
        }
    }
    Ok(normalized)
}
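
// Hedged sketch (added for illustration, not in the original file): each
// non-constant column should span exactly [0, 1] after scaling.
#[cfg(test)]
mod min_max_example {
    use super::*;
    use scirs2_core::ndarray::array;

    #[test]
    fn columns_span_unit_interval() {
        let data = array![[1.0, -5.0], [3.0, 0.0], [5.0, 5.0]];
        let out = min_max_normalize(&data).unwrap();
        for j in 0..out.ncols() {
            let min = out.column(j).iter().cloned().fold(f64::INFINITY, f64::min);
            let max = out.column(j).iter().cloned().fold(f64::NEG_INFINITY, f64::max);
            assert!(min.abs() < 1e-12 && (max - 1.0).abs() < 1e-12);
        }
    }
}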

/// Robust scaling using the median and IQR (interquartile range).
/// More robust to outliers than standardization.
pub fn robust_scale(data: &Array2<f64>) -> Result<Array2<f64>> {
    let mut normalized = data.clone();
    for j in 0..data.ncols() {
        let mut column_values: Vec<f64> = data.column(j).to_vec();
        column_values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
        let n = column_values.len();
        let median = if n % 2 == 0 {
            (column_values[n / 2 - 1] + column_values[n / 2]) / 2.0
        } else {
            column_values[n / 2]
        };
        // Nearest-rank approximation of the quartiles; adequate for scaling,
        // though it differs slightly from interpolated quantiles.
        let q1_idx = n / 4;
        let q3_idx = (3 * n) / 4;
        let q1 = column_values[q1_idx];
        let q3 = column_values[q3_idx];
        let iqr = q3 - q1;
        if iqr > 1e-10 {
            for i in 0..data.nrows() {
                normalized[(i, j)] = (data[(i, j)] - median) / iqr;
            }
        } else {
            // Degenerate IQR: fall back to centering only.
            for i in 0..data.nrows() {
                normalized[(i, j)] = data[(i, j)] - median;
            }
        }
    }
    Ok(normalized)
}
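
// Hedged sketch (illustrative only): the column median should map to 0, and a
// large outlier should not inflate the scale the way it would under z-scoring.
#[cfg(test)]
mod robust_scale_example {
    use super::*;
    use scirs2_core::ndarray::array;

    #[test]
    fn median_maps_to_zero() {
        // Sorted column [1, 2, 3, 4, 100]: median 3, q1 = 2, q3 = 4, IQR = 2.
        let data = array![[1.0], [2.0], [3.0], [4.0], [100.0]];
        let out = robust_scale(&data).unwrap();
        assert!(out[(2, 0)].abs() < 1e-12); // (3 - 3) / 2 = 0
        assert!((out[(3, 0)] - 0.5).abs() < 1e-12); // (4 - 3) / 2 = 0.5
    }
}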

/// Quantile normalization: forces all features to share the same distribution.
/// Useful when features should be on the same scale but follow different
/// distributions. Note that ties are broken by sort order rather than by rank
/// averaging.
pub fn quantile_normalize(data: &Array2<f64>) -> Result<Array2<f64>> {
    let nrows = data.nrows();
    let ncols = data.ncols();
    let mut normalized = Array2::zeros((nrows, ncols));
    // For each column, remember the original row index of every sorted value.
    let mut sorted_columns: Vec<Vec<(usize, f64)>> = Vec::with_capacity(ncols);
    for j in 0..ncols {
        let mut indexed_values: Vec<(usize, f64)> = data
            .column(j)
            .iter()
            .enumerate()
            .map(|(i, &v)| (i, v))
            .collect();
        indexed_values.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
        sorted_columns.push(indexed_values);
    }
    // The mean value at each rank across all columns becomes the shared quantile.
    let mut rank_means = vec![0.0; nrows];
    for i in 0..nrows {
        let mut sum = 0.0;
        for col in &sorted_columns {
            sum += col[i].1;
        }
        rank_means[i] = sum / ncols as f64;
    }
    // Write each rank mean back to the row that held that rank in each column.
    for j in 0..ncols {
        for (rank, &(original_idx, _)) in sorted_columns[j].iter().enumerate() {
            normalized[(original_idx, j)] = rank_means[rank];
        }
    }
    Ok(normalized)
}
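
// Hedged sketch (illustrative only): after quantile normalization, every
// column contains the same multiset of values (the rank means), just permuted.
#[cfg(test)]
mod quantile_normalize_example {
    use super::*;
    use scirs2_core::ndarray::array;

    #[test]
    fn columns_share_a_distribution() {
        let data = array![[1.0, 100.0], [2.0, 50.0], [3.0, 75.0]];
        let out = quantile_normalize(&data).unwrap();
        let mut c0: Vec<f64> = out.column(0).to_vec();
        let mut c1: Vec<f64> = out.column(1).to_vec();
        c0.sort_by(|a, b| a.partial_cmp(b).unwrap());
        c1.sort_by(|a, b| a.partial_cmp(b).unwrap());
        assert_eq!(c0, c1); // both are [25.5, 38.5, 51.5]
    }
}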

/// Max-absolute scaling: divides each feature by its maximum absolute value.
/// Useful for sparse data, where centering would destroy sparsity.
pub fn max_abs_scale(data: &Array2<f64>) -> Result<Array2<f64>> {
    let mut normalized = data.clone();
    for j in 0..data.ncols() {
        let max_abs = data
            .column(j)
            .iter()
            .map(|x| x.abs())
            .fold(0.0f64, f64::max);
        // All-zero columns are left untouched.
        if max_abs > 1e-10 {
            for i in 0..data.nrows() {
                normalized[(i, j)] = data[(i, j)] / max_abs;
            }
        }
    }
    Ok(normalized)
}
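
// Hedged sketch (illustrative only): scaled values lie in [-1, 1], zeros stay
// zero, and the sign pattern of the input is preserved.
#[cfg(test)]
mod max_abs_scale_example {
    use super::*;
    use scirs2_core::ndarray::array;

    #[test]
    fn preserves_zeros_and_signs() {
        let data = array![[-2.0, 0.0], [1.0, 4.0]];
        let out = max_abs_scale(&data).unwrap();
        assert!((out[(0, 0)] + 1.0).abs() < 1e-12); // -2 / 2 = -1
        assert!(out[(0, 1)].abs() < 1e-12); // sparsity preserved
        assert!((out[(1, 1)] - 1.0).abs() < 1e-12); // 4 / 4 = 1
    }
}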

/// L1 normalization: scales each sample (row) to unit L1 norm.
/// The absolute values of each row then sum to 1, which is useful for
/// probability-like features.
pub fn l1_normalize(data: &Array2<f64>) -> Result<Array2<f64>> {
    let mut normalized = data.clone();
    for i in 0..data.nrows() {
        let l1_norm: f64 = data.row(i).iter().map(|x| x.abs()).sum();
        // Zero rows are left unchanged.
        if l1_norm > 1e-10 {
            for j in 0..data.ncols() {
                normalized[(i, j)] = data[(i, j)] / l1_norm;
            }
        }
    }
    Ok(normalized)
}
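
// Hedged sketch (illustrative only): for a row with mixed signs, the absolute
// values (not the raw values) sum to 1 after L1 normalization.
#[cfg(test)]
mod l1_normalize_example {
    use super::*;
    use scirs2_core::ndarray::array;

    #[test]
    fn abs_row_sums_are_one() {
        let data = array![[1.0, -3.0]]; // L1 norm = 4
        let out = l1_normalize(&data).unwrap();
        let abs_sum: f64 = out.row(0).iter().map(|x| x.abs()).sum();
        assert!((abs_sum - 1.0).abs() < 1e-12); // row becomes [0.25, -0.75]
    }
}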

/// L2 normalization: scales each sample (row) to unit L2 norm.
/// Each row then has Euclidean length 1, which is useful for cosine similarity.
pub fn l2_normalize(data: &Array2<f64>) -> Result<Array2<f64>> {
    let mut normalized = data.clone();
    for i in 0..data.nrows() {
        let l2_norm: f64 = data.row(i).iter().map(|x| x * x).sum::<f64>().sqrt();
        // Zero rows are left unchanged.
        if l2_norm > 1e-10 {
            for j in 0..data.ncols() {
                normalized[(i, j)] = data[(i, j)] / l2_norm;
            }
        }
    }
    Ok(normalized)
}
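
// Hedged sketch (illustrative only): the classic 3-4-5 row lands exactly on
// the unit circle after L2 normalization.
#[cfg(test)]
mod l2_normalize_example {
    use super::*;
    use scirs2_core::ndarray::array;

    #[test]
    fn rows_have_unit_length() {
        let data = array![[3.0, 4.0]]; // L2 norm = 5
        let out = l2_normalize(&data).unwrap();
        assert!((out[(0, 0)] - 0.6).abs() < 1e-12);
        assert!((out[(0, 1)] - 0.8).abs() < 1e-12);
    }
}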