Skip to main content

torsh_cluster/utils/
preprocessing.rs

1//! Data preprocessing utilities for clustering
2
3use crate::error::{ClusterError, ClusterResult};
4use torsh_tensor::Tensor;
5
6/// Data preprocessing methods
/// Feature-preprocessing strategies available for clustering input data.
///
/// Each variant names one of the transforms implemented by the free
/// functions in this module: [`standardize_features`] (StandardScaler),
/// [`normalize_features`] (MinMaxScaler), [`unit_normalize`] (Normalizer)
/// and [`robust_scale`] (RobustScaler).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PreprocessingMethod {
    /// Standardize features to zero mean and unit variance
    StandardScaler,
    /// Scale features to [0, 1] range
    MinMaxScaler,
    /// Normalize to unit norm
    Normalizer,
    /// Robust scaling using median and IQR
    RobustScaler,
}
18
19/// Standardize features to have zero mean and unit variance
20pub fn standardize_features(data: &Tensor) -> ClusterResult<Tensor> {
21    let n_samples = data.shape().dims()[0];
22    let n_features = data.shape().dims()[1];
23    let data_vec = data.to_vec().map_err(ClusterError::TensorError)?;
24
25    let mut standardized_data = Vec::with_capacity(data_vec.len());
26
27    // Compute mean and std for each feature
28    for j in 0..n_features {
29        let mut feature_values = Vec::with_capacity(n_samples);
30        for i in 0..n_samples {
31            feature_values.push(data_vec[i * n_features + j] as f64);
32        }
33
34        // Compute mean
35        let mean = feature_values.iter().sum::<f64>() / n_samples as f64;
36
37        // Compute standard deviation
38        let variance = feature_values
39            .iter()
40            .map(|&x| (x - mean).powi(2))
41            .sum::<f64>()
42            / n_samples as f64;
43        let std = variance.sqrt();
44
45        // Standardize feature values
46        for &feature_val in feature_values.iter() {
47            let standardized_val = if std > 0.0 {
48                (feature_val - mean) / std
49            } else {
50                0.0
51            };
52            standardized_data.push(standardized_val as f32);
53        }
54    }
55
56    // Reorder data back to row-major format
57    let mut reordered_data = vec![0.0; n_samples * n_features];
58    for i in 0..n_samples {
59        for j in 0..n_features {
60            reordered_data[i * n_features + j] = standardized_data[j * n_samples + i];
61        }
62    }
63
64    Tensor::from_vec(reordered_data, &[n_samples, n_features]).map_err(ClusterError::TensorError)
65}
66
67/// Normalize features to [0, 1] range
68pub fn normalize_features(data: &Tensor) -> ClusterResult<Tensor> {
69    let n_samples = data.shape().dims()[0];
70    let n_features = data.shape().dims()[1];
71    let data_vec = data.to_vec().map_err(ClusterError::TensorError)?;
72
73    let mut normalized_data = Vec::with_capacity(data_vec.len());
74
75    // Compute min and max for each feature
76    for j in 0..n_features {
77        let mut feature_values = Vec::with_capacity(n_samples);
78        for i in 0..n_samples {
79            feature_values.push(data_vec[i * n_features + j] as f64);
80        }
81
82        let min_val = feature_values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
83        let max_val = feature_values
84            .iter()
85            .fold(f64::NEG_INFINITY, |a, &b| a.max(b));
86        let range = max_val - min_val;
87
88        // Normalize feature values
89        for &feature_val in feature_values.iter() {
90            let normalized_val = if range > 0.0 {
91                (feature_val - min_val) / range
92            } else {
93                0.0
94            };
95            normalized_data.push(normalized_val as f32);
96        }
97    }
98
99    // Reorder data back to row-major format
100    let mut reordered_data = vec![0.0; n_samples * n_features];
101    for i in 0..n_samples {
102        for j in 0..n_features {
103            reordered_data[i * n_features + j] = normalized_data[j * n_samples + i];
104        }
105    }
106
107    Tensor::from_vec(reordered_data, &[n_samples, n_features]).map_err(ClusterError::TensorError)
108}
109
110/// Normalize each sample to unit norm
111pub fn unit_normalize(data: &Tensor) -> ClusterResult<Tensor> {
112    let n_samples = data.shape().dims()[0];
113    let n_features = data.shape().dims()[1];
114    let data_vec = data.to_vec().map_err(ClusterError::TensorError)?;
115
116    let mut normalized_data = Vec::with_capacity(data_vec.len());
117
118    for i in 0..n_samples {
119        // Compute L2 norm for this sample
120        let mut norm = 0.0;
121        for j in 0..n_features {
122            let val = data_vec[i * n_features + j] as f64;
123            norm += val * val;
124        }
125        norm = norm.sqrt();
126
127        // Normalize the sample
128        for j in 0..n_features {
129            let normalized_val = if norm > 0.0 {
130                data_vec[i * n_features + j] as f64 / norm
131            } else {
132                0.0
133            };
134            normalized_data.push(normalized_val as f32);
135        }
136    }
137
138    Tensor::from_vec(normalized_data, &[n_samples, n_features]).map_err(ClusterError::TensorError)
139}
140
141/// Apply robust scaling using median and interquartile range
142pub fn robust_scale(data: &Tensor) -> ClusterResult<Tensor> {
143    let n_samples = data.shape().dims()[0];
144    let n_features = data.shape().dims()[1];
145    let data_vec = data.to_vec().map_err(ClusterError::TensorError)?;
146
147    let mut scaled_data = Vec::with_capacity(data_vec.len());
148
149    // Compute median and IQR for each feature
150    for j in 0..n_features {
151        let mut feature_values = Vec::with_capacity(n_samples);
152        for i in 0..n_samples {
153            feature_values.push(data_vec[i * n_features + j] as f64);
154        }
155
156        // Sort to compute quantiles
157        feature_values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
158
159        let median = if n_samples % 2 == 0 {
160            (feature_values[n_samples / 2 - 1] + feature_values[n_samples / 2]) / 2.0
161        } else {
162            feature_values[n_samples / 2]
163        };
164
165        let q1_idx = n_samples / 4;
166        let q3_idx = 3 * n_samples / 4;
167        let iqr = feature_values[q3_idx] - feature_values[q1_idx];
168
169        // Scale feature values
170        for i in 0..n_samples {
171            let original_val = data_vec[i * n_features + j] as f64;
172            let scaled_val = if iqr > 0.0 {
173                (original_val - median) / iqr
174            } else {
175                0.0
176            };
177            scaled_data.push(scaled_val as f32);
178        }
179    }
180
181    // Reorder data back to row-major format
182    let mut reordered_data = vec![0.0; n_samples * n_features];
183    for i in 0..n_samples {
184        for j in 0..n_features {
185            reordered_data[i * n_features + j] = scaled_data[j * n_samples + i];
186        }
187    }
188
189    Tensor::from_vec(reordered_data, &[n_samples, n_features]).map_err(ClusterError::TensorError)
190}