sklears_preprocessing/outlier_detection/simd_operations.rs
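
/// Computes the z-score `(x - mean) / std` for every element of `data`.
/// No zero guard is applied to `std`; callers are expected to pass a non-zero value.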
pub fn simd_zscore(data: &[f64], mean: f64, std: f64) -> Vec<f64> {
    let mut result = vec![0.0; data.len()];
    let inv_std = 1.0 / std;
    for (i, &val) in data.iter().enumerate() {
        result[i] = (val - mean) * inv_std;
    }

    result
}
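
/// Computes the modified z-score `0.6745 * (x - median) / MAD` for every element,
/// clamping `mad` to a small epsilon (1e-10) to avoid division by zero.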
pub fn simd_modified_zscore(data: &[f64], median: f64, mad: f64) -> Vec<f64> {
    let mut result = vec![0.0; data.len()];
    let mad_scale = 0.6745;
    let inv_mad = mad_scale / mad.max(1e-10);

    for (i, &val) in data.iter().enumerate() {
        result[i] = (val - median) * inv_mad;
    }

    result
}
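
/// Computes the Mahalanobis distance `sqrt((x - mean)^T * inv_cov * (x - mean))`
/// for each sample, given the mean vector and the inverse covariance matrix.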
pub fn simd_mahalanobis_distance(
    data: &[Vec<f64>],
    mean: &[f64],
    inv_cov: &[Vec<f64>],
) -> Vec<f64> {
    let n_samples = data.len();
    let mut distances = vec![0.0; n_samples];

    for i in 0..n_samples {
        let sample = &data[i];
        let mut centered = vec![0.0; sample.len()];

        for j in 0..sample.len() {
            centered[j] = sample[j] - mean[j];
        }

        let mut distance_squared = 0.0;
        for j in 0..centered.len() {
            let mut temp = 0.0;
            for k in 0..centered.len() {
                temp += inv_cov[j][k] * centered[k];
            }
            distance_squared += centered[j] * temp;
        }

        distances[i] = distance_squared.sqrt();
    }

    distances
}
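
/// Flags values that fall outside the lower/upper percentile bounds and returns
/// the boolean outlier mask together with both bound values. Assumes `data` is non-empty.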
pub fn simd_percentile_outliers(
    data: &[f64],
    lower_percentile: f64,
    upper_percentile: f64,
) -> (Vec<bool>, f64, f64) {
    let mut sorted_data = data.to_vec();
    sorted_data.sort_by(|a, b| a.total_cmp(b));

    let n = sorted_data.len();
    let lower_idx = ((lower_percentile / 100.0) * (n - 1) as f64) as usize;
    let upper_idx = ((upper_percentile / 100.0) * (n - 1) as f64) as usize;

    let lower_bound = sorted_data[lower_idx];
    let upper_bound = sorted_data[upper_idx];

    let outliers: Vec<bool> = data
        .iter()
        .map(|&val| val < lower_bound || val > upper_bound)
        .collect();

    (outliers, lower_bound, upper_bound)
}
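
/// Flags values outside `[Q1 - k * IQR, Q3 + k * IQR]`, where `k` is `iqr_multiplier`.
/// Returns the outlier mask, both bounds, and the Q1/Q3 estimates. Assumes `data` is non-empty.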
pub fn simd_iqr_outliers(data: &[f64], iqr_multiplier: f64) -> (Vec<bool>, f64, f64, f64, f64) {
    let mut sorted_data = data.to_vec();
    sorted_data.sort_by(|a, b| a.total_cmp(b));

    let n = sorted_data.len();
    let q1_idx = (0.25 * (n - 1) as f64) as usize;
    let q3_idx = (0.75 * (n - 1) as f64) as usize;

    let q1 = sorted_data[q1_idx];
    let q3 = sorted_data[q3_idx];
    let iqr = q3 - q1;

    let lower_bound = q1 - iqr_multiplier * iqr;
    let upper_bound = q3 + iqr_multiplier * iqr;

    let outliers: Vec<bool> = data
        .iter()
        .map(|&val| val < lower_bound || val > upper_bound)
        .collect();

    (outliers, lower_bound, upper_bound, q1, q3)
}
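
/// Combines per-method outlier scores into one weighted score per sample.
/// Missing or absent weights fall back to a uniform `1 / n_methods` weighting.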
pub fn simd_ensemble_scoring(scores: &[Vec<f64>], weights: Option<&[f64]>) -> Vec<f64> {
    if scores.is_empty() {
        return vec![];
    }

    let n_samples = scores[0].len();
    let n_methods = scores.len();
    let mut result = vec![0.0; n_samples];

    let uniform_weight = 1.0 / n_methods as f64;

    for i in 0..n_samples {
        let mut ensemble_score = 0.0;
        for (method_idx, method_scores) in scores.iter().enumerate() {
            let weight = if let Some(w) = weights {
                w.get(method_idx).copied().unwrap_or(uniform_weight)
            } else {
                uniform_weight
            };
            ensemble_score += method_scores[i] * weight;
        }
        result[i] = ensemble_score;
    }

    result
}
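
/// Arithmetic mean; returns 0.0 for an empty slice.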
pub fn simd_mean(data: &[f64]) -> f64 {
    if data.is_empty() {
        return 0.0;
    }
    data.iter().sum::<f64>() / data.len() as f64
}
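
/// Sample variance (n - 1 denominator) around the supplied mean;
/// returns 0.0 when fewer than two values are given.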
pub fn simd_variance(data: &[f64], mean: f64) -> f64 {
    if data.len() <= 1 {
        return 0.0;
    }
    let sum_sq_diff: f64 = data.iter().map(|x| (x - mean).powi(2)).sum();
    sum_sq_diff / (data.len() - 1) as f64
}
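
/// Matrix-vector product; row entries beyond the vector's length are ignored.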
pub fn simd_matvec_multiply(matrix: &[Vec<f64>], vector: &[f64]) -> Vec<f64> {
    let n_rows = matrix.len();
    let mut result = vec![0.0; n_rows];

    for (i, row) in matrix.iter().enumerate() {
        let mut sum = 0.0;
        for (j, &val) in row.iter().enumerate() {
            if j < vector.len() {
                sum += val * vector[j];
            }
        }
        result[i] = sum;
    }

    result
}
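
/// Dot product over the overlapping prefix of the two slices.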
pub fn simd_dot_product(a: &[f64], b: &[f64]) -> f64 {
    a.iter().zip(b.iter()).map(|(x, y)| x * y).sum()
}
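
/// Euclidean (L2) norm of a difference vector.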
pub fn simd_euclidean_distance(diff: &[f64]) -> f64 {
    diff.iter().map(|x| x * x).sum::<f64>().sqrt()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simd_zscore() {
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let mean = 3.0;
        let std = (2.5f64).sqrt();
        let z_scores = simd_zscore(&data, mean, std);

        let z_mean = z_scores.iter().sum::<f64>() / z_scores.len() as f64;
        assert!((z_mean).abs() < 1e-10);
    }

    #[test]
    fn test_simd_percentile_outliers() {
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 100.0];
        let (outliers, lower, upper) = simd_percentile_outliers(&data, 10.0, 90.0);

        assert!(outliers[5]);
        assert!(upper < 100.0);
        assert!(lower > 0.0);
    }

    #[test]
    fn test_simd_iqr_outliers() {
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 100.0];
        let (outliers, _lower, upper, q1, q3) = simd_iqr_outliers(&data, 1.5);

        assert!(outliers[5]);
        assert!(q3 > q1);
        assert!(upper < 100.0);
    }

    #[test]
    fn test_simd_ensemble_scoring() {
        let scores = vec![
            vec![0.1, 0.2, 0.9],
            vec![0.2, 0.1, 0.8],
        ];
        let weights_vec = vec![0.6, 0.4];
        let weights = Some(weights_vec.as_slice());

        let ensemble_scores = simd_ensemble_scoring(&scores, weights);

        assert_eq!(ensemble_scores.len(), 3);
        assert!(ensemble_scores[2] > ensemble_scores[0]);
        assert!(ensemble_scores[2] > ensemble_scores[1]);
    }

    #[test]
    fn test_simd_mahalanobis_distance() {
        let data = vec![
            vec![1.0, 2.0],
            vec![2.0, 3.0],
            vec![10.0, 10.0],
        ];
        let mean = vec![1.5, 2.5];
        let inv_cov = vec![vec![1.0, 0.0], vec![0.0, 1.0]];

        let distances = simd_mahalanobis_distance(&data, &mean, &inv_cov);

        assert_eq!(distances.len(), 3);
        assert!(distances[2] > distances[0]);
        assert!(distances[2] > distances[1]);
    }

    #[test]
    fn test_simd_mean_variance() {
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let mean = simd_mean(&data);
        let variance = simd_variance(&data, mean);

        assert!((mean - 3.0).abs() < 1e-10);
        assert!(variance > 0.0);
    }

    #[test]
    fn test_simd_dot_product() {
        let a = vec![1.0, 2.0, 3.0];
        let b = vec![4.0, 5.0, 6.0];
        let result = simd_dot_product(&a, &b);

        assert!((result - 32.0).abs() < 1e-10);
    }

    #[test]
    fn test_simd_euclidean_distance() {
        let diff = vec![3.0, 4.0];
        let distance = simd_euclidean_distance(&diff);

        assert!((distance - 5.0).abs() < 1e-10);
    }
}