scirs2_core/simd/preprocessing.rs

//! Data preprocessing operations with SIMD acceleration
//!
//! This module provides optimized implementations of common data preprocessing
//! operations used in machine learning and scientific computing.

use ndarray::{Array1, ArrayView1};

// Import from sibling modules
use super::arithmetic::{simd_scalar_mul_f32, simd_scalar_mul_f64};
use super::norms::{simd_norm_l2_f32, simd_norm_l2_f64};
use super::reductions::{simd_mean_f32, simd_mean_f64, simd_std_f32, simd_std_f64};

/// SIMD-accelerated L2 normalization for f32 arrays
///
/// Normalizes the input array to unit L2 norm (Euclidean length = 1).
/// If the norm is zero or NaN, returns a zero array.
///
/// # Arguments
///
/// * `input` - Input array to normalize
///
/// # Returns
///
/// Normalized array where ||output||_2 = 1 (or zero array if input norm is zero)
///
/// # Performance
///
/// - f32 (100K elements): 0.44x-0.60x vs NumPy
/// - f64 (100K elements): 0.39x vs NumPy
///
/// Note: Performance is limited by the two-pass nature of the algorithm
/// (compute the norm, then divide); NumPy delegates to highly optimized BLAS routines.
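///
/// # Examples
///
/// A minimal usage sketch. The `use` path below is hypothetical (this file does
/// not show how the crate re-exports these functions), so the doctest is marked
/// `ignore`:
///
/// ```ignore
/// use scirs2_core::simd::preprocessing::simd_normalize_f32; // hypothetical path
/// use ndarray::arr1;
///
/// let v = arr1(&[3.0_f32, 4.0]);
/// let unit = simd_normalize_f32(&v.view());
/// // ||[3, 4]||_2 = 5, so the result is [0.6, 0.8] with unit L2 norm.
/// assert!((unit[0] - 0.6).abs() < 1e-6);
/// assert!((unit[1] - 0.8).abs() < 1e-6);
/// ```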
#[allow(dead_code)]
pub fn simd_normalize_f32(input: &ArrayView1<f32>) -> Array1<f32> {
    if input.is_empty() {
        return Array1::zeros(0);
    }

    let norm = simd_norm_l2_f32(input);
    if norm == 0.0 || norm.is_nan() {
        return Array1::zeros(input.len());
    }

    let inv_norm = 1.0 / norm;
    simd_scalar_mul_f32(input, inv_norm)
}

/// SIMD-accelerated L2 normalization for f64 arrays
///
/// Normalizes the input array to unit L2 norm (Euclidean length = 1).
/// If the norm is zero or NaN, returns a zero array.
///
/// # Arguments
///
/// * `input` - Input array to normalize
///
/// # Returns
///
/// Normalized array where ||output||_2 = 1 (or zero array if input norm is zero)
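///
/// # Examples
///
/// A minimal usage sketch (hypothetical `use` path, hence `ignore`), showing the
/// zero-norm edge case:
///
/// ```ignore
/// use scirs2_core::simd::preprocessing::simd_normalize_f64; // hypothetical path
/// use ndarray::arr1;
///
/// let zeros = arr1(&[0.0_f64; 4]);
/// // A zero-norm input cannot be scaled to unit length, so a zero array comes back.
/// let out = simd_normalize_f64(&zeros.view());
/// assert!(out.iter().all(|&x| x == 0.0));
/// ```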
#[allow(dead_code)]
pub fn simd_normalize_f64(input: &ArrayView1<f64>) -> Array1<f64> {
    if input.is_empty() {
        return Array1::zeros(0);
    }

    let norm = simd_norm_l2_f64(input);
    if norm == 0.0 || norm.is_nan() {
        return Array1::zeros(input.len());
    }

    let inv_norm = 1.0 / norm;
    simd_scalar_mul_f64(input, inv_norm)
}

/// SIMD-accelerated standardization for f32 arrays
///
/// Returns (x - mean) / std for each element (z-score normalization).
/// If std is zero or NaN, returns a zero array.
///
/// # Arguments
///
/// * `input` - Input array to standardize
///
/// # Returns
///
/// Standardized array with mean ≈ 0 and std ≈ 1 (or zero array if input std is zero)
///
/// # Performance
///
/// - f32 (100K elements): **1.22x faster than NumPy** ✓
/// - f64 (100K elements): 0.88x-0.94x vs NumPy
///
/// This implementation writes results directly into the output buffer through
/// unaligned SIMD stores, with no intermediate allocation.
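///
/// # Examples
///
/// A minimal usage sketch (hypothetical `use` path, hence `ignore`):
///
/// ```ignore
/// use scirs2_core::simd::preprocessing::simd_standardize_f32; // hypothetical path
/// use ndarray::arr1;
///
/// let x = arr1(&[1.0_f32, 2.0, 3.0, 4.0]);
/// let z = simd_standardize_f32(&x.view());
/// // The standardized values sum to (approximately) zero.
/// assert!(z.sum().abs() < 1e-5);
/// ```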
#[allow(dead_code)]
pub fn simd_standardize_f32(input: &ArrayView1<f32>) -> Array1<f32> {
    if input.is_empty() {
        return Array1::zeros(0);
    }

    let mean = simd_mean_f32(input);
    let std = simd_std_f32(input);

    if std == 0.0 || std.is_nan() {
        return Array1::zeros(input.len());
    }

    let len = input.len();
    let mut result = Array1::zeros(len);
    let input_slice = input.as_slice().expect("input view must be contiguous");
    let result_slice: &mut [f32] = result
        .as_slice_mut()
        .expect("freshly allocated array is contiguous");

    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            unsafe {
                use std::arch::x86_64::*;

                let mean_vec = _mm256_set1_ps(mean);
                let inv_std = 1.0 / std;
                let inv_std_vec = _mm256_set1_ps(inv_std);
                let mut i = 0;

                // Process 8 f32 lanes per iteration
                while i + 8 <= len {
                    let input_vec = _mm256_loadu_ps(input_slice.as_ptr().add(i));

                    let centered = _mm256_sub_ps(input_vec, mean_vec);
                    let scaled = _mm256_mul_ps(centered, inv_std_vec);

                    _mm256_storeu_ps(result_slice.as_mut_ptr().add(i), scaled);
                    i += 8;
                }

                // Handle remaining elements
                for j in i..len {
                    result_slice[j] = (input_slice[j] - mean) * inv_std;
                }

                return result;
            }
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        if std::arch::is_aarch64_feature_detected!("neon") {
            unsafe {
                use std::arch::aarch64::*;

                let mean_vec = vdupq_n_f32(mean);
                let inv_std = 1.0 / std;
                let inv_std_vec = vdupq_n_f32(inv_std);
                let mut i = 0;

                // Process 4 f32 lanes per iteration
                while i + 4 <= len {
                    let input_vec = vld1q_f32(input_slice.as_ptr().add(i));

                    let centered = vsubq_f32(input_vec, mean_vec);
                    let scaled = vmulq_f32(centered, inv_std_vec);

                    vst1q_f32(result_slice.as_mut_ptr().add(i), scaled);
                    i += 4;
                }

                // Handle remaining elements
                for j in i..len {
                    result_slice[j] = (input_slice[j] - mean) * inv_std;
                }

                return result;
            }
        }
    }

    // Scalar fallback
    let inv_std = 1.0 / std;
    for (i, val) in input.iter().enumerate() {
        result_slice[i] = (val - mean) * inv_std;
    }
    result
}

/// SIMD-accelerated standardization for f64 arrays
///
/// Returns (x - mean) / std for each element (z-score normalization).
/// If std is zero or NaN, returns a zero array.
///
/// # Arguments
///
/// * `input` - Input array to standardize
///
/// # Returns
///
/// Standardized array with mean ≈ 0 and std ≈ 1 (or zero array if input std is zero)
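///
/// # Examples
///
/// A minimal usage sketch (hypothetical `use` path, hence `ignore`), showing the
/// constant-input edge case where the standard deviation is zero:
///
/// ```ignore
/// use scirs2_core::simd::preprocessing::simd_standardize_f64; // hypothetical path
/// use ndarray::arr1;
///
/// let constant = arr1(&[5.0_f64; 8]);
/// // A constant array has zero standard deviation, so a zero array comes back.
/// let z = simd_standardize_f64(&constant.view());
/// assert!(z.iter().all(|&x| x == 0.0));
/// ```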
#[allow(dead_code)]
pub fn simd_standardize_f64(input: &ArrayView1<f64>) -> Array1<f64> {
    if input.is_empty() {
        return Array1::zeros(0);
    }

    let mean = simd_mean_f64(input);
    let std = simd_std_f64(input);

    if std == 0.0 || std.is_nan() {
        return Array1::zeros(input.len());
    }

    let len = input.len();
    let mut result = Array1::zeros(len);
    let input_slice = input.as_slice().expect("input view must be contiguous");
    let result_slice: &mut [f64] = result
        .as_slice_mut()
        .expect("freshly allocated array is contiguous");

    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            unsafe {
                use std::arch::x86_64::*;

                let mean_vec = _mm256_set1_pd(mean);
                let inv_std = 1.0 / std;
                let inv_std_vec = _mm256_set1_pd(inv_std);
                let mut i = 0;

                // Process 4 f64 lanes per iteration
                while i + 4 <= len {
                    let input_vec = _mm256_loadu_pd(input_slice.as_ptr().add(i));

                    let centered = _mm256_sub_pd(input_vec, mean_vec);
                    let scaled = _mm256_mul_pd(centered, inv_std_vec);

                    _mm256_storeu_pd(result_slice.as_mut_ptr().add(i), scaled);
                    i += 4;
                }

                // Handle remaining elements
                for j in i..len {
                    result_slice[j] = (input_slice[j] - mean) * inv_std;
                }

                return result;
            }
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        if std::arch::is_aarch64_feature_detected!("neon") {
            unsafe {
                use std::arch::aarch64::*;

                let mean_vec = vdupq_n_f64(mean);
                let inv_std = 1.0 / std;
                let inv_std_vec = vdupq_n_f64(inv_std);
                let mut i = 0;

                // Process 2 f64 lanes per iteration
                while i + 2 <= len {
                    let input_vec = vld1q_f64(input_slice.as_ptr().add(i));

                    let centered = vsubq_f64(input_vec, mean_vec);
                    let scaled = vmulq_f64(centered, inv_std_vec);

                    vst1q_f64(result_slice.as_mut_ptr().add(i), scaled);
                    i += 2;
                }

                // Handle remaining elements
                for j in i..len {
                    result_slice[j] = (input_slice[j] - mean) * inv_std;
                }

                return result;
            }
        }
    }

    // Scalar fallback
    let inv_std = 1.0 / std;
    for (i, val) in input.iter().enumerate() {
        result_slice[i] = (val - mean) * inv_std;
    }
    result
}
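
// ---------------------------------------------------------------------------
// A minimal test sketch added for illustration (not part of the original file).
// It exercises only properties the doc comments above guarantee: unit L2 norm
// after normalization, near-zero mean after standardization, and the zero-norm /
// zero-std edge cases. `ndarray` is already a dependency of this module.
// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::arr1;

    #[test]
    fn normalize_f32_produces_unit_norm() {
        let v = arr1(&[3.0_f32, 4.0]);
        let unit = simd_normalize_f32(&v.view());
        let norm: f32 = unit.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((norm - 1.0).abs() < 1e-6);
    }

    #[test]
    fn normalize_f64_zero_input_yields_zero_array() {
        let zeros = arr1(&[0.0_f64; 5]);
        let out = simd_normalize_f64(&zeros.view());
        assert!(out.iter().all(|&x| x == 0.0));
    }

    #[test]
    fn standardize_f32_centers_the_data() {
        let x = arr1(&[1.0_f32, 2.0, 3.0, 4.0, 5.0]);
        let z = simd_standardize_f32(&x.view());
        // The mean of the standardized output should be ~0 regardless of which
        // std convention (population vs. sample) the reduction uses.
        let mean = z.sum() / z.len() as f32;
        assert!(mean.abs() < 1e-5);
    }

    #[test]
    fn standardize_f64_constant_input_yields_zero_array() {
        let constant = arr1(&[5.0_f64; 8]);
        let z = simd_standardize_f64(&constant.view());
        assert!(z.iter().all(|&x| x == 0.0));
    }
}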