scirs2-neural 0.3.3

Neural network building blocks module for SciRS2 (scirs2-neural) - Minimal Version
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
//! SIMD-accelerated operations for neural networks
//!
//! This module provides vectorized implementations of common neural network operations
//! using unified SIMD operations from scirs2-core for significant performance improvements
//! and cross-platform compatibility. All functions are feature-gated with "simd" feature.
//! IMPORTANT: All SIMD operations are delegated to scirs2-core's unified SIMD abstraction layer
//! in compliance with the project-wide SIMD policy. Direct use of SIMD intrinsics is FORBIDDEN.

use crate::error::{NeuralError, Result};
use scirs2_core::ndarray::{ArrayD, ArrayView, ArrayViewMut, IxDyn};
#[allow(unused_imports)]
use scirs2_core::numeric::{Float, FromPrimitive, Zero};
#[cfg(feature = "simd")]
use scirs2_core::ndarray::Array;
use scirs2_core::simd_ops::{AutoOptimizer, PlatformCapabilities, SimdUnifiedOps};
/// SIMD-accelerated operations for neural networks
pub struct SIMDOperations;
impl SIMDOperations {
    /// SIMD-accelerated ReLU activation for f32 arrays
    pub fn simd_relu_f32_inplace(input: &mut ArrayViewMut<f32, IxDyn>) {
        if let Some(slice) = input.as_slice_mut() {
            // Use unified SIMD operations for ReLU
            let caps = PlatformCapabilities::detect();
            if caps.simd_available && f32::simd_available() {
                Self::simd_relu_f32_slice_unified(slice);
            } else {
                Self::simd_relu_f32_slice_fallback(slice);
            }
        } else {
            input.mapv_inplace(|x| x.max(0.0));
        }
    }
    /// SIMD-accelerated ReLU for f32 slice using unified operations
    fn simd_relu_f32_slice_unified(slice: &mut [f32]) {
        // Convert _slice to Array1 for SIMD operations
        let arr = Array1::from_vec(_slice.to_vec());
        let zero_arr = Array1::zeros(arr.len());
        // Use unified SIMD max operation
        let result = f32::simd_max(&arr.view(), &zero_arr.view());
        // Copy result back to _slice
        for (i, &val) in result.iter().enumerate() {
            slice[i] = val;
    /// Fallback ReLU implementation
    fn simd_relu_f32_slice_fallback(slice: &mut [f32]) {
        for val in slice.iter_mut() {
            *val = val.max(0.0);
    /// Vectorized ReLU activation returning new array
    pub fn simd_relu_f32(input: &ArrayView<f32, IxDyn>) -> ArrayD<f32> {
        let mut result = input.to_owned();
        Self::simd_relu_f32_inplace(&mut result.view_mut());
        result
    /// SIMD-accelerated sigmoid activation for f32 arrays
    pub fn simd_sigmoid_f32_inplace(input: &mut ArrayViewMut<f32, IxDyn>) {
                Self::simd_sigmoid_f32_slice_unified(_slice);
                Self::simd_sigmoid_f32_slice_fallback(_slice);
            input.mapv_inplace(|x| 1.0 / (1.0 + (-x).exp()));
    /// SIMD-accelerated sigmoid for f32 _slice using unified operations
    fn simd_sigmoid_f32_slice_unified(slice: &mut [f32]) {
        // For sigmoid, we need to compute 1 / (1 + exp(-x))
        // Using core SIMD operations
        // Compute -x
        let neg_arr = Array1::from_elem(arr.len(), -1.0f32);
        let neg_x = f32::simd_mul(&arr.view(), &neg_arr.view());
        // Apply sigmoid element-wise (no direct SIMD exp available)
        let result: Array1<f32> = neg_x.mapv(|x| 1.0 / (1.0 + x.exp()));
    /// Fallback sigmoid implementation
    fn simd_sigmoid_f32_slice_fallback(slice: &mut [f32]) {
            *val = 1.0 / (1.0 + (-*val).exp());
    /// Vectorized sigmoid activation returning new array
    pub fn simd_sigmoid_f32(input: &ArrayView<f32, IxDyn>) -> ArrayD<f32> {
        Self::simd_sigmoid_f32_inplace(&mut result.view_mut());
    /// SIMD-accelerated tanh activation for f32 arrays
    pub fn simd_tanh_f32_inplace(input: &mut ArrayViewMut<f32, IxDyn>) {
                Self::simd_tanh_f32_slice_unified(_slice);
                Self::simd_tanh_f32_slice_fallback(_slice);
            input.mapv_inplace(|x| x.tanh());
    /// SIMD-accelerated tanh for f32 _slice using unified operations
    fn simd_tanh_f32_slice_unified(slice: &mut [f32]) {
        // For tanh, we apply element-wise since no direct SIMD tanh
        let result = arr.mapv(|x| x.tanh());
    /// Fallback tanh implementation
    fn simd_tanh_f32_slice_fallback(slice: &mut [f32]) {
            *val = val.tanh();
    /// Vectorized tanh activation returning new array
    pub fn simd_tanh_f32(input: &ArrayView<f32, IxDyn>) -> ArrayD<f32> {
        Self::simd_tanh_f32_inplace(&mut result.view_mut());
    /// SIMD-accelerated GELU activation for f32 arrays
    pub fn simd_gelu_f32(input: &ArrayView<f32, IxDyn>) -> ArrayD<f32> {
        if let Some(_slice) = result.as_slice_mut() {
                Self::simd_gelu_f32_slice_unified(_slice);
                Self::simd_gelu_f32_slice_fallback(_slice);
            result.mapv_inplace(|x| {
                0.5 * x * (1.0 + (x * 0.797_884_6 * (1.0 + 0.044715 * x * x)).tanh())
            });
    /// SIMD-accelerated GELU for f32 slice using unified operations
    fn simd_gelu_f32_slice_unified(slice: &mut [f32]) {
        // GELU(x) = 0.5 * x * (1 + tanh(sqrt(2/Ï€) * (x + 0.044715 * x^3)))
        // Using core SIMD for parts of the computation
        // Compute x^2 and x^3 using SIMD
        let x_sq = f32::simd_mul(&arr.view(), &arr.view());
        let x_cube = f32::simd_mul(&x_sq.view(), &arr.view());
        // Compute 0.044715 * x^3
        let coeff2_arr = Array1::from_elem(arr.len(), 0.044715f32);
        let term2 = f32::simd_mul(&coeff2_arr.view(), &x_cube.view());
        // Compute x + 0.044715 * x^3
        let inner1 = f32::simd_add(&arr.view(), &term2.view());
        // Compute sqrt(2/Ï€) * (x + 0.044715 * x^3)
        let coeff1_arr = Array1::from_elem(arr.len(), 0.797_884_6f32);
        let inner2 = f32::simd_mul(&coeff1_arr.view(), &inner1.view());
        // Apply tanh and complete GELU computation
        let tanh_vals = inner2.mapv(|x| x.tanh());
        let one_arr = Array1::from_elem(arr.len(), 1.0f32);
        let factor = f32::simd_add(&one_arr.view(), &tanh_vals.view());
        let half_arr = Array1::from_elem(arr.len(), 0.5f32);
        let temp = f32::simd_mul(&half_arr.view(), &arr.view());
        let result = f32::simd_mul(&temp.view(), &factor.view());
    /// Fallback GELU implementation
    fn simd_gelu_f32_slice_fallback(slice: &mut [f32]) {
            let x = *val;
            *val = 0.5 * x * (1.0 + (x * 0.797_884_6 * (1.0 + 0.044715 * x * x)).tanh());
    /// SIMD-accelerated Swish/SiLU activation for f32 arrays
    pub fn simd_swish_f32(input: &ArrayView<f32, IxDyn>) -> ArrayD<f32> {
                Self::simd_swish_f32_slice_unified(_slice);
                Self::simd_swish_f32_slice_fallback(_slice);
            result.mapv_inplace(|x| x / (1.0 + (-x).exp()));
    /// SIMD-accelerated Swish for f32 _slice using unified operations
    fn simd_swish_f32_slice_unified(slice: &mut [f32]) {
        // Swish(x) = x * sigmoid(x) = x / (1 + exp(-x))
        // Compute sigmoid(x) = 1 / (1 + exp(-x))
        // Since we don't have direct SIMD exp, we apply element-wise
        let result = arr.mapv(|x| x / (1.0 + (-x).exp()));
    /// Fallback Swish implementation
    fn simd_swish_f32_slice_fallback(slice: &mut [f32]) {
            *val = x / (1.0 + (-x).exp());
    /// SIMD-accelerated softmax for f32 arrays
    pub fn simd_softmax_f32(input: &ArrayD<f32>, axis: Option<usize>) -> Result<ArrayD<f32>> {
        let axis = axis.unwrap_or(input.ndim() - 1);
        if axis >= input.ndim() {
            return Err(NeuralError::InvalidArchitecture(
                "Softmax axis out of bounds".to_string(),
            ));
        let mut result = Array::zeros(input.raw_dim());
        // Process along the specified axis
        for lane in input.axis_iter(scirs2_core::ndarray::Axis(axis)) {
            if let (Some(input_slice), Some(mut result_slice)) = (
                lane.as_slice(),
                result.axis_iter_mut(scirs2_core::ndarray::Axis(axis)).next(),
            ) {
                if let Some(result_slice_mut) = result_slice.as_slice_mut() {
                    Self::simd_softmax_f32_slice(input_slice, result_slice_mut);
                }
        Ok(result)
    /// SIMD softmax for f32 slice using unified operations
    fn simd_softmax_f32_slice(input: &[f32], result: &mut [f32]) {
        if input.is_empty() {
            return;
        // Find maximum for numerical stability
        let input_arr = Array1::from_vec(_input.to_vec());
        let max_val = input.iter().fold(f32::NEG_INFINITY, |a, &b| a.max(b));
        let max_arr = Array1::from_elem(_input.len(), max_val);
        // Compute x - max using SIMD
        let shifted = f32::simd_sub(&input_arr.view(), &max_arr.view());
        // Apply exp element-wise (no direct SIMD exp)
        let exp_arr = shifted.mapv(|x| x.exp());
        // Sum all exp values
        let sum = f32::simd_sum(&exp_arr.view());
        // Divide by sum using SIMD
        let sum_arr = Array1::from_elem(exp_arr.len(), sum);
        let softmax_result = f32::simd_div(&exp_arr.view(), &sum_arr.view());
        // Copy result
        for (i, &val) in softmax_result.iter().enumerate() {
            result[i] = val;
    /// SIMD-accelerated cross-entropy loss computation
    pub fn simd_cross_entropy_loss_f32(
        predictions: &ArrayView<f32, IxDyn>,
        targets: &ArrayView<f32, IxDyn>,
        epsilon: f32,
    ) -> Result<f32> {
        if predictions.shape() != targets.shape() {
            return Err(NeuralError::ComputationError(
                "Predictions and targets must have the same shape".to_string(),
        if let (Some(pred_slice), Some(target_slice)) = (predictions.as_slice(), targets.as_slice())
        {
            Ok(Self::simd_cross_entropy_f32_slices(
                pred_slice,
                target_slice,
                epsilon,
            ))
            // Fallback
            let mut loss = 0.0;
            for (&pred, &target) in predictions.iter().zip(targets.iter()) {
                let clamped_pred = pred.max(epsilon).min(1.0 - epsilon);
                loss -= target * clamped_pred.ln();
            Ok(loss / predictions.len() as f32)
    /// SIMD cross-entropy for f32 slices using unified operations
    fn simd_cross_entropy_f32_slices(predictions: &[f32], targets: &[f32], epsilon: f32) -> f32 {
        let len = predictions.len().min(targets.len());
        let pred_arr = Array1::from_vec(predictions[..len].to_vec());
        let target_arr = Array1::from_vec(targets[..len].to_vec());
        // Clamp predictions to [epsilon, 1-epsilon] using SIMD
        let eps_arr = Array1::from_elem(len, epsilon);
        let one_minus_eps_arr = Array1::from_elem(len, 1.0 - epsilon);
        let clamped_lower = f32::simd_max(&pred_arr.view(), &eps_arr.view());
        let clamped_pred = f32::simd_min(&clamped_lower.view(), &one_minus_eps_arr.view());
        // Apply ln element-wise (no direct SIMD ln)
        let log_pred = clamped_pred.mapv(|x| x.ln());
        // Compute -target * log(pred) using SIMD
        let neg_target_arr = f32::simd_scalar_mul(&target_arr.view(), -1.0);
        let loss_values = f32::simd_mul(&neg_target_arr.view(), &log_pred.view());
        // Sum all loss values
        let total_loss = f32::simd_sum(&loss_values.view());
        total_loss / len as f32
    /// SIMD-accelerated matrix multiplication
    pub fn simd_matmul_f32(
        a: &ArrayView<f32, IxDyn>,
        b: &ArrayView<f32, IxDyn>,
    ) -> Result<ArrayD<f32>> {
        if a.ndim() != 2 || b.ndim() != 2 {
                "SIMD matmul only supports 2D arrays".to_string(),
        let (m, k) = (a.shape()[0], a.shape()[1]);
        let (k2, n) = (b.shape()[0], b.shape()[1]);
        if k != k2 {
                "Matrix dimensions don't match for multiplication".to_string(),
        let mut result = Array::zeros((m, n));
        // Use SIMD for inner loop vectorization
        for i in 0..m {
            for j in 0..n {
                let mut sum = 0.0f32;
                // SIMD-accelerated dot product
                let a_row_view = a.slice(scirs2_core::ndarray::s![i, ..]);
                let a_row_slice = if a.is_standard_layout() {
                    a_row_view.as_slice()
                } else {
                    None
                };
                if let Some(a_row) = a_row_slice {
                    sum = Self::simd_dot_product_f32(a_row, &Self::extract_column(b, j));
                    // Fallback to standard computation
                    for l in 0..k {
                        sum += a[[i, l]] * b[[l, j]];
                    }
                result[[i, j]] = sum;
        Ok(result.into_dyn())
    /// Extract column from 2D array for SIMD operations
    fn extract_column(_matrix: &ArrayView<f32, IxDyn>, colidx: usize) -> Vec<f32> {
        let rows = matrix.shape()[0];
        let mut column = Vec::with_capacity(rows);
        for i in 0..rows {
            column.push(_matrix[[i, col_idx]]);
        column
    /// SIMD-accelerated dot product using unified operations
    fn simd_dot_product_f32(a: &[f32], b: &[f32]) -> f32 {
        let len = a.len().min(b.len());
        let a_arr = Array1::from_vec(a[..len].to_vec());
        let b_arr = Array1::from_vec(b[..len].to_vec());
        // Compute element-wise product using SIMD
        let product = f32::simd_mul(&a_arr.view(), &b_arr.view());
        // Sum all products using SIMD
        f32::simd_sum(&product.view())
    /// SIMD-accelerated element-wise addition
    pub fn simd_add_f32(
        if a.shape() != b.shape() {
                "Arrays must have the same shape for addition".to_string(),
        let mut result = Array::zeros(a.raw_dim());
        let caps = PlatformCapabilities::detect();
        if caps.simd_available && f32::simd_available() {
            if let (Some(a_slice), Some(b_slice), Some(result_slice)) =
                (a.as_slice(), b.as_slice(), result.as_slice_mut())
            {
                Self::simd_add_f32_slices(a_slice, b_slice, result_slice);
                // Fallback for non-contiguous arrays
                for ((a_val, b_val), result_val) in a.iter().zip(b.iter()).zip(result.iter_mut()) {
                    *result_val = a_val + b_val;
            // Fallback for non-SIMD
            for ((a_val, b_val), result_val) in a.iter().zip(b.iter()).zip(result.iter_mut()) {
                *result_val = a_val + b_val;
    /// SIMD element-wise addition for slices using unified operations
    fn simd_add_f32_slices(a: &[f32], b: &[f32], result: &mut [f32]) {
        let len = a.len().min(b.len()).min(result.len());
        // Add using SIMD
        let sum = f32::simd_add(&a_arr.view(), &b_arr.view());
        for (i, &val) in sum.iter().enumerate() {
    /// SIMD-accelerated convolution operation
    pub fn simd_conv2d_f32(
        input: &ArrayView<f32, IxDyn>,
        kernel: &ArrayView<f32, IxDyn>,
        bias: Option<&[f32]>,
        stride: (usize, usize),
        padding: (usize, usize),
        if input.ndim() != 4 || kernel.ndim() != 4 {
                "Input and kernel must be 4D arrays (batch, channels, height, width)".to_string(),
        let (batch_size, in_channels, in_height, in_width) = (
            input.shape()[0],
            input.shape()[1],
            input.shape()[2],
            input.shape()[3],
        );
        let (out_channels_, kernel_height, kernel_width) = (
            kernel.shape()[0],
            kernel.shape()[1],
            kernel.shape()[2],
            kernel.shape()[3],
        let out_height = (in_height + 2 * padding.0 - kernel_height) / stride.0 + 1;
        let out_width = (in_width + 2 * padding.1 - kernel_width) / stride.1 + 1;
        let mut output = Array::zeros((batch_size, out_channels, out_height, out_width));
        // SIMD-optimized convolution
        for batch in 0..batch_size {
            for out_ch in 0..out_channels {
                for out_h in 0..out_height {
                    for out_w in 0..out_width {
                        let mut sum = 0.0f32;
                        for in_ch in 0..in_channels {
                            for kh in 0..kernel_height {
                                for kw in 0..kernel_width {
                                    let in_h = out_h * stride.0 + kh;
                                    let in_w = out_w * stride.1 + kw;
                                    if in_h >= padding.0
                                        && in_w >= padding.1
                                        && in_h - padding.0 < in_height
                                        && in_w - padding.1 < in_width
                                    {
                                        let input_val = input
                                            [[batch, in_ch, in_h - padding.0, in_w - padding.1]];
                                        let kernel_val = kernel[[out_ch, in_ch, kh, kw]];
                                        sum += input_val * kernel_val;
                                    }
                                }
                            }
                        }
                        // Add bias if provided
                        if let Some(b) = bias {
                            sum += b[out_ch % b.len()];
                        output[[batch, out_ch, out_h, out_w]] = sum;
        Ok(output.into_dyn())
    /// SIMD-accelerated batch normalization
    pub fn simd_batch_norm_f32(
        gamma: Option<&[f32]>,
        beta: Option<&[f32]>,
        mean: &[f32],
        variance: &[f32],
        if input.ndim() < 2 {
                "Input must have at least 2 dimensions".to_string(),
        let channels = input.shape()[1];
        if mean.len() != channels || variance.len() != channels {
                "Mean and variance must have same length as input channels".to_string(),
        // Process each element
        for (idx, (&input_val, result_val)) in input.iter().zip(result.iter_mut()).enumerate() {
            // Calculate which channel this element belongs to
            let flat_idx = idx;
            let total_spatial = input.len() / (input.shape()[0] * channels);
            let _spatial_idx = flat_idx % total_spatial;
            let channel_batch_idx = flat_idx / total_spatial;
            let channel = channel_batch_idx % channels;
            // Normalize: (x - mean) / sqrt(var + epsilon)
            let normalized = (input_val - mean[channel]) / (variance[channel] + epsilon).sqrt();
            // Scale and shift: gamma * normalized + beta
            let mut output = normalized;
            if let Some(g) = gamma {
                output *= g[channel % g.len()];
            if let Some(b) = beta {
                output += b[channel % b.len()];
            *result_val = output;
}
// Provide no-op implementations when SIMD is not available
/// SIMD operations for neural network computations (fallback implementation)
#[cfg(not(feature = "simd"))]
    /// Apply ReLU activation in-place using SIMD operations
    pub fn simd_relu_f32_inplace(input: &mut ArrayViewMut<f32, IxDyn>) {
        // No-op fallback
    /// Apply ReLU activation using SIMD operations
    pub fn simd_relu_f32(input: &ArrayView<f32, IxDyn>) -> ArrayD<f32> {
        // Fallback to standard operation
        input.mapv(|x| x.max(0.0))
    /// Apply sigmoid activation in-place using SIMD operations
    pub fn simd_sigmoid_f32_inplace(input: &mut ArrayViewMut<f32, IxDyn>) {
    /// Apply sigmoid activation using SIMD operations
    pub fn simd_sigmoid_f32(input: &ArrayView<f32, IxDyn>) -> ArrayD<f32> {
        input.mapv(|x| 1.0 / (1.0 + (-x).exp()))
    /// Apply tanh activation in-place using SIMD operations
    pub fn simd_tanh_f32_inplace(input: &mut ArrayViewMut<f32, IxDyn>) {
    /// Apply tanh activation using SIMD operations
    pub fn simd_tanh_f32(input: &ArrayView<f32, IxDyn>) -> ArrayD<f32> {
        input.mapv(|x| x.tanh())
    /// Apply GELU activation using SIMD operations
    pub fn simd_gelu_f32(input: &ArrayView<f32, IxDyn>) -> ArrayD<f32> {
        input.mapv(|x| 0.5 * x * (1.0 + (x * 0.797_884_6 * (1.0 + 0.044715 * x * x)).tanh()))
    /// Apply Swish activation using SIMD operations
    pub fn simd_swish_f32(input: &ArrayView<f32, IxDyn>) -> ArrayD<f32> {
        input.mapv(|x| x / (1.0 + (-x).exp()))
    /// Apply softmax activation using SIMD operations
    pub fn simd_softmax_f32(_input: &ArrayD<f32>, axis: Option<usize>) -> Result<ArrayD<f32>> {
        Err(NeuralError::ComputationError(
            "SIMD softmax requires 'simd' feature".to_string(),
        ))
    /// Compute cross-entropy loss using SIMD operations
        _predictions: &ArrayView<f32, IxDyn>, _targets: &ArrayView<f32, IxDyn>, _epsilon: f32,
            "SIMD cross entropy requires 'simd' feature".to_string(),
    /// Perform matrix multiplication using SIMD operations
        _a: &ArrayView<f32, IxDyn>, _b: &ArrayView<f32, IxDyn>,
            "SIMD matmul requires 'simd' feature".to_string(),
    /// Perform element-wise addition using SIMD operations
            "SIMD add requires 'simd' feature".to_string(),
    /// Perform 2D convolution using SIMD operations
        _input: &ArrayView<f32, IxDyn>, _kernel: &ArrayView<f32, IxDyn>, _bias: Option<&[f32]>, _stride: (usize, usize), _padding: (usize, usize),
            "SIMD conv2d requires 'simd' feature".to_string(),
    /// Perform batch normalization using SIMD operations
        _gamma: Option<&[f32]>, _beta: Option<&[f32]>, _mean: &[f32], _variance: &[f32],
            "SIMD batch norm requires 'simd' feature".to_string(),