Skip to main content

trueno/vector/ops/activations/
mod.rs

1//! Activation functions for Vector<f32>
2//!
3//! This module provides neural network activation functions optimized with
4//! multi-backend SIMD support (Scalar, SSE2, AVX2, AVX-512, NEON, WASM SIMD).
5//!
6//! ## Activation Functions
7//!
8//! - [`softmax`](crate::Vector::softmax): Softmax normalization for classification
9//! - [`log_softmax`](crate::Vector::log_softmax): Numerically stable log-softmax
10//! - [`relu`](crate::Vector::relu): Rectified Linear Unit
11//! - [`sigmoid`](crate::Vector::sigmoid): Logistic sigmoid
12//! - [`leaky_relu`](crate::Vector::leaky_relu): Leaky ReLU with configurable slope
13//! - [`elu`](crate::Vector::elu): Exponential Linear Unit
14//! - [`gelu`](crate::Vector::gelu): Gaussian Error Linear Unit
15//! - [`swish`](crate::Vector::swish): Self-gated activation (SiLU)
16//! - [`hardswish`](crate::Vector::hardswish): Efficient hardware-friendly swish
17//! - [`mish`](crate::Vector::mish): Self-regularizing activation
18//! - [`selu`](crate::Vector::selu): Scaled Exponential Linear Unit
19
20mod advanced;
21
22#[cfg(test)]
23mod tests;
24
25use crate::backends::scalar::ScalarBackend;
26use crate::backends::VectorBackend;
27use crate::vector::Vector;
28use crate::{Backend, Result, TruenoError};
29
/// Backend dispatch macro for unary operations - centralizes platform-specific SIMD dispatch
/// to eliminate code duplication across activation functions.
///
/// # Arguments (macro metavariables)
/// - `$backend`: a `Backend` enum value selecting the execution backend.
/// - `$op`: identifier of the unary backend method to invoke (e.g. `relu`, `sigmoid`).
/// - `$input`: `&[f32]` input slice.
/// - `$output`: `&mut [f32]` output slice the backend writes results into.
///
/// Backends not compiled for the current target architecture (guarded by
/// `#[cfg(...)]`) fall back to `ScalarBackend`, so the match is exhaustive on
/// every platform. `Backend::GPU` and `Backend::Auto` also fall back to scalar
/// here; GPU dispatch is handled separately by the callers.
///
/// # Safety
/// The macro wraps unsafe backend calls internally, so callers don't need unsafe blocks.
macro_rules! dispatch_unary_op {
    ($backend:expr, $op:ident, $input:expr, $output:expr) => {{
        #[cfg(target_arch = "x86_64")]
        use crate::backends::{avx2::Avx2Backend, sse2::Sse2Backend};
        // SAFETY: CPU features verified at runtime before backend selection
        unsafe {
            match $backend {
                Backend::Scalar => ScalarBackend::$op($input, $output),
                // AVX is serviced by the SSE2 implementation (no dedicated AVX backend).
                #[cfg(target_arch = "x86_64")]
                Backend::SSE2 | Backend::AVX => Sse2Backend::$op($input, $output),
                // AVX-512 is serviced by the AVX2 implementation.
                #[cfg(target_arch = "x86_64")]
                Backend::AVX2 | Backend::AVX512 => Avx2Backend::$op($input, $output),
                // Non-x86_64 targets: all x86 backend selections degrade to scalar.
                #[cfg(not(target_arch = "x86_64"))]
                Backend::SSE2 | Backend::AVX | Backend::AVX2 | Backend::AVX512 => {
                    ScalarBackend::$op($input, $output)
                }
                #[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
                Backend::NEON => {
                    use crate::backends::neon::NeonBackend;
                    NeonBackend::$op($input, $output)
                }
                // NEON requested on a non-ARM target: scalar fallback.
                #[cfg(not(any(target_arch = "aarch64", target_arch = "arm")))]
                Backend::NEON => ScalarBackend::$op($input, $output),
                #[cfg(target_arch = "wasm32")]
                Backend::WasmSIMD => {
                    use crate::backends::wasm::WasmBackend;
                    WasmBackend::$op($input, $output)
                }
                // WASM SIMD requested outside wasm32: scalar fallback.
                #[cfg(not(target_arch = "wasm32"))]
                Backend::WasmSIMD => ScalarBackend::$op($input, $output),
                // GPU/Auto are not dispatched through this macro; use scalar here.
                Backend::GPU | Backend::Auto => ScalarBackend::$op($input, $output),
            }
        }
    }};
}
70
71// Re-export macro for use in advanced submodule
72pub(crate) use dispatch_unary_op;
73
74impl Vector<f32> {
75    /// Softmax activation function
76    ///
77    /// Converts a vector of real values into a probability distribution.
78    /// Formula: softmax(x)\[i\] = exp(x\[i\] - max(x)) / sum(exp(x\[j\] - max(x)))
79    ///
80    /// Uses the numerically stable version with max subtraction to prevent overflow.
81    /// The output is a probability distribution: all values in [0, 1] and sum to 1.
82    ///
83    /// This is the standard activation function for multi-class classification in neural networks.
84    ///
85    /// # Examples
86    ///
87    /// ```
88    /// use trueno::Vector;
89    ///
90    /// let logits = Vector::from_slice(&[1.0, 2.0, 3.0]);
91    /// let probs = logits.softmax()?;
92    ///
93    /// // Verify sum ≈ 1
94    /// let sum: f32 = probs.as_slice().iter().sum();
95    /// assert!((sum - 1.0).abs() < 1e-5);
96    ///
97    /// // Verify all values in [0, 1]
98    /// for &p in probs.as_slice() {
99    ///     assert!(p >= 0.0 && p <= 1.0);
100    /// }
101    /// # Ok::<(), trueno::TruenoError>(())
102    /// ```
103    ///
104    /// # Empty vectors
105    ///
106    /// Returns EmptyVector error for empty vectors (cannot compute softmax).
107    pub fn softmax(&self) -> Result<Self> {
108        if self.data.is_empty() {
109            return Err(TruenoError::EmptyVector);
110        }
111
112        // OpComplexity::Medium - GPU threshold: >10K elements (multi-pass overhead)
113        #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
114        const GPU_THRESHOLD: usize = usize::MAX; // GPU DISABLED - 4-368x slower, see docs/performance-analysis.md
115
116        // Try GPU first for large vectors
117        #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
118        {
119            if self.data.len() >= GPU_THRESHOLD {
120                use crate::backends::gpu::GpuDevice;
121                if GpuDevice::is_available() {
122                    let gpu = GpuDevice::new().map_err(TruenoError::InvalidInput)?;
123                    let mut result = vec![0.0; self.data.len()];
124                    if gpu.softmax(&self.data, &mut result).is_ok() {
125                        return Ok(Vector::from_vec(result));
126                    }
127                }
128            }
129        }
130
131        // Scalar fallback: Multi-pass softmax for numerical stability
132        // Find max for numerical stability (prevents overflow in exp)
133        let max_val = self.max()?;
134
135        // Compute exp(x - max) for each element
136        let exp_vals: Vec<f32> = self.data.iter().map(|&x| (x - max_val).exp()).collect();
137
138        // Compute sum of exponentials
139        let sum_exp: f32 = exp_vals.iter().sum();
140
141        // Normalize by sum (guard against sum=0 from underflow)
142        let safe_sum = sum_exp.max(f32::EPSILON);
143        let data: Vec<f32> = exp_vals.iter().map(|&e| e / safe_sum).collect();
144
145        Ok(Vector::from_vec(data))
146    }
147
148    /// Log-softmax activation function
149    ///
150    /// Computes the logarithm of the softmax function in a numerically stable way.
151    /// Formula: log_softmax(x)\[i\] = x\[i\] - max(x) - log(sum(exp(x\[j\] - max(x))))
152    ///
153    /// This is more numerically stable than computing log(softmax(x)) and is commonly
154    /// used in neural networks for computing cross-entropy loss.
155    ///
156    /// # Examples
157    ///
158    /// ```
159    /// use trueno::Vector;
160    ///
161    /// let logits = Vector::from_slice(&[1.0, 2.0, 3.0]);
162    /// let log_probs = logits.log_softmax()?;
163    ///
164    /// // Verify exp(log_softmax) = softmax
165    /// let probs_from_log: Vec<f32> = log_probs.as_slice().iter().map(|&x| x.exp()).collect();
166    /// let sum: f32 = probs_from_log.iter().sum();
167    /// assert!((sum - 1.0).abs() < 1e-5);
168    /// # Ok::<(), trueno::TruenoError>(())
169    /// ```
170    ///
171    /// # Empty vectors
172    ///
173    /// Returns EmptyVector error for empty vectors.
174    pub fn log_softmax(&self) -> Result<Self> {
175        if self.data.is_empty() {
176            return Err(TruenoError::EmptyVector);
177        }
178
179        // OpComplexity::Medium - GPU threshold: >10K elements (multi-pass overhead)
180        #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
181        const GPU_THRESHOLD: usize = usize::MAX; // GPU DISABLED - 4-368x slower, see docs/performance-analysis.md
182
183        // Try GPU first for large vectors
184        #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
185        {
186            if self.data.len() >= GPU_THRESHOLD {
187                use crate::backends::gpu::GpuDevice;
188                if GpuDevice::is_available() {
189                    let gpu = GpuDevice::new().map_err(TruenoError::InvalidInput)?;
190                    let mut result = vec![0.0; self.data.len()];
191                    if gpu.log_softmax(&self.data, &mut result).is_ok() {
192                        return Ok(Vector::from_vec(result));
193                    }
194                }
195            }
196        }
197
198        // Scalar fallback: Multi-pass log_softmax for numerical stability
199        // Find max for numerical stability
200        let max_val = self.max()?;
201
202        // Compute exp(x - max) for each element
203        let exp_vals: Vec<f32> = self.data.iter().map(|&x| (x - max_val).exp()).collect();
204
205        // Compute log of sum of exponentials
206        let sum_exp: f32 = exp_vals.iter().sum();
207        let log_sum_exp = sum_exp.max(f32::EPSILON).ln();
208
209        // log_softmax(x)[i] = x[i] - max - log_sum_exp
210        let data: Vec<f32> = self.data.iter().map(|&x| x - max_val - log_sum_exp).collect();
211
212        Ok(Vector::from_vec(data))
213    }
214
215    /// ReLU (Rectified Linear Unit) activation function
216    ///
217    /// Computes the element-wise ReLU: max(0, x).
218    /// ReLU is one of the most widely used activation functions in neural networks.
219    ///
220    /// # Formula
221    ///
222    /// ```text
223    /// relu(x)[i] = max(0, x\[i\])
224    ///            = x\[i\]  if x\[i\] > 0
225    ///            = 0     otherwise
226    /// ```
227    ///
228    /// # Properties
229    ///
230    /// - **Non-linearity**: Introduces non-linearity while preserving linearity for positive values
231    /// - **Sparsity**: Produces exactly zero for negative inputs (sparse activations)
232    /// - **Gradient**: Derivative is 1 for positive inputs, 0 for negative (solves vanishing gradient)
233    /// - **Computational efficiency**: Simple max operation, no exponentials
234    ///
235    /// # Applications
236    ///
237    /// - **Deep neural networks**: Default activation for hidden layers
238    /// - **Convolutional networks**: Standard activation in CNNs
239    /// - **Feature learning**: Encourages sparse representations
240    ///
241    /// # Performance
242    ///
243    /// This operation is memory-bound. SIMD provides modest speedups since
244    /// the computation (comparison and selection) is simpler than memory access.
245    ///
246    /// # Errors
247    ///
248    /// Returns `EmptyVector` if the input vector is empty.
249    ///
250    /// # Examples
251    ///
252    /// ```
253    /// use trueno::Vector;
254    ///
255    /// let v = Vector::from_slice(&[-2.0, -1.0, 0.0, 1.0, 2.0]);
256    /// let result = v.relu()?;
257    /// assert_eq!(result.as_slice(), &[0.0, 0.0, 0.0, 1.0, 2.0]);
258    /// # Ok::<(), trueno::TruenoError>(())
259    /// ```
260    pub fn relu(&self) -> Result<Self> {
261        if self.data.is_empty() {
262            return Err(TruenoError::EmptyVector);
263        }
264
265        // OpComplexity::Low - GPU threshold: >100K elements
266        #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
267        const GPU_THRESHOLD: usize = usize::MAX; // GPU DISABLED - 2-800x slower, see docs/performance-analysis.md
268
269        // Try GPU first for large vectors
270        #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
271        {
272            if self.data.len() >= GPU_THRESHOLD {
273                use crate::backends::gpu::GpuDevice;
274                if GpuDevice::is_available() {
275                    let gpu = GpuDevice::new().map_err(TruenoError::InvalidInput)?;
276                    let mut result = vec![0.0; self.data.len()];
277                    if gpu.relu(&self.data, &mut result).is_ok() {
278                        return Ok(Vector::from_vec(result));
279                    }
280                }
281            }
282        }
283
284        // Uninit: dispatch_unary_op writes every element before any read.
285        let n = self.len();
286        let mut result: Vec<f32> = Vec::with_capacity(n);
287        // SAFETY: Backend activation writes all elements before any read.
288        unsafe {
289            result.set_len(n);
290        }
291
292        // Use parallel processing for very large arrays (reduces TLB pressure and improves cache utilization)
293        #[cfg(feature = "parallel")]
294        {
295            const PARALLEL_THRESHOLD: usize = 500_000; // Increased to avoid overhead at smaller sizes
296            const CHUNK_SIZE: usize = 65536; // 64K elements = 256KB, cache-friendly
297
298            if self.len() >= PARALLEL_THRESHOLD {
299                use rayon::prelude::*;
300
301                self.data.par_chunks(CHUNK_SIZE).zip(result.par_chunks_mut(CHUNK_SIZE)).for_each(
302                    |(chunk_in, chunk_out)| {
303                        dispatch_unary_op!(self.backend, relu, chunk_in, chunk_out);
304                    },
305                );
306
307                return Ok(Vector::from_vec(result)); // Use from_vec to avoid extra copy
308            }
309        }
310
311        // Sequential processing for small arrays or when parallel feature disabled
312        dispatch_unary_op!(self.backend, relu, &self.data, &mut result);
313
314        Ok(Vector::from_vec(result)) // Use from_vec to avoid extra copy
315    }
316
317    /// Sigmoid (logistic) activation function
318    ///
319    /// Computes the element-wise sigmoid: σ(x) = 1 / (1 + e^(-x)).
320    /// Sigmoid is a classic activation function that squashes inputs to the range (0, 1).
321    ///
322    /// # Formula
323    ///
324    /// ```text
325    /// sigmoid(x)[i] = 1 / (1 + exp(-x\[i\]))
326    ///               = exp(x\[i\]) / (1 + exp(x\[i\]))
327    /// ```
328    ///
329    /// # Properties
330    ///
331    /// - **Bounded output**: Maps all inputs to (0, 1) range
332    /// - **Smooth**: Infinitely differentiable (C^∞)
333    /// - **Symmetric**: σ(-x) = 1 - σ(x)
334    /// - **Derivative**: σ'(x) = σ(x) * (1 - σ(x))
335    /// - **Interpretable**: Output can be interpreted as probability
336    ///
337    /// # Applications
338    ///
339    /// - **Binary classification**: Final layer for binary output (0 or 1)
340    /// - **Logistic regression**: Traditional ML algorithm
341    /// - **Gating mechanisms**: LSTM/GRU gates (input, forget, output)
342    /// - **Attention mechanisms**: Soft attention weights
343    ///
344    /// # Numerical Considerations
345    ///
346    /// For very large negative inputs (x < -50), exp(-x) overflows to infinity.
347    /// However, sigmoid(x) approaches 0, so we return 0 for numerical stability.
348    /// For very large positive inputs (x > 50), exp(-x) underflows to 0,
349    /// and sigmoid(x) approaches 1.
350    ///
351    /// # Performance
352    ///
353    /// This operation is compute-bound due to the exp() operation. SIMD provides
354    /// modest speedups, but the exponential is the bottleneck.
355    ///
356    /// # Errors
357    ///
358    /// Returns `EmptyVector` if the input vector is empty.
359    ///
360    /// # Examples
361    ///
362    /// ```
363    /// use trueno::Vector;
364    ///
365    /// let v = Vector::from_slice(&[-2.0, 0.0, 2.0]);
366    /// let result = v.sigmoid()?;
367    ///
368    /// // sigmoid(-2) ≈ 0.119, sigmoid(0) = 0.5, sigmoid(2) ≈ 0.881
369    /// assert!((result.as_slice()[0] - 0.119).abs() < 0.001);
370    /// assert!((result.as_slice()[1] - 0.5).abs() < 0.001);
371    /// assert!((result.as_slice()[2] - 0.881).abs() < 0.001);
372    /// # Ok::<(), trueno::TruenoError>(())
373    /// ```
374    pub fn sigmoid(&self) -> Result<Self> {
375        if self.data.is_empty() {
376            return Err(TruenoError::EmptyVector);
377        }
378
379        // OpComplexity::Low - GPU threshold: >100K elements
380        #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
381        const GPU_THRESHOLD: usize = usize::MAX; // GPU DISABLED - 2-800x slower, see docs/performance-analysis.md
382
383        // Try GPU first for large vectors
384        #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
385        {
386            if self.data.len() >= GPU_THRESHOLD {
387                use crate::backends::gpu::GpuDevice;
388                if GpuDevice::is_available() {
389                    let gpu = GpuDevice::new().map_err(TruenoError::InvalidInput)?;
390                    let mut result = vec![0.0; self.data.len()];
391                    if gpu.sigmoid(&self.data, &mut result).is_ok() {
392                        return Ok(Vector::from_vec(result));
393                    }
394                }
395            }
396        }
397
398        // Uninit: dispatch_unary_op writes every element before any read.
399        let n = self.len();
400        let mut result: Vec<f32> = Vec::with_capacity(n);
401        // SAFETY: Backend activation writes all elements before any read.
402        unsafe {
403            result.set_len(n);
404        }
405
406        // Dispatch to appropriate backend
407        dispatch_unary_op!(self.backend, sigmoid, &self.data, &mut result);
408
409        Ok(Vector::from_vec(result))
410    }
411}