// trueno/vector/ops/activations/mod.rs
1//! Activation functions for Vector<f32>
2//!
3//! This module provides neural network activation functions optimized with
4//! multi-backend SIMD support (Scalar, SSE2, AVX2, AVX-512, NEON, WASM SIMD).
5//!
6//! ## Activation Functions
7//!
8//! - [`softmax`](crate::Vector::softmax): Softmax normalization for classification
9//! - [`log_softmax`](crate::Vector::log_softmax): Numerically stable log-softmax
10//! - [`relu`](crate::Vector::relu): Rectified Linear Unit
11//! - [`sigmoid`](crate::Vector::sigmoid): Logistic sigmoid
12//! - [`leaky_relu`](crate::Vector::leaky_relu): Leaky ReLU with configurable slope
13//! - [`elu`](crate::Vector::elu): Exponential Linear Unit
14//! - [`gelu`](crate::Vector::gelu): Gaussian Error Linear Unit
15//! - [`swish`](crate::Vector::swish): Self-gated activation (SiLU)
16//! - [`hardswish`](crate::Vector::hardswish): Efficient hardware-friendly swish
17//! - [`mish`](crate::Vector::mish): Self-regularizing activation
18//! - [`selu`](crate::Vector::selu): Scaled Exponential Linear Unit
19
20mod advanced;
21
22#[cfg(test)]
23mod tests;
24
25use crate::backends::scalar::ScalarBackend;
26use crate::backends::VectorBackend;
27use crate::vector::Vector;
28use crate::{Backend, Result, TruenoError};
29
/// Backend dispatch macro for unary operations - centralizes platform-specific SIMD dispatch
/// to eliminate code duplication across activation functions.
///
/// Expands to a `match` over the runtime-selected [`Backend`], routing `$op`
/// (an ident naming a backend method, e.g. `relu`) to the implementation for
/// the current target, reading from `$input` and writing into `$output`.
/// Backends that are not compiled for the current target architecture
/// degrade to `ScalarBackend`.
///
/// # Safety
/// The macro wraps unsafe backend calls internally, so callers don't need unsafe blocks.
macro_rules! dispatch_unary_op {
    ($backend:expr, $op:ident, $input:expr, $output:expr) => {{
        #[cfg(target_arch = "x86_64")]
        use crate::backends::{avx2::Avx2Backend, sse2::Sse2Backend};
        // SAFETY: CPU features verified at runtime before backend selection
        unsafe {
            match $backend {
                Backend::Scalar => ScalarBackend::$op($input, $output),
                // x86_64 tiers without dedicated kernels map down one level:
                // AVX uses the SSE2 implementation, AVX-512 uses the AVX2 one.
                #[cfg(target_arch = "x86_64")]
                Backend::SSE2 | Backend::AVX => Sse2Backend::$op($input, $output),
                #[cfg(target_arch = "x86_64")]
                Backend::AVX2 | Backend::AVX512 => Avx2Backend::$op($input, $output),
                // On non-x86 targets any x86 backend request falls back to scalar.
                #[cfg(not(target_arch = "x86_64"))]
                Backend::SSE2 | Backend::AVX | Backend::AVX2 | Backend::AVX512 => {
                    ScalarBackend::$op($input, $output)
                }
                #[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
                Backend::NEON => {
                    use crate::backends::neon::NeonBackend;
                    NeonBackend::$op($input, $output)
                }
                #[cfg(not(any(target_arch = "aarch64", target_arch = "arm")))]
                Backend::NEON => ScalarBackend::$op($input, $output),
                #[cfg(target_arch = "wasm32")]
                Backend::WasmSIMD => {
                    use crate::backends::wasm::WasmBackend;
                    WasmBackend::$op($input, $output)
                }
                #[cfg(not(target_arch = "wasm32"))]
                Backend::WasmSIMD => ScalarBackend::$op($input, $output),
                // GPU offload is decided per-op before this macro runs (see the
                // GPU_THRESHOLD blocks in callers); Auto should already have been
                // resolved to a concrete backend — both fall back to scalar here.
                Backend::GPU | Backend::Auto => ScalarBackend::$op($input, $output),
            }
        }
    }};
}
70
71// Re-export macro for use in advanced submodule
72pub(crate) use dispatch_unary_op;
73
74impl Vector<f32> {
75 /// Softmax activation function
76 ///
77 /// Converts a vector of real values into a probability distribution.
78 /// Formula: softmax(x)\[i\] = exp(x\[i\] - max(x)) / sum(exp(x\[j\] - max(x)))
79 ///
80 /// Uses the numerically stable version with max subtraction to prevent overflow.
81 /// The output is a probability distribution: all values in [0, 1] and sum to 1.
82 ///
83 /// This is the standard activation function for multi-class classification in neural networks.
84 ///
85 /// # Examples
86 ///
87 /// ```
88 /// use trueno::Vector;
89 ///
90 /// let logits = Vector::from_slice(&[1.0, 2.0, 3.0]);
91 /// let probs = logits.softmax()?;
92 ///
93 /// // Verify sum ≈ 1
94 /// let sum: f32 = probs.as_slice().iter().sum();
95 /// assert!((sum - 1.0).abs() < 1e-5);
96 ///
97 /// // Verify all values in [0, 1]
98 /// for &p in probs.as_slice() {
99 /// assert!(p >= 0.0 && p <= 1.0);
100 /// }
101 /// # Ok::<(), trueno::TruenoError>(())
102 /// ```
103 ///
104 /// # Empty vectors
105 ///
106 /// Returns EmptyVector error for empty vectors (cannot compute softmax).
107 pub fn softmax(&self) -> Result<Self> {
108 if self.data.is_empty() {
109 return Err(TruenoError::EmptyVector);
110 }
111
112 // OpComplexity::Medium - GPU threshold: >10K elements (multi-pass overhead)
113 #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
114 const GPU_THRESHOLD: usize = usize::MAX; // GPU DISABLED - 4-368x slower, see docs/performance-analysis.md
115
116 // Try GPU first for large vectors
117 #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
118 {
119 if self.data.len() >= GPU_THRESHOLD {
120 use crate::backends::gpu::GpuDevice;
121 if GpuDevice::is_available() {
122 let gpu = GpuDevice::new().map_err(TruenoError::InvalidInput)?;
123 let mut result = vec![0.0; self.data.len()];
124 if gpu.softmax(&self.data, &mut result).is_ok() {
125 return Ok(Vector::from_vec(result));
126 }
127 }
128 }
129 }
130
131 // Scalar fallback: Multi-pass softmax for numerical stability
132 // Find max for numerical stability (prevents overflow in exp)
133 let max_val = self.max()?;
134
135 // Compute exp(x - max) for each element
136 let exp_vals: Vec<f32> = self.data.iter().map(|&x| (x - max_val).exp()).collect();
137
138 // Compute sum of exponentials
139 let sum_exp: f32 = exp_vals.iter().sum();
140
141 // Normalize by sum (guard against sum=0 from underflow)
142 let safe_sum = sum_exp.max(f32::EPSILON);
143 let data: Vec<f32> = exp_vals.iter().map(|&e| e / safe_sum).collect();
144
145 Ok(Vector::from_vec(data))
146 }
147
148 /// Log-softmax activation function
149 ///
150 /// Computes the logarithm of the softmax function in a numerically stable way.
151 /// Formula: log_softmax(x)\[i\] = x\[i\] - max(x) - log(sum(exp(x\[j\] - max(x))))
152 ///
153 /// This is more numerically stable than computing log(softmax(x)) and is commonly
154 /// used in neural networks for computing cross-entropy loss.
155 ///
156 /// # Examples
157 ///
158 /// ```
159 /// use trueno::Vector;
160 ///
161 /// let logits = Vector::from_slice(&[1.0, 2.0, 3.0]);
162 /// let log_probs = logits.log_softmax()?;
163 ///
164 /// // Verify exp(log_softmax) = softmax
165 /// let probs_from_log: Vec<f32> = log_probs.as_slice().iter().map(|&x| x.exp()).collect();
166 /// let sum: f32 = probs_from_log.iter().sum();
167 /// assert!((sum - 1.0).abs() < 1e-5);
168 /// # Ok::<(), trueno::TruenoError>(())
169 /// ```
170 ///
171 /// # Empty vectors
172 ///
173 /// Returns EmptyVector error for empty vectors.
174 pub fn log_softmax(&self) -> Result<Self> {
175 if self.data.is_empty() {
176 return Err(TruenoError::EmptyVector);
177 }
178
179 // OpComplexity::Medium - GPU threshold: >10K elements (multi-pass overhead)
180 #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
181 const GPU_THRESHOLD: usize = usize::MAX; // GPU DISABLED - 4-368x slower, see docs/performance-analysis.md
182
183 // Try GPU first for large vectors
184 #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
185 {
186 if self.data.len() >= GPU_THRESHOLD {
187 use crate::backends::gpu::GpuDevice;
188 if GpuDevice::is_available() {
189 let gpu = GpuDevice::new().map_err(TruenoError::InvalidInput)?;
190 let mut result = vec![0.0; self.data.len()];
191 if gpu.log_softmax(&self.data, &mut result).is_ok() {
192 return Ok(Vector::from_vec(result));
193 }
194 }
195 }
196 }
197
198 // Scalar fallback: Multi-pass log_softmax for numerical stability
199 // Find max for numerical stability
200 let max_val = self.max()?;
201
202 // Compute exp(x - max) for each element
203 let exp_vals: Vec<f32> = self.data.iter().map(|&x| (x - max_val).exp()).collect();
204
205 // Compute log of sum of exponentials
206 let sum_exp: f32 = exp_vals.iter().sum();
207 let log_sum_exp = sum_exp.max(f32::EPSILON).ln();
208
209 // log_softmax(x)[i] = x[i] - max - log_sum_exp
210 let data: Vec<f32> = self.data.iter().map(|&x| x - max_val - log_sum_exp).collect();
211
212 Ok(Vector::from_vec(data))
213 }
214
215 /// ReLU (Rectified Linear Unit) activation function
216 ///
217 /// Computes the element-wise ReLU: max(0, x).
218 /// ReLU is one of the most widely used activation functions in neural networks.
219 ///
220 /// # Formula
221 ///
222 /// ```text
223 /// relu(x)[i] = max(0, x\[i\])
224 /// = x\[i\] if x\[i\] > 0
225 /// = 0 otherwise
226 /// ```
227 ///
228 /// # Properties
229 ///
230 /// - **Non-linearity**: Introduces non-linearity while preserving linearity for positive values
231 /// - **Sparsity**: Produces exactly zero for negative inputs (sparse activations)
232 /// - **Gradient**: Derivative is 1 for positive inputs, 0 for negative (solves vanishing gradient)
233 /// - **Computational efficiency**: Simple max operation, no exponentials
234 ///
235 /// # Applications
236 ///
237 /// - **Deep neural networks**: Default activation for hidden layers
238 /// - **Convolutional networks**: Standard activation in CNNs
239 /// - **Feature learning**: Encourages sparse representations
240 ///
241 /// # Performance
242 ///
243 /// This operation is memory-bound. SIMD provides modest speedups since
244 /// the computation (comparison and selection) is simpler than memory access.
245 ///
246 /// # Errors
247 ///
248 /// Returns `EmptyVector` if the input vector is empty.
249 ///
250 /// # Examples
251 ///
252 /// ```
253 /// use trueno::Vector;
254 ///
255 /// let v = Vector::from_slice(&[-2.0, -1.0, 0.0, 1.0, 2.0]);
256 /// let result = v.relu()?;
257 /// assert_eq!(result.as_slice(), &[0.0, 0.0, 0.0, 1.0, 2.0]);
258 /// # Ok::<(), trueno::TruenoError>(())
259 /// ```
260 pub fn relu(&self) -> Result<Self> {
261 if self.data.is_empty() {
262 return Err(TruenoError::EmptyVector);
263 }
264
265 // OpComplexity::Low - GPU threshold: >100K elements
266 #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
267 const GPU_THRESHOLD: usize = usize::MAX; // GPU DISABLED - 2-800x slower, see docs/performance-analysis.md
268
269 // Try GPU first for large vectors
270 #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
271 {
272 if self.data.len() >= GPU_THRESHOLD {
273 use crate::backends::gpu::GpuDevice;
274 if GpuDevice::is_available() {
275 let gpu = GpuDevice::new().map_err(TruenoError::InvalidInput)?;
276 let mut result = vec![0.0; self.data.len()];
277 if gpu.relu(&self.data, &mut result).is_ok() {
278 return Ok(Vector::from_vec(result));
279 }
280 }
281 }
282 }
283
284 // Uninit: dispatch_unary_op writes every element before any read.
285 let n = self.len();
286 let mut result: Vec<f32> = Vec::with_capacity(n);
287 // SAFETY: Backend activation writes all elements before any read.
288 unsafe {
289 result.set_len(n);
290 }
291
292 // Use parallel processing for very large arrays (reduces TLB pressure and improves cache utilization)
293 #[cfg(feature = "parallel")]
294 {
295 const PARALLEL_THRESHOLD: usize = 500_000; // Increased to avoid overhead at smaller sizes
296 const CHUNK_SIZE: usize = 65536; // 64K elements = 256KB, cache-friendly
297
298 if self.len() >= PARALLEL_THRESHOLD {
299 use rayon::prelude::*;
300
301 self.data.par_chunks(CHUNK_SIZE).zip(result.par_chunks_mut(CHUNK_SIZE)).for_each(
302 |(chunk_in, chunk_out)| {
303 dispatch_unary_op!(self.backend, relu, chunk_in, chunk_out);
304 },
305 );
306
307 return Ok(Vector::from_vec(result)); // Use from_vec to avoid extra copy
308 }
309 }
310
311 // Sequential processing for small arrays or when parallel feature disabled
312 dispatch_unary_op!(self.backend, relu, &self.data, &mut result);
313
314 Ok(Vector::from_vec(result)) // Use from_vec to avoid extra copy
315 }
316
317 /// Sigmoid (logistic) activation function
318 ///
319 /// Computes the element-wise sigmoid: σ(x) = 1 / (1 + e^(-x)).
320 /// Sigmoid is a classic activation function that squashes inputs to the range (0, 1).
321 ///
322 /// # Formula
323 ///
324 /// ```text
325 /// sigmoid(x)[i] = 1 / (1 + exp(-x\[i\]))
326 /// = exp(x\[i\]) / (1 + exp(x\[i\]))
327 /// ```
328 ///
329 /// # Properties
330 ///
331 /// - **Bounded output**: Maps all inputs to (0, 1) range
332 /// - **Smooth**: Infinitely differentiable (C^∞)
333 /// - **Symmetric**: σ(-x) = 1 - σ(x)
334 /// - **Derivative**: σ'(x) = σ(x) * (1 - σ(x))
335 /// - **Interpretable**: Output can be interpreted as probability
336 ///
337 /// # Applications
338 ///
339 /// - **Binary classification**: Final layer for binary output (0 or 1)
340 /// - **Logistic regression**: Traditional ML algorithm
341 /// - **Gating mechanisms**: LSTM/GRU gates (input, forget, output)
342 /// - **Attention mechanisms**: Soft attention weights
343 ///
344 /// # Numerical Considerations
345 ///
346 /// For very large negative inputs (x < -50), exp(-x) overflows to infinity.
347 /// However, sigmoid(x) approaches 0, so we return 0 for numerical stability.
348 /// For very large positive inputs (x > 50), exp(-x) underflows to 0,
349 /// and sigmoid(x) approaches 1.
350 ///
351 /// # Performance
352 ///
353 /// This operation is compute-bound due to the exp() operation. SIMD provides
354 /// modest speedups, but the exponential is the bottleneck.
355 ///
356 /// # Errors
357 ///
358 /// Returns `EmptyVector` if the input vector is empty.
359 ///
360 /// # Examples
361 ///
362 /// ```
363 /// use trueno::Vector;
364 ///
365 /// let v = Vector::from_slice(&[-2.0, 0.0, 2.0]);
366 /// let result = v.sigmoid()?;
367 ///
368 /// // sigmoid(-2) ≈ 0.119, sigmoid(0) = 0.5, sigmoid(2) ≈ 0.881
369 /// assert!((result.as_slice()[0] - 0.119).abs() < 0.001);
370 /// assert!((result.as_slice()[1] - 0.5).abs() < 0.001);
371 /// assert!((result.as_slice()[2] - 0.881).abs() < 0.001);
372 /// # Ok::<(), trueno::TruenoError>(())
373 /// ```
374 pub fn sigmoid(&self) -> Result<Self> {
375 if self.data.is_empty() {
376 return Err(TruenoError::EmptyVector);
377 }
378
379 // OpComplexity::Low - GPU threshold: >100K elements
380 #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
381 const GPU_THRESHOLD: usize = usize::MAX; // GPU DISABLED - 2-800x slower, see docs/performance-analysis.md
382
383 // Try GPU first for large vectors
384 #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
385 {
386 if self.data.len() >= GPU_THRESHOLD {
387 use crate::backends::gpu::GpuDevice;
388 if GpuDevice::is_available() {
389 let gpu = GpuDevice::new().map_err(TruenoError::InvalidInput)?;
390 let mut result = vec![0.0; self.data.len()];
391 if gpu.sigmoid(&self.data, &mut result).is_ok() {
392 return Ok(Vector::from_vec(result));
393 }
394 }
395 }
396 }
397
398 // Uninit: dispatch_unary_op writes every element before any read.
399 let n = self.len();
400 let mut result: Vec<f32> = Vec::with_capacity(n);
401 // SAFETY: Backend activation writes all elements before any read.
402 unsafe {
403 result.set_len(n);
404 }
405
406 // Dispatch to appropriate backend
407 dispatch_unary_op!(self.backend, sigmoid, &self.data, &mut result);
408
409 Ok(Vector::from_vec(result))
410 }
411}