trueno 0.18.0

High-performance SIMD compute library with GPU support, LLM inference engine, and GGUF model loading
//! Activation functions for `Vector<f32>`
//!
//! This module provides neural network activation functions optimized with
//! multi-backend SIMD support (Scalar, SSE2, AVX2, AVX-512, NEON, WASM SIMD).
//!
//! ## Activation Functions
//!
//! - [`softmax`](crate::Vector::softmax): Softmax normalization for classification
//! - [`log_softmax`](crate::Vector::log_softmax): Numerically stable log-softmax
//! - [`relu`](crate::Vector::relu): Rectified Linear Unit
//! - [`sigmoid`](crate::Vector::sigmoid): Logistic sigmoid
//! - [`leaky_relu`](crate::Vector::leaky_relu): Leaky ReLU with configurable slope
//! - [`elu`](crate::Vector::elu): Exponential Linear Unit
//! - [`gelu`](crate::Vector::gelu): Gaussian Error Linear Unit
//! - [`swish`](crate::Vector::swish): Self-gated activation (SiLU)
//! - [`hardswish`](crate::Vector::hardswish): Hardware-friendly piecewise approximation of swish
//! - [`mish`](crate::Vector::mish): Self-regularizing activation
//! - [`selu`](crate::Vector::selu): Scaled Exponential Linear Unit
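//!
//! ## Quick Example
//!
//! A minimal sketch chaining two of the activations above (values illustrative):
//!
//! ```
//! use trueno::Vector;
//!
//! let v = Vector::from_slice(&[-1.0, 0.0, 2.0]);
//! let r = v.relu()?;        // [0.0, 0.0, 2.0]
//! let probs = r.softmax()?; // probability distribution over three values
//! assert!((probs.as_slice().iter().sum::<f32>() - 1.0).abs() < 1e-5);
//! # Ok::<(), trueno::TruenoError>(())
//! ```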

mod advanced;

#[cfg(test)]
mod tests;

use crate::backends::scalar::ScalarBackend;
use crate::backends::VectorBackend;
use crate::vector::Vector;
use crate::{Backend, Result, TruenoError};

/// Backend dispatch macro for unary operations - centralizes platform-specific SIMD dispatch
/// to eliminate code duplication across activation functions.
///
/// # Safety
/// The macro wraps unsafe backend calls internally, so callers don't need unsafe blocks.
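///
/// # Example
///
/// A minimal illustration of the call shape (marked `ignore` because the macro
/// is crate-private); it mirrors the call sites in this module:
///
/// ```ignore
/// // Dispatches the selected backend's `relu` kernel over the input slice,
/// // writing every element of `result`:
/// dispatch_unary_op!(self.backend, relu, &self.data, &mut result);
/// ```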
macro_rules! dispatch_unary_op {
    ($backend:expr, $op:ident, $input:expr, $output:expr) => {{
        #[cfg(target_arch = "x86_64")]
        use crate::backends::{avx2::Avx2Backend, sse2::Sse2Backend};
        // SAFETY: CPU features verified at runtime before backend selection
        unsafe {
            match $backend {
                Backend::Scalar => ScalarBackend::$op($input, $output),
                #[cfg(target_arch = "x86_64")]
                Backend::SSE2 | Backend::AVX => Sse2Backend::$op($input, $output),
                #[cfg(target_arch = "x86_64")]
                Backend::AVX2 | Backend::AVX512 => Avx2Backend::$op($input, $output),
                #[cfg(not(target_arch = "x86_64"))]
                Backend::SSE2 | Backend::AVX | Backend::AVX2 | Backend::AVX512 => {
                    ScalarBackend::$op($input, $output)
                }
                #[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
                Backend::NEON => {
                    use crate::backends::neon::NeonBackend;
                    NeonBackend::$op($input, $output)
                }
                #[cfg(not(any(target_arch = "aarch64", target_arch = "arm")))]
                Backend::NEON => ScalarBackend::$op($input, $output),
                #[cfg(target_arch = "wasm32")]
                Backend::WasmSIMD => {
                    use crate::backends::wasm::WasmBackend;
                    WasmBackend::$op($input, $output)
                }
                #[cfg(not(target_arch = "wasm32"))]
                Backend::WasmSIMD => ScalarBackend::$op($input, $output),
                Backend::GPU | Backend::Auto => ScalarBackend::$op($input, $output),
            }
        }
    }};
}

// Re-export macro for use in advanced submodule
pub(crate) use dispatch_unary_op;

impl Vector<f32> {
    /// Softmax activation function
    ///
    /// Converts a vector of real values into a probability distribution.
    /// Formula: softmax(x)\[i\] = exp(x\[i\] - max(x)) / sum(exp(x\[j\] - max(x)))
    ///
    /// Uses the numerically stable version with max subtraction to prevent overflow.
    /// The output is a probability distribution: all values lie in \[0, 1\] and sum to 1.
    ///
    /// This is the standard activation function for multi-class classification in neural networks.
    ///
    /// # Examples
    ///
    /// ```
    /// use trueno::Vector;
    ///
    /// let logits = Vector::from_slice(&[1.0, 2.0, 3.0]);
    /// let probs = logits.softmax()?;
    ///
    /// // Verify sum ≈ 1
    /// let sum: f32 = probs.as_slice().iter().sum();
    /// assert!((sum - 1.0).abs() < 1e-5);
    ///
    /// // Verify all values in [0, 1]
    /// for &p in probs.as_slice() {
    ///     assert!(p >= 0.0 && p <= 1.0);
    /// }
    /// # Ok::<(), trueno::TruenoError>(())
    /// ```
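    ///
    /// Large logits do not overflow, since the maximum is subtracted before
    /// exponentiation (a minimal stability check):
    ///
    /// ```
    /// use trueno::Vector;
    ///
    /// let logits = Vector::from_slice(&[1000.0, 1001.0, 1002.0]);
    /// let probs = logits.softmax()?;
    /// assert!(probs.as_slice().iter().all(|p| p.is_finite()));
    /// # Ok::<(), trueno::TruenoError>(())
    /// ```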
    ///
    /// # Empty vectors
    ///
    /// Returns the `EmptyVector` error for empty vectors (softmax is undefined on an empty input).
    pub fn softmax(&self) -> Result<Self> {
        if self.data.is_empty() {
            return Err(TruenoError::EmptyVector);
        }

        // OpComplexity::Medium - GPU threshold: >10K elements (multi-pass overhead)
        #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
        const GPU_THRESHOLD: usize = usize::MAX; // GPU DISABLED - 4-368x slower, see docs/performance-analysis.md

        // Try GPU first for large vectors
        #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
        {
            if self.data.len() >= GPU_THRESHOLD {
                use crate::backends::gpu::GpuDevice;
                if GpuDevice::is_available() {
                    let gpu = GpuDevice::new().map_err(TruenoError::InvalidInput)?;
                    let mut result = vec![0.0; self.data.len()];
                    if gpu.softmax(&self.data, &mut result).is_ok() {
                        return Ok(Vector::from_vec(result));
                    }
                }
            }
        }

        // Scalar fallback: Multi-pass softmax for numerical stability
        // Find max for numerical stability (prevents overflow in exp)
        let max_val = self.max()?;

        // Compute exp(x - max) for each element
        let exp_vals: Vec<f32> = self.data.iter().map(|&x| (x - max_val).exp()).collect();

        // Compute sum of exponentials
        let sum_exp: f32 = exp_vals.iter().sum();

        // Normalize by sum (guard against sum=0 from underflow)
        let safe_sum = sum_exp.max(f32::EPSILON);
        let data: Vec<f32> = exp_vals.iter().map(|&e| e / safe_sum).collect();

        Ok(Vector::from_vec(data))
    }

    /// Log-softmax activation function
    ///
    /// Computes the logarithm of the softmax function in a numerically stable way.
    /// Formula: log_softmax(x)\[i\] = x\[i\] - max(x) - log(sum(exp(x\[j\] - max(x))))
    ///
    /// This is more numerically stable than computing log(softmax(x)) and is commonly
    /// used in neural networks for computing cross-entropy loss.
    ///
    /// # Examples
    ///
    /// ```
    /// use trueno::Vector;
    ///
    /// let logits = Vector::from_slice(&[1.0, 2.0, 3.0]);
    /// let log_probs = logits.log_softmax()?;
    ///
    /// // Verify exp(log_softmax) = softmax
    /// let probs_from_log: Vec<f32> = log_probs.as_slice().iter().map(|&x| x.exp()).collect();
    /// let sum: f32 = probs_from_log.iter().sum();
    /// assert!((sum - 1.0).abs() < 1e-5);
    /// # Ok::<(), trueno::TruenoError>(())
    /// ```
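    ///
    /// Shift invariance holds up to rounding: adding a constant to every logit
    /// leaves the result unchanged (a minimal sketch):
    ///
    /// ```
    /// use trueno::Vector;
    ///
    /// let a = Vector::from_slice(&[1.0, 2.0, 3.0]).log_softmax()?;
    /// let b = Vector::from_slice(&[11.0, 12.0, 13.0]).log_softmax()?;
    /// for (x, y) in a.as_slice().iter().zip(b.as_slice()) {
    ///     assert!((x - y).abs() < 1e-5);
    /// }
    /// # Ok::<(), trueno::TruenoError>(())
    /// ```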
    ///
    /// # Empty vectors
    ///
    /// Returns the `EmptyVector` error for empty vectors.
    pub fn log_softmax(&self) -> Result<Self> {
        if self.data.is_empty() {
            return Err(TruenoError::EmptyVector);
        }

        // OpComplexity::Medium - GPU threshold: >10K elements (multi-pass overhead)
        #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
        const GPU_THRESHOLD: usize = usize::MAX; // GPU DISABLED - 4-368x slower, see docs/performance-analysis.md

        // Try GPU first for large vectors
        #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
        {
            if self.data.len() >= GPU_THRESHOLD {
                use crate::backends::gpu::GpuDevice;
                if GpuDevice::is_available() {
                    let gpu = GpuDevice::new().map_err(TruenoError::InvalidInput)?;
                    let mut result = vec![0.0; self.data.len()];
                    if gpu.log_softmax(&self.data, &mut result).is_ok() {
                        return Ok(Vector::from_vec(result));
                    }
                }
            }
        }

        // Scalar fallback: Multi-pass log_softmax for numerical stability
        // Find max for numerical stability
        let max_val = self.max()?;

        // Compute exp(x - max) for each element
        let exp_vals: Vec<f32> = self.data.iter().map(|&x| (x - max_val).exp()).collect();

        // Compute log of sum of exponentials
        let sum_exp: f32 = exp_vals.iter().sum();
        let log_sum_exp = sum_exp.max(f32::EPSILON).ln();

        // log_softmax(x)[i] = x[i] - max - log_sum_exp
        let data: Vec<f32> = self.data.iter().map(|&x| x - max_val - log_sum_exp).collect();

        Ok(Vector::from_vec(data))
    }

    /// ReLU (Rectified Linear Unit) activation function
    ///
    /// Computes the element-wise ReLU: max(0, x).
    /// ReLU is one of the most widely used activation functions in neural networks.
    ///
    /// # Formula
    ///
    /// ```text
    /// relu(x)[i] = max(0, x[i])
    ///            = x[i]  if x[i] > 0
    ///            = 0     otherwise
    /// ```
    ///
    /// # Properties
    ///
    /// - **Non-linearity**: Introduces non-linearity while preserving linearity for positive values
    /// - **Sparsity**: Produces exactly zero for negative inputs (sparse activations)
    /// - **Gradient**: Derivative is 1 for positive inputs, 0 for negative (solves vanishing gradient)
    /// - **Computational efficiency**: Simple max operation, no exponentials
    ///
    /// # Applications
    ///
    /// - **Deep neural networks**: Default activation for hidden layers
    /// - **Convolutional networks**: Standard activation in CNNs
    /// - **Feature learning**: Encourages sparse representations
    ///
    /// # Performance
    ///
    /// This operation is memory-bound: the comparison and select are far cheaper
    /// than the loads and stores, so SIMD provides only modest speedups.
    ///
    /// # Errors
    ///
    /// Returns `EmptyVector` if the input vector is empty.
    ///
    /// # Examples
    ///
    /// ```
    /// use trueno::Vector;
    ///
    /// let v = Vector::from_slice(&[-2.0, -1.0, 0.0, 1.0, 2.0]);
    /// let result = v.relu()?;
    /// assert_eq!(result.as_slice(), &[0.0, 0.0, 0.0, 1.0, 2.0]);
    /// # Ok::<(), trueno::TruenoError>(())
    /// ```
    pub fn relu(&self) -> Result<Self> {
        if self.data.is_empty() {
            return Err(TruenoError::EmptyVector);
        }

        // OpComplexity::Low - GPU threshold: >100K elements
        #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
        const GPU_THRESHOLD: usize = usize::MAX; // GPU DISABLED - 2-800x slower, see docs/performance-analysis.md

        // Try GPU first for large vectors
        #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
        {
            if self.data.len() >= GPU_THRESHOLD {
                use crate::backends::gpu::GpuDevice;
                if GpuDevice::is_available() {
                    let gpu = GpuDevice::new().map_err(TruenoError::InvalidInput)?;
                    let mut result = vec![0.0; self.data.len()];
                    if gpu.relu(&self.data, &mut result).is_ok() {
                        return Ok(Vector::from_vec(result));
                    }
                }
            }
        }

        // Uninit: dispatch_unary_op writes every element before any read.
        let n = self.len();
        let mut result: Vec<f32> = Vec::with_capacity(n);
        // SAFETY: Backend activation writes all elements before any read.
        unsafe {
            result.set_len(n);
        }

        // Use parallel processing for very large arrays (reduces TLB pressure and improves cache utilization)
        #[cfg(feature = "parallel")]
        {
            const PARALLEL_THRESHOLD: usize = 500_000; // Increased to avoid overhead at smaller sizes
            const CHUNK_SIZE: usize = 65536; // 64K elements = 256KB, cache-friendly

            if self.len() >= PARALLEL_THRESHOLD {
                use rayon::prelude::*;

                self.data.par_chunks(CHUNK_SIZE).zip(result.par_chunks_mut(CHUNK_SIZE)).for_each(
                    |(chunk_in, chunk_out)| {
                        dispatch_unary_op!(self.backend, relu, chunk_in, chunk_out);
                    },
                );

                return Ok(Vector::from_vec(result)); // Use from_vec to avoid extra copy
            }
        }

        // Sequential processing for small arrays or when parallel feature disabled
        dispatch_unary_op!(self.backend, relu, &self.data, &mut result);

        Ok(Vector::from_vec(result)) // Use from_vec to avoid extra copy
    }

    /// Sigmoid (logistic) activation function
    ///
    /// Computes the element-wise sigmoid: σ(x) = 1 / (1 + e^(-x)).
    /// Sigmoid is a classic activation function that squashes inputs to the range (0, 1).
    ///
    /// # Formula
    ///
    /// ```text
    /// sigmoid(x)[i] = 1 / (1 + exp(-x[i]))
    ///               = exp(x[i]) / (1 + exp(x[i]))
    /// ```
    ///
    /// # Properties
    ///
    /// - **Bounded output**: Maps all inputs to (0, 1) range
    /// - **Smooth**: Infinitely differentiable (C^∞)
    /// - **Symmetric**: σ(-x) = 1 - σ(x)
    /// - **Derivative**: σ'(x) = σ(x) * (1 - σ(x))
    /// - **Interpretable**: Output can be interpreted as probability
    ///
    /// # Applications
    ///
    /// - **Binary classification**: Final layer for binary output (0 or 1)
    /// - **Logistic regression**: Traditional ML algorithm
    /// - **Gating mechanisms**: LSTM/GRU gates (input, forget, output)
    /// - **Attention mechanisms**: Soft attention weights
    ///
    /// # Numerical Considerations
    ///
    /// For very large negative inputs (x < -50), exp(-x) overflows to infinity;
    /// since sigmoid(x) approaches 0 there, the implementation returns 0, which
    /// is the correctly rounded result. For very large positive inputs (x > 50),
    /// exp(-x) underflows to 0 and sigmoid(x) saturates at 1.
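    ///
    /// A quick saturation check (tolerances are deliberately loose, since SIMD
    /// backends may approximate `exp`):
    ///
    /// ```
    /// use trueno::Vector;
    ///
    /// let v = Vector::from_slice(&[-100.0, 100.0]);
    /// let s = v.sigmoid()?;
    /// assert!(s.as_slice()[0].abs() < 1e-3);
    /// assert!((s.as_slice()[1] - 1.0).abs() < 1e-3);
    /// # Ok::<(), trueno::TruenoError>(())
    /// ```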
    ///
    /// # Performance
    ///
    /// This operation is compute-bound: the `exp()` evaluation dominates, so
    /// SIMD provides only modest speedups.
    ///
    /// # Errors
    ///
    /// Returns `EmptyVector` if the input vector is empty.
    ///
    /// # Examples
    ///
    /// ```
    /// use trueno::Vector;
    ///
    /// let v = Vector::from_slice(&[-2.0, 0.0, 2.0]);
    /// let result = v.sigmoid()?;
    ///
    /// // sigmoid(-2) ≈ 0.119, sigmoid(0) = 0.5, sigmoid(2) ≈ 0.881
    /// assert!((result.as_slice()[0] - 0.119).abs() < 0.001);
    /// assert!((result.as_slice()[1] - 0.5).abs() < 0.001);
    /// assert!((result.as_slice()[2] - 0.881).abs() < 0.001);
    /// # Ok::<(), trueno::TruenoError>(())
    /// ```
    pub fn sigmoid(&self) -> Result<Self> {
        if self.data.is_empty() {
            return Err(TruenoError::EmptyVector);
        }

        // OpComplexity::Low - GPU threshold: >100K elements
        #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
        const GPU_THRESHOLD: usize = usize::MAX; // GPU DISABLED - 2-800x slower, see docs/performance-analysis.md

        // Try GPU first for large vectors
        #[cfg(all(feature = "gpu", not(target_arch = "wasm32")))]
        {
            if self.data.len() >= GPU_THRESHOLD {
                use crate::backends::gpu::GpuDevice;
                if GpuDevice::is_available() {
                    let gpu = GpuDevice::new().map_err(TruenoError::InvalidInput)?;
                    let mut result = vec![0.0; self.data.len()];
                    if gpu.sigmoid(&self.data, &mut result).is_ok() {
                        return Ok(Vector::from_vec(result));
                    }
                }
            }
        }

        // Uninit: dispatch_unary_op writes every element before any read.
        let n = self.len();
        let mut result: Vec<f32> = Vec::with_capacity(n);
        // SAFETY: Backend activation writes all elements before any read.
        unsafe {
            result.set_len(n);
        }

        // Dispatch to appropriate backend
        dispatch_unary_op!(self.backend, sigmoid, &self.data, &mut result);

        Ok(Vector::from_vec(result))
    }
}