trueno/vector/ops/arithmetic/
mod.rs

1//! Arithmetic operations for Vector<f32>
2//!
3//! This module provides element-wise arithmetic operations:
4//! - Basic: `add`, `sub`, `mul`, `div`
5//! - Scalar: `scale`
6//! - Fused: `fma` (fused multiply-add)
7
8#[cfg(target_arch = "x86_64")]
9use crate::backends::avx2::Avx2Backend;
10#[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
11use crate::backends::neon::NeonBackend;
12use crate::backends::scalar::ScalarBackend;
13#[cfg(target_arch = "x86_64")]
14use crate::backends::sse2::Sse2Backend;
15#[cfg(target_arch = "wasm32")]
16use crate::backends::wasm::WasmBackend;
17use crate::backends::VectorBackend;
18use crate::vector::Vector;
19use crate::{dispatch_binary_op, Backend, Result, TruenoError};
20
21impl Vector<f32> {
22    /// Element-wise addition
23    ///
24    /// # Performance
25    ///
26    /// Auto-selects the best available backend:
27    /// - **AVX2**: ~4x faster than scalar for 1K+ elements
28    /// - **GPU**: ~50x faster than scalar for 10M+ elements
29    ///
30    /// # Examples
31    ///
32    /// ```
33    /// use trueno::Vector;
34    ///
35    /// let a = Vector::from_slice(&[1.0, 2.0, 3.0]);
36    /// let b = Vector::from_slice(&[4.0, 5.0, 6.0]);
37    /// let result = a.add(&b)?;
38    ///
39    /// assert_eq!(result.as_slice(), &[5.0, 7.0, 9.0]);
40    /// # Ok::<(), trueno::TruenoError>(())
41    /// ```
42    ///
43    /// # Errors
44    ///
45    /// Returns [`TruenoError::SizeMismatch`] if vectors have different lengths.
46    pub fn add(&self, other: &Self) -> Result<Self> {
47        if self.len() != other.len() {
48            return Err(TruenoError::SizeMismatch { expected: self.len(), actual: other.len() });
49        }
50
51        // Uninit allocation: avoids the zero-fill cost (70µs+ at 1M elements)
52        // since every element will be overwritten by dispatch_binary_op below.
53        // SAFETY: dispatch_binary_op!(..., add, a, b, out) writes to EVERY element
54        // of `out` (it's an element-wise add). No reads before writes.
55        let n = self.len();
56        let mut result: Vec<f32> = Vec::with_capacity(n);
57        unsafe {
58            result.set_len(n);
59        }
60
61        // Use parallel processing for large arrays
62        #[cfg(feature = "parallel")]
63        {
64            const PARALLEL_THRESHOLD: usize = 100_000; // Threshold for element-wise ops
65            const CHUNK_SIZE: usize = 65536; // 64K elements = 256KB, cache-friendly
66
67            if self.len() >= PARALLEL_THRESHOLD {
68                use rayon::prelude::*;
69
70                self.data
71                    .par_chunks(CHUNK_SIZE)
72                    .zip(other.data.par_chunks(CHUNK_SIZE))
73                    .zip(result.par_chunks_mut(CHUNK_SIZE))
74                    .for_each(|((chunk_a, chunk_b), chunk_out)| {
75                        dispatch_binary_op!(self.backend, add, chunk_a, chunk_b, chunk_out);
76                    });
77
78                return Ok(Self { data: result, backend: self.backend });
79            }
80        }
81
82        dispatch_binary_op!(self.backend, add, &self.data, &other.data, &mut result);
83
84        Ok(Self { data: result, backend: self.backend })
85    }
86
87    /// Element-wise subtraction
88    ///
89    /// # Performance
90    ///
91    /// Auto-selects the best available backend:
92    /// - **AVX2**: ~4x faster than scalar for 1K+ elements
93    /// - **GPU**: ~50x faster than scalar for 10M+ elements
94    ///
95    /// # Examples
96    ///
97    /// ```
98    /// use trueno::Vector;
99    ///
100    /// let a = Vector::from_slice(&[5.0, 7.0, 9.0]);
101    /// let b = Vector::from_slice(&[1.0, 2.0, 3.0]);
102    /// let result = a.sub(&b)?;
103    ///
104    /// assert_eq!(result.as_slice(), &[4.0, 5.0, 6.0]);
105    /// # Ok::<(), trueno::TruenoError>(())
106    /// ```
107    ///
108    /// # Errors
109    ///
110    /// Returns [`TruenoError::SizeMismatch`] if vectors have different lengths.
111    pub fn sub(&self, other: &Self) -> Result<Self> {
112        if self.len() != other.len() {
113            return Err(TruenoError::SizeMismatch { expected: self.len(), actual: other.len() });
114        }
115
116        // Uninit allocation: skip zero-fill since dispatch_binary_op writes all elements.
117        let n = self.len();
118        let mut result: Vec<f32> = Vec::with_capacity(n);
119        // SAFETY: Every element is written before any read (by element-wise op below).
120        unsafe {
121            result.set_len(n);
122        }
123
124        // Use parallel processing for large arrays
125        #[cfg(feature = "parallel")]
126        {
127            const PARALLEL_THRESHOLD: usize = 100_000;
128            const CHUNK_SIZE: usize = 65536;
129
130            if self.len() >= PARALLEL_THRESHOLD {
131                use rayon::prelude::*;
132
133                self.data
134                    .par_chunks(CHUNK_SIZE)
135                    .zip(other.data.par_chunks(CHUNK_SIZE))
136                    .zip(result.par_chunks_mut(CHUNK_SIZE))
137                    .for_each(|((chunk_a, chunk_b), chunk_out)| {
138                        dispatch_binary_op!(self.backend, sub, chunk_a, chunk_b, chunk_out);
139                    });
140
141                return Ok(Self { data: result, backend: self.backend });
142            }
143        }
144
145        dispatch_binary_op!(self.backend, sub, &self.data, &other.data, &mut result);
146
147        Ok(Self { data: result, backend: self.backend })
148    }
149
150    /// Element-wise multiplication
151    ///
152    /// # Examples
153    ///
154    /// ```
155    /// use trueno::Vector;
156    ///
157    /// let a = Vector::from_slice(&[2.0, 3.0, 4.0]);
158    /// let b = Vector::from_slice(&[5.0, 6.0, 7.0]);
159    /// let result = a.mul(&b)?;
160    ///
161    /// assert_eq!(result.as_slice(), &[10.0, 18.0, 28.0]);
162    /// # Ok::<(), trueno::TruenoError>(())
163    /// ```
164    pub fn mul(&self, other: &Self) -> Result<Self> {
165        if self.len() != other.len() {
166            return Err(TruenoError::SizeMismatch { expected: self.len(), actual: other.len() });
167        }
168
169        // Uninit allocation: skip zero-fill since dispatch_binary_op writes all elements.
170        let n = self.len();
171        let mut result: Vec<f32> = Vec::with_capacity(n);
172        // SAFETY: Every element is written before any read (by element-wise op below).
173        unsafe {
174            result.set_len(n);
175        }
176
177        // Use parallel processing for large arrays
178        #[cfg(feature = "parallel")]
179        {
180            const PARALLEL_THRESHOLD: usize = 100_000;
181            const CHUNK_SIZE: usize = 65536;
182
183            if self.len() >= PARALLEL_THRESHOLD {
184                use rayon::prelude::*;
185
186                self.data
187                    .par_chunks(CHUNK_SIZE)
188                    .zip(other.data.par_chunks(CHUNK_SIZE))
189                    .zip(result.par_chunks_mut(CHUNK_SIZE))
190                    .for_each(|((chunk_a, chunk_b), chunk_out)| {
191                        dispatch_binary_op!(self.backend, mul, chunk_a, chunk_b, chunk_out);
192                    });
193
194                return Ok(Self { data: result, backend: self.backend });
195            }
196        }
197
198        dispatch_binary_op!(self.backend, mul, &self.data, &other.data, &mut result);
199
200        Ok(Self { data: result, backend: self.backend })
201    }
202
203    /// Element-wise division
204    ///
205    /// # Examples
206    ///
207    /// ```
208    /// use trueno::Vector;
209    ///
210    /// let a = Vector::from_slice(&[10.0, 20.0, 30.0]);
211    /// let b = Vector::from_slice(&[2.0, 4.0, 5.0]);
212    /// let result = a.div(&b)?;
213    ///
214    /// assert_eq!(result.as_slice(), &[5.0, 5.0, 6.0]);
215    /// # Ok::<(), trueno::TruenoError>(())
216    /// ```
217    pub fn div(&self, other: &Self) -> Result<Self> {
218        if self.len() != other.len() {
219            return Err(TruenoError::SizeMismatch { expected: self.len(), actual: other.len() });
220        }
221
222        // Uninit allocation: skip zero-fill since dispatch_binary_op writes all elements.
223        let n = self.len();
224        let mut result: Vec<f32> = Vec::with_capacity(n);
225        // SAFETY: Every element is written before any read (by element-wise op below).
226        unsafe {
227            result.set_len(n);
228        }
229
230        // Use parallel processing for large arrays
231        #[cfg(feature = "parallel")]
232        {
233            const PARALLEL_THRESHOLD: usize = 100_000;
234            const CHUNK_SIZE: usize = 65536;
235
236            if self.len() >= PARALLEL_THRESHOLD {
237                use rayon::prelude::*;
238
239                self.data
240                    .par_chunks(CHUNK_SIZE)
241                    .zip(other.data.par_chunks(CHUNK_SIZE))
242                    .zip(result.par_chunks_mut(CHUNK_SIZE))
243                    .for_each(|((chunk_a, chunk_b), chunk_out)| {
244                        dispatch_binary_op!(self.backend, div, chunk_a, chunk_b, chunk_out);
245                    });
246
247                return Ok(Self { data: result, backend: self.backend });
248            }
249        }
250
251        dispatch_binary_op!(self.backend, div, &self.data, &other.data, &mut result);
252
253        Ok(Self { data: result, backend: self.backend })
254    }
255
256    /// Scalar multiplication (scale all elements by a scalar value)
257    ///
258    /// Returns a new vector where each element is multiplied by the scalar.
259    ///
260    /// # Examples
261    ///
262    /// ```
263    /// use trueno::Vector;
264    ///
265    /// let v = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0]);
266    /// let result = v.scale(2.0)?;
267    ///
268    /// assert_eq!(result.as_slice(), &[2.0, 4.0, 6.0, 8.0]);
269    /// # Ok::<(), trueno::TruenoError>(())
270    /// ```
271    ///
272    /// # Scaling by Zero
273    ///
274    /// ```
275    /// use trueno::Vector;
276    ///
277    /// let v = Vector::from_slice(&[1.0, 2.0, 3.0]);
278    /// let result = v.scale(0.0)?;
279    /// assert_eq!(result.as_slice(), &[0.0, 0.0, 0.0]);
280    /// # Ok::<(), trueno::TruenoError>(())
281    /// ```
282    ///
283    /// # Negative Scaling
284    ///
285    /// ```
286    /// use trueno::Vector;
287    ///
288    /// let v = Vector::from_slice(&[1.0, -2.0, 3.0]);
289    /// let result = v.scale(-2.0)?;
290    /// assert_eq!(result.as_slice(), &[-2.0, 4.0, -6.0]);
291    /// # Ok::<(), trueno::TruenoError>(())
292    /// ```
293    pub fn scale(&self, scalar: f32) -> Result<Vector<f32>> {
294        // Uninit allocation: backend writes all elements.
295        let n = self.len();
296        let mut result_data: Vec<f32> = Vec::with_capacity(n);
297        // SAFETY: backend scale() writes every element before any read.
298        unsafe {
299            result_data.set_len(n);
300        }
301
302        if !self.data.is_empty() {
303            // SAFETY: Unsafe block delegates to backend implementation which maintains safety invariants
304            unsafe {
305                match self.backend {
306                    Backend::Scalar => ScalarBackend::scale(&self.data, scalar, &mut result_data),
307                    #[cfg(target_arch = "x86_64")]
308                    Backend::SSE2 | Backend::AVX => {
309                        Sse2Backend::scale(&self.data, scalar, &mut result_data)
310                    }
311                    #[cfg(target_arch = "x86_64")]
312                    Backend::AVX2 | Backend::AVX512 => {
313                        Avx2Backend::scale(&self.data, scalar, &mut result_data)
314                    }
315                    #[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
316                    Backend::NEON => NeonBackend::scale(&self.data, scalar, &mut result_data),
317                    #[cfg(target_arch = "wasm32")]
318                    Backend::WasmSIMD => WasmBackend::scale(&self.data, scalar, &mut result_data),
319                    Backend::GPU => return Err(TruenoError::UnsupportedBackend(Backend::GPU)),
320                    Backend::Auto => {
321                        // Auto should have been resolved at creation time
322                        return Err(TruenoError::UnsupportedBackend(Backend::Auto));
323                    }
324                    #[cfg(not(target_arch = "x86_64"))]
325                    Backend::SSE2 | Backend::AVX | Backend::AVX2 | Backend::AVX512 => {
326                        ScalarBackend::scale(&self.data, scalar, &mut result_data)
327                    }
328                    #[cfg(not(any(target_arch = "aarch64", target_arch = "arm")))]
329                    Backend::NEON => ScalarBackend::scale(&self.data, scalar, &mut result_data),
330                    #[cfg(not(target_arch = "wasm32"))]
331                    Backend::WasmSIMD => ScalarBackend::scale(&self.data, scalar, &mut result_data),
332                }
333            }
334        }
335
336        Ok(Vector { data: result_data, backend: self.backend })
337    }
338
339    /// Fused multiply-add: result\[i\] = self\[i\] * b\[i\] + c\[i\]
340    ///
341    /// Computes element-wise fused multiply-add operation. On hardware with FMA support
342    /// (AVX2, NEON), this is a single instruction with better performance and numerical
343    /// accuracy (no intermediate rounding). On platforms without FMA (SSE2, WASM), uses
344    /// separate multiply and add operations.
345    ///
346    /// # Arguments
347    ///
348    /// * `b` - The second vector to multiply with
349    /// * `c` - The vector to add to the product
350    ///
351    /// # Returns
352    ///
353    /// A new vector where each element is `self\[i\] * b\[i\] + c\[i\]`
354    ///
355    /// # Errors
356    ///
357    /// Returns `SizeMismatch` if vector lengths don't match
358    ///
359    /// # Examples
360    ///
361    /// ```
362    /// use trueno::Vector;
363    ///
364    /// let a = Vector::from_slice(&[2.0, 3.0, 4.0]);
365    /// let b = Vector::from_slice(&[5.0, 6.0, 7.0]);
366    /// let c = Vector::from_slice(&[1.0, 2.0, 3.0]);
367    /// let result = a.fma(&b, &c)?;
368    /// assert_eq!(result.as_slice(), &[11.0, 20.0, 31.0]);  // [2*5+1, 3*6+2, 4*7+3]
369    /// # Ok::<(), trueno::TruenoError>(())
370    /// ```
371    ///
372    /// # Use Cases
373    ///
374    /// - Neural networks: matrix multiplication, backpropagation
375    /// - Scientific computing: polynomial evaluation, numerical integration
376    /// - Graphics: transformation matrices, shader computations
377    /// - Physics simulations: force calculations, particle systems
378    pub fn fma(&self, b: &Vector<f32>, c: &Vector<f32>) -> Result<Vector<f32>> {
379        if self.len() != b.len() {
380            return Err(TruenoError::SizeMismatch { expected: self.len(), actual: b.len() });
381        }
382        if self.len() != c.len() {
383            return Err(TruenoError::SizeMismatch { expected: self.len(), actual: c.len() });
384        }
385
386        // Uninit allocation: backend fma writes all elements.
387        let n = self.len();
388        let mut result_data: Vec<f32> = Vec::with_capacity(n);
389        // SAFETY: backend fma() writes every element before any read.
390        unsafe {
391            result_data.set_len(n);
392        }
393
394        if !self.data.is_empty() {
395            // SAFETY: Unsafe block delegates to backend implementation which maintains safety invariants
396            unsafe {
397                match self.backend {
398                    Backend::Scalar => {
399                        ScalarBackend::fma(&self.data, &b.data, &c.data, &mut result_data)
400                    }
401                    #[cfg(target_arch = "x86_64")]
402                    Backend::SSE2 | Backend::AVX => {
403                        Sse2Backend::fma(&self.data, &b.data, &c.data, &mut result_data)
404                    }
405                    #[cfg(target_arch = "x86_64")]
406                    Backend::AVX2 | Backend::AVX512 => {
407                        Avx2Backend::fma(&self.data, &b.data, &c.data, &mut result_data)
408                    }
409                    #[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
410                    Backend::NEON => {
411                        NeonBackend::fma(&self.data, &b.data, &c.data, &mut result_data)
412                    }
413                    #[cfg(target_arch = "wasm32")]
414                    Backend::WasmSIMD => {
415                        WasmBackend::fma(&self.data, &b.data, &c.data, &mut result_data)
416                    }
417                    Backend::GPU => return Err(TruenoError::UnsupportedBackend(Backend::GPU)),
418                    Backend::Auto => {
419                        return Err(TruenoError::UnsupportedBackend(Backend::Auto));
420                    }
421                    #[cfg(not(target_arch = "x86_64"))]
422                    Backend::SSE2 | Backend::AVX | Backend::AVX2 | Backend::AVX512 => {
423                        ScalarBackend::fma(&self.data, &b.data, &c.data, &mut result_data)
424                    }
425                    #[cfg(not(any(target_arch = "aarch64", target_arch = "arm")))]
426                    Backend::NEON => {
427                        ScalarBackend::fma(&self.data, &b.data, &c.data, &mut result_data)
428                    }
429                    #[cfg(not(target_arch = "wasm32"))]
430                    Backend::WasmSIMD => {
431                        ScalarBackend::fma(&self.data, &b.data, &c.data, &mut result_data)
432                    }
433                }
434            }
435        }
436
437        Ok(Vector { data: result_data, backend: self.backend })
438    }
439}
440
441#[cfg(test)]
442mod tests;
trueno/vector/ops/arithmetic/mod.rs

trueno/vector/ops/arithmetic/
mod.rs