archx 3.0.1

High-performance CPU/GPU adaptive optimization library with SIMD and Multithreading
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
/// Scalar implementation of the add operation.
/// 
/// WHY: This serves as the ultimate fallback for any CPU that does not 
/// support SIMD instructions or for which an optimized path hasn't been written.
/// It uses a simple loop which modern compilers can often auto-vectorize.
pub fn add_impl(a: &[f32], b: &[f32], out: &mut [f32]) {
    // Basic bounds check or assuming slices are same length for v0.1.
    // In a real library, we'd handle length mismatches more robustly.
    let len = a.len().min(b.len()).min(out.len());
    
    for i in 0..len {
        out[i] = a[i] + b[i];
    }
}