use rayon::prelude::*;
use wide::{f32x8, f64x4, i32x8};
/// Stateless namespace type for the slice kernels defined in this file.
///
/// `SimdOps` is a unit struct and is never instantiated by the code here; every
/// operation is an associated function that is called as `SimdOps::name(...)`.
pub struct SimdOps;
impl SimdOps {
#[inline]
pub fn saxpy(a: f32, x: &[f32], y: &mut [f32]) {
let n = x.len().min(y.len());
let a_vec = f32x8::splat(a);
let chunks = n / 8;
let remainder = n % 8;
for i in 0..chunks {
let offset = i * 8;
let x_vec = f32x8::new([
x[offset],
x[offset + 1],
x[offset + 2],
x[offset + 3],
x[offset + 4],
x[offset + 5],
x[offset + 6],
x[offset + 7],
]);
let y_vec = f32x8::new([
y[offset],
y[offset + 1],
y[offset + 2],
y[offset + 3],
y[offset + 4],
y[offset + 5],
y[offset + 6],
y[offset + 7],
]);
let result = a_vec * x_vec + y_vec;
let arr: [f32; 8] = result.into();
y[offset..offset + 8].copy_from_slice(&arr);
}
let tail_start = chunks * 8;
for i in 0..remainder {
y[tail_start + i] += a * x[tail_start + i];
}
}
#[inline]
pub fn daxpy(a: f64, x: &[f64], y: &mut [f64]) {
let n = x.len().min(y.len());
let a_vec = f64x4::splat(a);
let chunks = n / 4;
let remainder = n % 4;
for i in 0..chunks {
let offset = i * 4;
let x_vec = f64x4::new([x[offset], x[offset + 1], x[offset + 2], x[offset + 3]]);
let y_vec = f64x4::new([y[offset], y[offset + 1], y[offset + 2], y[offset + 3]]);
let result = a_vec * x_vec + y_vec;
let arr: [f64; 4] = result.into();
y[offset..offset + 4].copy_from_slice(&arr);
}
let tail_start = chunks * 4;
for i in 0..remainder {
y[tail_start + i] += a * x[tail_start + i];
}
}
#[inline]
pub fn add_f32(x: &[f32], y: &[f32], z: &mut [f32]) {
let n = x.len().min(y.len()).min(z.len());
let chunks = n / 8;
let remainder = n % 8;
for i in 0..chunks {
let offset = i * 8;
let x_vec = f32x8::new([
x[offset],
x[offset + 1],
x[offset + 2],
x[offset + 3],
x[offset + 4],
x[offset + 5],
x[offset + 6],
x[offset + 7],
]);
let y_vec = f32x8::new([
y[offset],
y[offset + 1],
y[offset + 2],
y[offset + 3],
y[offset + 4],
y[offset + 5],
y[offset + 6],
y[offset + 7],
]);
let result = x_vec + y_vec;
let arr: [f32; 8] = result.into();
z[offset..offset + 8].copy_from_slice(&arr);
}
let tail_start = chunks * 8;
for i in 0..remainder {
z[tail_start + i] = x[tail_start + i] + y[tail_start + i];
}
}
#[inline]
pub fn sub_f32(x: &[f32], y: &[f32], z: &mut [f32]) {
let n = x.len().min(y.len()).min(z.len());
let chunks = n / 8;
let remainder = n % 8;
for i in 0..chunks {
let offset = i * 8;
let x_vec = f32x8::new([
x[offset],
x[offset + 1],
x[offset + 2],
x[offset + 3],
x[offset + 4],
x[offset + 5],
x[offset + 6],
x[offset + 7],
]);
let y_vec = f32x8::new([
y[offset],
y[offset + 1],
y[offset + 2],
y[offset + 3],
y[offset + 4],
y[offset + 5],
y[offset + 6],
y[offset + 7],
]);
let result = x_vec - y_vec;
let arr: [f32; 8] = result.into();
z[offset..offset + 8].copy_from_slice(&arr);
}
let tail_start = chunks * 8;
for i in 0..remainder {
z[tail_start + i] = x[tail_start + i] - y[tail_start + i];
}
}
#[inline]
pub fn mul_f32(x: &[f32], y: &[f32], z: &mut [f32]) {
let n = x.len().min(y.len()).min(z.len());
let chunks = n / 8;
let remainder = n % 8;
for i in 0..chunks {
let offset = i * 8;
let x_vec = f32x8::new([
x[offset],
x[offset + 1],
x[offset + 2],
x[offset + 3],
x[offset + 4],
x[offset + 5],
x[offset + 6],
x[offset + 7],
]);
let y_vec = f32x8::new([
y[offset],
y[offset + 1],
y[offset + 2],
y[offset + 3],
y[offset + 4],
y[offset + 5],
y[offset + 6],
y[offset + 7],
]);
let result = x_vec * y_vec;
let arr: [f32; 8] = result.into();
z[offset..offset + 8].copy_from_slice(&arr);
}
let tail_start = chunks * 8;
for i in 0..remainder {
z[tail_start + i] = x[tail_start + i] * y[tail_start + i];
}
}
#[inline]
pub fn dot_f32(x: &[f32], y: &[f32]) -> f32 {
let n = x.len().min(y.len());
let chunks = n / 8;
let remainder = n % 8;
let mut acc = f32x8::splat(0.0);
for i in 0..chunks {
let offset = i * 8;
let x_vec = f32x8::new([
x[offset],
x[offset + 1],
x[offset + 2],
x[offset + 3],
x[offset + 4],
x[offset + 5],
x[offset + 6],
x[offset + 7],
]);
let y_vec = f32x8::new([
y[offset],
y[offset + 1],
y[offset + 2],
y[offset + 3],
y[offset + 4],
y[offset + 5],
y[offset + 6],
y[offset + 7],
]);
acc += x_vec * y_vec;
}
let arr: [f32; 8] = acc.into();
let mut sum: f32 = arr.iter().sum();
let tail_start = chunks * 8;
for i in 0..remainder {
sum += x[tail_start + i] * y[tail_start + i];
}
sum
}
#[inline]
pub fn scale_f32(a: f32, x: &mut [f32]) {
let n = x.len();
let a_vec = f32x8::splat(a);
let chunks = n / 8;
let remainder = n % 8;
for i in 0..chunks {
let offset = i * 8;
let x_vec = f32x8::new([
x[offset],
x[offset + 1],
x[offset + 2],
x[offset + 3],
x[offset + 4],
x[offset + 5],
x[offset + 6],
x[offset + 7],
]);
let result = a_vec * x_vec;
let arr: [f32; 8] = result.into();
x[offset..offset + 8].copy_from_slice(&arr);
}
let tail_start = chunks * 8;
for i in 0..remainder {
x[tail_start + i] *= a;
}
}
}
impl SimdOps {
#[inline]
pub fn sum_f32(x: &[f32]) -> f32 {
let n = x.len();
let chunks = n / 8;
let remainder = n % 8;
let mut acc = f32x8::splat(0.0);
for i in 0..chunks {
let offset = i * 8;
let x_vec = f32x8::new([
x[offset],
x[offset + 1],
x[offset + 2],
x[offset + 3],
x[offset + 4],
x[offset + 5],
x[offset + 6],
x[offset + 7],
]);
acc += x_vec;
}
let arr: [f32; 8] = acc.into();
let mut sum: f32 = arr.iter().sum();
let tail_start = chunks * 8;
for i in 0..remainder {
sum += x[tail_start + i];
}
sum
}
#[inline]
pub fn sum_f64(x: &[f64]) -> f64 {
let n = x.len();
let chunks = n / 4;
let remainder = n % 4;
let mut acc = f64x4::splat(0.0);
for i in 0..chunks {
let offset = i * 4;
let x_vec = f64x4::new([x[offset], x[offset + 1], x[offset + 2], x[offset + 3]]);
acc += x_vec;
}
let arr: [f64; 4] = acc.into();
let mut sum: f64 = arr.iter().sum();
let tail_start = chunks * 4;
for i in 0..remainder {
sum += x[tail_start + i];
}
sum
}
#[inline]
pub fn max_f32(x: &[f32]) -> f32 {
if x.is_empty() {
return f32::NEG_INFINITY;
}
let n = x.len();
let chunks = n / 8;
let remainder = n % 8;
let mut max_vec = f32x8::splat(f32::NEG_INFINITY);
for i in 0..chunks {
let offset = i * 8;
let x_vec = f32x8::new([
x[offset],
x[offset + 1],
x[offset + 2],
x[offset + 3],
x[offset + 4],
x[offset + 5],
x[offset + 6],
x[offset + 7],
]);
max_vec = max_vec.max(x_vec);
}
let arr: [f32; 8] = max_vec.into();
let mut max_val = arr.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
let tail_start = chunks * 8;
for i in 0..remainder {
max_val = max_val.max(x[tail_start + i]);
}
max_val
}
#[inline]
pub fn min_f32(x: &[f32]) -> f32 {
if x.is_empty() {
return f32::INFINITY;
}
let n = x.len();
let chunks = n / 8;
let remainder = n % 8;
let mut min_vec = f32x8::splat(f32::INFINITY);
for i in 0..chunks {
let offset = i * 8;
let x_vec = f32x8::new([
x[offset],
x[offset + 1],
x[offset + 2],
x[offset + 3],
x[offset + 4],
x[offset + 5],
x[offset + 6],
x[offset + 7],
]);
min_vec = min_vec.min(x_vec);
}
let arr: [f32; 8] = min_vec.into();
let mut min_val = arr.iter().cloned().fold(f32::INFINITY, f32::min);
let tail_start = chunks * 8;
for i in 0..remainder {
min_val = min_val.min(x[tail_start + i]);
}
min_val
}
#[inline]
pub fn mean_f32(x: &[f32]) -> f32 {
if x.is_empty() {
return 0.0;
}
Self::sum_f32(x) / x.len() as f32
}
}
impl SimdOps {
#[inline]
pub fn laplacian_2d_f32(p: &[f32], laplacian: &mut [f32], width: usize, height: usize) {
let four = f32x8::splat(4.0);
for y in 1..height - 1 {
let row_start = y * width;
let row_above = (y - 1) * width;
let row_below = (y + 1) * width;
let inner_width = width - 2;
let chunks = inner_width / 8;
let remainder = inner_width % 8;
for chunk in 0..chunks {
let x = 1 + chunk * 8;
let idx = row_start + x;
let center = f32x8::new([
p[idx],
p[idx + 1],
p[idx + 2],
p[idx + 3],
p[idx + 4],
p[idx + 5],
p[idx + 6],
p[idx + 7],
]);
let north_idx = row_above + x;
let north = f32x8::new([
p[north_idx],
p[north_idx + 1],
p[north_idx + 2],
p[north_idx + 3],
p[north_idx + 4],
p[north_idx + 5],
p[north_idx + 6],
p[north_idx + 7],
]);
let south_idx = row_below + x;
let south = f32x8::new([
p[south_idx],
p[south_idx + 1],
p[south_idx + 2],
p[south_idx + 3],
p[south_idx + 4],
p[south_idx + 5],
p[south_idx + 6],
p[south_idx + 7],
]);
let west = f32x8::new([
p[idx - 1],
p[idx],
p[idx + 1],
p[idx + 2],
p[idx + 3],
p[idx + 4],
p[idx + 5],
p[idx + 6],
]);
let east = f32x8::new([
p[idx + 1],
p[idx + 2],
p[idx + 3],
p[idx + 4],
p[idx + 5],
p[idx + 6],
p[idx + 7],
p[idx + 8],
]);
let result = north + south + west + east - four * center;
let arr: [f32; 8] = result.into();
laplacian[idx..idx + 8].copy_from_slice(&arr);
}
let tail_start = 1 + chunks * 8;
for i in 0..remainder {
let x = tail_start + i;
let idx = row_start + x;
laplacian[idx] =
p[row_above + x] + p[row_below + x] + p[idx - 1] + p[idx + 1] - 4.0 * p[idx];
}
}
}
#[inline]
pub fn fdtd_step_2d_f32(p: &[f32], p_prev: &mut [f32], c2: f32, width: usize, height: usize) {
let two = f32x8::splat(2.0);
let four = f32x8::splat(4.0);
let c2_vec = f32x8::splat(c2);
for y in 1..height - 1 {
let row_start = y * width;
let row_above = (y - 1) * width;
let row_below = (y + 1) * width;
let inner_width = width - 2;
let chunks = inner_width / 8;
let remainder = inner_width % 8;
for chunk in 0..chunks {
let x = 1 + chunk * 8;
let idx = row_start + x;
let center = f32x8::new([
p[idx],
p[idx + 1],
p[idx + 2],
p[idx + 3],
p[idx + 4],
p[idx + 5],
p[idx + 6],
p[idx + 7],
]);
let prev = f32x8::new([
p_prev[idx],
p_prev[idx + 1],
p_prev[idx + 2],
p_prev[idx + 3],
p_prev[idx + 4],
p_prev[idx + 5],
p_prev[idx + 6],
p_prev[idx + 7],
]);
let north_idx = row_above + x;
let north = f32x8::new([
p[north_idx],
p[north_idx + 1],
p[north_idx + 2],
p[north_idx + 3],
p[north_idx + 4],
p[north_idx + 5],
p[north_idx + 6],
p[north_idx + 7],
]);
let south_idx = row_below + x;
let south = f32x8::new([
p[south_idx],
p[south_idx + 1],
p[south_idx + 2],
p[south_idx + 3],
p[south_idx + 4],
p[south_idx + 5],
p[south_idx + 6],
p[south_idx + 7],
]);
let west = f32x8::new([
p[idx - 1],
p[idx],
p[idx + 1],
p[idx + 2],
p[idx + 3],
p[idx + 4],
p[idx + 5],
p[idx + 6],
]);
let east = f32x8::new([
p[idx + 1],
p[idx + 2],
p[idx + 3],
p[idx + 4],
p[idx + 5],
p[idx + 6],
p[idx + 7],
p[idx + 8],
]);
let laplacian = north + south + west + east - four * center;
let result = two * center - prev + c2_vec * laplacian;
let arr: [f32; 8] = result.into();
p_prev[idx..idx + 8].copy_from_slice(&arr);
}
let tail_start = 1 + chunks * 8;
for i in 0..remainder {
let x = tail_start + i;
let idx = row_start + x;
let laplacian =
p[row_above + x] + p[row_below + x] + p[idx - 1] + p[idx + 1] - 4.0 * p[idx];
p_prev[idx] = 2.0 * p[idx] - p_prev[idx] + c2 * laplacian;
}
}
}
#[inline]
pub fn laplacian_3d_f32(
p: &[f32],
laplacian: &mut [f32],
width: usize,
height: usize,
depth: usize,
) {
let stride_y = width;
let stride_z = width * height;
let six = f32x8::splat(6.0);
for z in 1..depth - 1 {
for y in 1..height - 1 {
let row_start = z * stride_z + y * stride_y;
let inner_width = width - 2;
let chunks = inner_width / 8;
let remainder = inner_width % 8;
for chunk in 0..chunks {
let x = 1 + chunk * 8;
let idx = row_start + x;
let center = f32x8::new([
p[idx],
p[idx + 1],
p[idx + 2],
p[idx + 3],
p[idx + 4],
p[idx + 5],
p[idx + 6],
p[idx + 7],
]);
let west = f32x8::new([
p[idx - 1],
p[idx],
p[idx + 1],
p[idx + 2],
p[idx + 3],
p[idx + 4],
p[idx + 5],
p[idx + 6],
]);
let east = f32x8::new([
p[idx + 1],
p[idx + 2],
p[idx + 3],
p[idx + 4],
p[idx + 5],
p[idx + 6],
p[idx + 7],
p[idx + 8],
]);
let north_idx = idx - stride_y;
let south_idx = idx + stride_y;
let north = f32x8::new([
p[north_idx],
p[north_idx + 1],
p[north_idx + 2],
p[north_idx + 3],
p[north_idx + 4],
p[north_idx + 5],
p[north_idx + 6],
p[north_idx + 7],
]);
let south = f32x8::new([
p[south_idx],
p[south_idx + 1],
p[south_idx + 2],
p[south_idx + 3],
p[south_idx + 4],
p[south_idx + 5],
p[south_idx + 6],
p[south_idx + 7],
]);
let up_idx = idx - stride_z;
let down_idx = idx + stride_z;
let up = f32x8::new([
p[up_idx],
p[up_idx + 1],
p[up_idx + 2],
p[up_idx + 3],
p[up_idx + 4],
p[up_idx + 5],
p[up_idx + 6],
p[up_idx + 7],
]);
let down = f32x8::new([
p[down_idx],
p[down_idx + 1],
p[down_idx + 2],
p[down_idx + 3],
p[down_idx + 4],
p[down_idx + 5],
p[down_idx + 6],
p[down_idx + 7],
]);
let result = west + east + north + south + up + down - six * center;
let arr: [f32; 8] = result.into();
laplacian[idx..idx + 8].copy_from_slice(&arr);
}
let tail_start = 1 + chunks * 8;
for i in 0..remainder {
let x = tail_start + i;
let idx = row_start + x;
laplacian[idx] = p[idx - 1]
+ p[idx + 1]
+ p[idx - stride_y]
+ p[idx + stride_y]
+ p[idx - stride_z]
+ p[idx + stride_z]
- 6.0 * p[idx];
}
}
}
}
}
impl SimdOps {
pub fn par_saxpy(a: f32, x: &[f32], y: &mut [f32]) {
const CHUNK_SIZE: usize = 4096;
y.par_chunks_mut(CHUNK_SIZE)
.zip(x.par_chunks(CHUNK_SIZE))
.for_each(|(y_chunk, x_chunk)| {
Self::saxpy(a, x_chunk, y_chunk);
});
}
pub fn par_sum_f32(x: &[f32]) -> f32 {
const CHUNK_SIZE: usize = 4096;
x.par_chunks(CHUNK_SIZE).map(Self::sum_f32).sum()
}
pub fn par_fdtd_step_2d_f32(
p: &[f32],
p_prev: &mut [f32],
c2: f32,
width: usize,
height: usize,
) {
p_prev
.par_chunks_mut(width)
.enumerate()
.skip(1)
.take(height - 2)
.for_each(|(y, row)| {
let row_above = (y - 1) * width;
let row_below = (y + 1) * width;
let row_start = y * width;
let two = f32x8::splat(2.0);
let four = f32x8::splat(4.0);
let c2_vec = f32x8::splat(c2);
let inner_width = width - 2;
let chunks = inner_width / 8;
let remainder = inner_width % 8;
for chunk in 0..chunks {
let x = 1 + chunk * 8;
let idx = row_start + x;
let local_x = x;
let center = f32x8::new([
p[idx],
p[idx + 1],
p[idx + 2],
p[idx + 3],
p[idx + 4],
p[idx + 5],
p[idx + 6],
p[idx + 7],
]);
let prev = f32x8::new([
row[local_x],
row[local_x + 1],
row[local_x + 2],
row[local_x + 3],
row[local_x + 4],
row[local_x + 5],
row[local_x + 6],
row[local_x + 7],
]);
let north_idx = row_above + x;
let north = f32x8::new([
p[north_idx],
p[north_idx + 1],
p[north_idx + 2],
p[north_idx + 3],
p[north_idx + 4],
p[north_idx + 5],
p[north_idx + 6],
p[north_idx + 7],
]);
let south_idx = row_below + x;
let south = f32x8::new([
p[south_idx],
p[south_idx + 1],
p[south_idx + 2],
p[south_idx + 3],
p[south_idx + 4],
p[south_idx + 5],
p[south_idx + 6],
p[south_idx + 7],
]);
let west = f32x8::new([
p[idx - 1],
p[idx],
p[idx + 1],
p[idx + 2],
p[idx + 3],
p[idx + 4],
p[idx + 5],
p[idx + 6],
]);
let east = f32x8::new([
p[idx + 1],
p[idx + 2],
p[idx + 3],
p[idx + 4],
p[idx + 5],
p[idx + 6],
p[idx + 7],
p[idx + 8],
]);
let laplacian = north + south + west + east - four * center;
let result = two * center - prev + c2_vec * laplacian;
let arr: [f32; 8] = result.into();
row[local_x..local_x + 8].copy_from_slice(&arr);
}
let tail_start = 1 + chunks * 8;
for i in 0..remainder {
let x = tail_start + i;
let idx = row_start + x;
let laplacian = p[row_above + x] + p[row_below + x] + p[idx - 1] + p[idx + 1]
- 4.0 * p[idx];
row[x] = 2.0 * p[idx] - row[x] + c2 * laplacian;
}
});
}
}
impl SimdOps {
#[inline]
pub fn sum_i32(x: &[i32]) -> i64 {
let n = x.len();
let chunks = n / 8;
let remainder = n % 8;
let mut acc = i32x8::splat(0);
for i in 0..chunks {
let offset = i * 8;
let x_vec = i32x8::new([
x[offset],
x[offset + 1],
x[offset + 2],
x[offset + 3],
x[offset + 4],
x[offset + 5],
x[offset + 6],
x[offset + 7],
]);
acc += x_vec;
}
let arr: [i32; 8] = acc.into();
let mut sum: i64 = arr.iter().map(|&v| v as i64).sum();
let tail_start = chunks * 8;
for i in 0..remainder {
sum += x[tail_start + i] as i64;
}
sum
}
#[inline]
pub fn add_i32(x: &[i32], y: &[i32], z: &mut [i32]) {
let n = x.len().min(y.len()).min(z.len());
let chunks = n / 8;
let remainder = n % 8;
for i in 0..chunks {
let offset = i * 8;
let x_vec = i32x8::new([
x[offset],
x[offset + 1],
x[offset + 2],
x[offset + 3],
x[offset + 4],
x[offset + 5],
x[offset + 6],
x[offset + 7],
]);
let y_vec = i32x8::new([
y[offset],
y[offset + 1],
y[offset + 2],
y[offset + 3],
y[offset + 4],
y[offset + 5],
y[offset + 6],
y[offset + 7],
]);
let result = x_vec + y_vec;
let arr: [i32; 8] = result.into();
z[offset..offset + 8].copy_from_slice(&arr);
}
let tail_start = chunks * 8;
for i in 0..remainder {
z[tail_start + i] = x[tail_start + i] + y[tail_start + i];
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// 100 elements: twelve full 8-wide groups plus a four-element tail.
    #[test]
    fn test_saxpy() {
        let ones = vec![1.0f32; 100];
        let mut acc = vec![2.0f32; 100];
        SimdOps::saxpy(3.0, &ones, &mut acc);
        acc.iter()
            .for_each(|v| assert!((v - 5.0).abs() < 1e-6, "Expected 5.0, got {}", v));
    }

    /// 13 elements: one full group plus a five-element tail.
    #[test]
    fn test_saxpy_unaligned() {
        let ones = vec![1.0f32; 13];
        let mut acc = vec![2.0f32; 13];
        SimdOps::saxpy(2.0, &ones, &mut acc);
        acc.iter().for_each(|v| assert!((v - 4.0).abs() < 1e-6));
    }

    #[test]
    fn test_daxpy() {
        let ones = vec![1.0f64; 100];
        let mut acc = vec![2.0f64; 100];
        SimdOps::daxpy(3.0, &ones, &mut acc);
        acc.iter().for_each(|v| assert!((v - 5.0).abs() < 1e-10));
    }

    #[test]
    fn test_dot_product() {
        let lhs = vec![1.0f32; 100];
        let rhs = vec![2.0f32; 100];
        let dot = SimdOps::dot_f32(&lhs, &rhs);
        assert!((dot - 200.0).abs() < 1e-4);
    }

    #[test]
    fn test_sum() {
        let sum = SimdOps::sum_f32(&vec![1.0f32; 1000]);
        assert!((sum - 1000.0).abs() < 1e-3);
    }

    /// Nine values, so the last one (-1.0) is handled by the scalar tail.
    #[test]
    fn test_max_min() {
        let samples = vec![1.0f32, -5.0, 3.0, 7.0, -2.0, 4.0, 6.0, 8.0, -1.0];
        let (max, min) = (SimdOps::max_f32(&samples), SimdOps::min_f32(&samples));
        assert!((max - 8.0).abs() < 1e-6);
        assert!((min - (-5.0)).abs() < 1e-6);
    }

    /// Unit impulse in the centre of a 5x5 grid (index 12).
    #[test]
    fn test_laplacian_2d() {
        let (width, height) = (5, 5);
        let mut p = vec![0.0f32; width * height];
        p[12] = 1.0;
        let mut laplacian = vec![0.0f32; width * height];
        SimdOps::laplacian_2d_f32(&p, &mut laplacian, width, height);

        // The impulse cell itself.
        assert!((laplacian[12] - (-4.0)).abs() < 1e-6);
        // Its west, east, north and south neighbours.
        assert!((laplacian[11] - 1.0).abs() < 1e-6);
        assert!((laplacian[13] - 1.0).abs() < 1e-6);
        assert!((laplacian[7] - 1.0).abs() < 1e-6);
        assert!((laplacian[17] - 1.0).abs() < 1e-6);
    }

    /// With `p_prev == 0`, the impulse cell becomes `2 * 1 + 0.1 * (-4) = 1.6`.
    #[test]
    fn test_fdtd_step_2d() {
        let (width, height) = (10, 10);
        let mut p = vec![0.0f32; width * height];
        let mut p_prev = vec![0.0f32; width * height];
        p[55] = 1.0;
        let c2 = 0.1;
        SimdOps::fdtd_step_2d_f32(&p, &mut p_prev, c2, width, height);
        assert!((p_prev[55] - 1.6).abs() < 1e-6);
    }

    /// 10000 elements span three parallel chunks.
    #[test]
    fn test_par_saxpy() {
        let ones = vec![1.0f32; 10000];
        let mut acc = vec![2.0f32; 10000];
        SimdOps::par_saxpy(3.0, &ones, &mut acc);
        acc.iter().for_each(|v| assert!((v - 5.0).abs() < 1e-6));
    }

    #[test]
    fn test_par_sum() {
        let sum = SimdOps::par_sum_f32(&vec![1.0f32; 100000]);
        assert!((sum - 100000.0).abs() < 1.0);
    }

    #[test]
    fn test_sum_i32() {
        let sum = SimdOps::sum_i32(&vec![1i32; 1000]);
        assert_eq!(sum, 1000);
    }

    #[test]
    fn test_add_vectors() {
        let lhs = vec![1.0f32; 100];
        let rhs = vec![2.0f32; 100];
        let mut out = vec![0.0f32; 100];
        SimdOps::add_f32(&lhs, &rhs, &mut out);
        out.iter().for_each(|v| assert!((v - 3.0).abs() < 1e-6));
    }

    #[test]
    fn test_scale() {
        let mut values = vec![2.0f32; 100];
        SimdOps::scale_f32(3.0, &mut values);
        values.iter().for_each(|v| assert!((v - 6.0).abs() < 1e-6));
    }
}