#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
use crate::ops::BinaryOp;
// A 256-bit AVX2 vector holds eight 32-bit integer lanes (256 / 32).
const I32_LANES: usize = 8;
/// Expands to an AVX2 kernel applying a lane-wise binary intrinsic over
/// `i32` buffers.
///
/// `$name` is the identifier of the generated function; `$vec_op` is the
/// 256-bit intrinsic applied to each pair of 8-lane vectors. The generated
/// function processes exactly `chunks` full vectors; any tail elements are
/// the caller's responsibility.
macro_rules! impl_binary_i32_avx2 {
    ($name:ident, $vec_op:ident) => {
        /// # Safety
        ///
        /// Requires AVX2 at runtime; `a`, `b`, and `out` must each be valid
        /// for `chunks * I32_LANES` `i32` elements.
        #[target_feature(enable = "avx2")]
        unsafe fn $name(a: *const i32, b: *const i32, out: *mut i32, chunks: usize) {
            let mut base = 0usize;
            let end = chunks * I32_LANES;
            while base < end {
                // Unaligned load/store: callers make no alignment promises.
                let lhs = _mm256_loadu_si256(a.add(base) as *const __m256i);
                let rhs = _mm256_loadu_si256(b.add(base) as *const __m256i);
                _mm256_storeu_si256(out.add(base) as *mut __m256i, $vec_op(lhs, rhs));
                base += I32_LANES;
            }
        }
    };
}
// Instantiate one AVX2 kernel per supported element-wise operation.
impl_binary_i32_avx2!(binary_add_i32, _mm256_add_epi32); // lane-wise add
impl_binary_i32_avx2!(binary_sub_i32, _mm256_sub_epi32); // lane-wise subtract
impl_binary_i32_avx2!(binary_mul_i32, _mm256_mullo_epi32); // lane-wise multiply (low 32 bits of each product)
impl_binary_i32_avx2!(binary_max_i32, _mm256_max_epi32); // lane-wise signed max
impl_binary_i32_avx2!(binary_min_i32, _mm256_min_epi32); // lane-wise signed min
/// Applies the element-wise binary operation `op` to `len` `i32`s from `a`
/// and `b`, writing results to `out`. Full 8-lane vectors go through the
/// AVX2 kernels; the remaining tail — and any op without a SIMD kernel —
/// falls back to the scalar implementation.
///
/// # Safety
///
/// The caller must ensure AVX2 is available on the running CPU and that
/// `a`, `b`, and `out` are each valid for `len` `i32` elements.
#[target_feature(enable = "avx2")]
pub unsafe fn binary_i32(op: BinaryOp, a: *const i32, b: *const i32, out: *mut i32, len: usize) {
    let full_chunks = len / I32_LANES;
    let tail = len % I32_LANES;

    // Dispatch the vectorized bulk; record whether a SIMD kernel ran.
    let vectorized = match op {
        BinaryOp::Add => {
            binary_add_i32(a, b, out, full_chunks);
            true
        }
        BinaryOp::Sub => {
            binary_sub_i32(a, b, out, full_chunks);
            true
        }
        BinaryOp::Mul => {
            binary_mul_i32(a, b, out, full_chunks);
            true
        }
        BinaryOp::Max => {
            binary_max_i32(a, b, out, full_chunks);
            true
        }
        BinaryOp::Min => {
            binary_min_i32(a, b, out, full_chunks);
            true
        }
        _ => false,
    };

    // No SIMD kernel for this op: process the whole buffer scalar-wise.
    if !vectorized {
        super::super::binary_scalar_i32(op, a, b, out, len);
        return;
    }

    // Scalar cleanup for the final `len % I32_LANES` elements.
    if tail > 0 {
        let done = full_chunks * I32_LANES;
        super::super::binary_scalar_i32(op, a.add(done), b.add(done), out.add(done), tail);
    }
}