use crate::image::ImageView;
use multiversion::multiversion;
#[multiversion(targets("x86_64+avx2+fma", "aarch64+neon"))]
pub fn sample_bilinear_v8(img: &ImageView, x: &[f32; 8], y: &[f32; 8], out: &mut [f32; 8]) {
#[cfg(all(
target_arch = "x86_64",
target_feature = "avx2",
target_feature = "fma"
))]
if img.has_simd_padding() {
unsafe {
use std::arch::x86_64::*;
let vx = _mm256_loadu_ps(x.as_ptr());
let vy = _mm256_loadu_ps(y.as_ptr());
let half = _mm256_set1_ps(0.5);
let vx = _mm256_sub_ps(vx, half);
let vy = _mm256_sub_ps(vy, half);
let zero = _mm256_setzero_ps();
let max_x = _mm256_set1_ps((img.width as f32 - 2.0).max(0.0));
let max_y = _mm256_set1_ps((img.height as f32 - 2.0).max(0.0));
let vx_clamped = _mm256_max_ps(zero, _mm256_min_ps(vx, max_x));
let vy_clamped = _mm256_max_ps(zero, _mm256_min_ps(vy, max_y));
let vx_floor = _mm256_floor_ps(vx_clamped);
let vy_floor = _mm256_floor_ps(vy_clamped);
let vix = _mm256_cvtps_epi32(vx_floor);
let viy = _mm256_cvtps_epi32(vy_floor);
let fx = _mm256_sub_ps(vx_clamped, vx_floor);
let fy = _mm256_sub_ps(vy_clamped, vy_floor);
let one = _mm256_set1_ps(1.0);
let inv_fx = _mm256_sub_ps(one, fx);
let inv_fy = _mm256_sub_ps(one, fy);
let stride = _mm256_set1_epi32(img.stride as i32);
let idx_tl = _mm256_add_epi32(_mm256_mullo_epi32(viy, stride), vix);
let idx_tr = _mm256_add_epi32(idx_tl, _mm256_set1_epi32(1));
let idx_bl = _mm256_add_epi32(idx_tl, stride);
let idx_br = _mm256_add_epi32(idx_bl, _mm256_set1_epi32(1));
let base_ptr = img.data.as_ptr() as *const i32;
let gather_to_f32 = |offsets: __m256i| -> __m256 {
let gathered = _mm256_i32gather_epi32(base_ptr, offsets, 1);
let masked = _mm256_and_si256(gathered, _mm256_set1_epi32(0xFF));
_mm256_cvtepi32_ps(masked)
};
let v_tl = gather_to_f32(idx_tl);
let v_tr = gather_to_f32(idx_tr);
let v_bl = gather_to_f32(idx_bl);
let v_br = gather_to_f32(idx_br);
let w_tl = _mm256_mul_ps(inv_fx, inv_fy);
let w_tr = _mm256_mul_ps(fx, inv_fy);
let w_bl = _mm256_mul_ps(inv_fx, fy);
let w_br = _mm256_mul_ps(fx, fy);
let mut res = _mm256_mul_ps(w_tl, v_tl);
res = _mm256_fmadd_ps(w_tr, v_tr, res);
res = _mm256_fmadd_ps(w_bl, v_bl, res);
res = _mm256_fmadd_ps(w_br, v_br, res);
_mm256_storeu_ps(out.as_mut_ptr(), res);
return;
}
}
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
#[allow(unsafe_code)]
unsafe {
use std::arch::aarch64::*;
for chunk in 0..2 {
let offset = chunk * 4;
let vx = vld1q_f32(x.as_ptr().add(offset));
let vy = vld1q_f32(y.as_ptr().add(offset));
let half = vdupq_n_f32(0.5);
let vx = vsubq_f32(vx, half);
let vy = vsubq_f32(vy, half);
let zero = vdupq_n_f32(0.0);
let max_x = vdupq_n_f32((img.width as f32 - 2.0).max(0.0));
let max_y = vdupq_n_f32((img.height as f32 - 2.0).max(0.0));
let vx_clamped = vmaxq_f32(zero, vminq_f32(vx, max_x));
let vy_clamped = vmaxq_f32(zero, vminq_f32(vy, max_y));
let vx_floor = vrndmq_f32(vx_clamped);
let vy_floor = vrndmq_f32(vy_clamped);
let vix = vcvtq_s32_f32(vx_floor);
let viy = vcvtq_s32_f32(vy_floor);
let fx = vsubq_f32(vx_clamped, vx_floor);
let fy = vsubq_f32(vy_clamped, vy_floor);
let one = vdupq_n_f32(1.0);
let inv_fx = vsubq_f32(one, fx);
let inv_fy = vsubq_f32(one, fy);
let mut pix_tl = [0.0f32; 4];
let mut pix_tr = [0.0f32; 4];
let mut pix_bl = [0.0f32; 4];
let mut pix_br = [0.0f32; 4];
let mut vix_arr = [0i32; 4];
let mut viy_arr = [0i32; 4];
vst1q_s32(vix_arr.as_mut_ptr(), vix);
vst1q_s32(viy_arr.as_mut_ptr(), viy);
for i in 0..4 {
let px = vix_arr[i] as usize;
let py = viy_arr[i] as usize;
let stride = img.stride;
let base = py * stride + px;
pix_tl[i] = f32::from(img.data[base]);
pix_tr[i] = f32::from(img.data[base + 1]);
pix_bl[i] = f32::from(img.data[base + stride]);
pix_br[i] = f32::from(img.data[base + stride + 1]);
}
let v_tl = vld1q_f32(pix_tl.as_ptr());
let v_tr = vld1q_f32(pix_tr.as_ptr());
let v_bl = vld1q_f32(pix_bl.as_ptr());
let v_br = vld1q_f32(pix_br.as_ptr());
let w_tl = vmulq_f32(inv_fx, inv_fy);
let w_tr = vmulq_f32(fx, inv_fy);
let w_bl = vmulq_f32(inv_fx, fy);
let w_br = vmulq_f32(fx, fy);
let mut res = vmulq_f32(w_tl, v_tl);
res = vfmaq_f32(res, w_tr, v_tr);
res = vfmaq_f32(res, w_bl, v_bl);
res = vfmaq_f32(res, w_br, v_br);
vst1q_f32(out.as_mut_ptr().add(offset), res);
}
return;
}
#[cfg(not(any(
all(
target_arch = "x86_64",
target_feature = "avx2",
target_feature = "fma"
),
all(target_arch = "aarch64", target_feature = "neon")
)))]
{
for i in 0..8 {
out[i] = img.sample_bilinear(f64::from(x[i]), f64::from(y[i])) as f32;
}
}
}