#![allow(clippy::cast_precision_loss)]
#![allow(clippy::cast_possible_truncation)]
#![allow(clippy::cast_sign_loss)]
use scirs2_core::simd::{
simd_add_f32, simd_clip_f32, simd_dot_f32, simd_mul_f32, simd_round_f32, simd_sub_f32,
};
/// Resizes an interleaved 3-bytes-per-pixel image with bilinear interpolation.
///
/// `src` is read as `src_h` rows of `src_w` pixels, three bytes per pixel.
/// The returned buffer has the same layout with `dst_h` rows of `dst_w`
/// pixels. Destination pixel centres are mapped back into the source with
/// `(d + 0.5) * scale - 0.5`, and coordinates that fall before the first
/// row are clamped to it.
///
/// Returns an empty vector when any of the four dimensions is zero.
///
/// # Panics
///
/// Indexing into `src` may panic when it holds fewer bytes than the sampled
/// rows require (at most `src_w * src_h * 3`).
#[must_use]
pub fn bilinear_resize_simd(
    src: &[u8],
    src_w: usize,
    src_h: usize,
    dst_w: usize,
    dst_h: usize,
) -> Vec<u8> {
    if [src_w, src_h, dst_w, dst_h].contains(&0) {
        return Vec::new();
    }

    let x_ratio = src_w as f64 / dst_w as f64;
    let y_ratio = src_h as f64 / dst_h as f64;

    let mut out = vec![0u8; dst_w * dst_h * 3];
    let row_bytes = dst_w * 3;

    // Each chunk is exactly one destination row; `row_bytes` is non-zero here.
    for (dy, out_row) in out.chunks_exact_mut(row_bytes).enumerate() {
        // Vertical sample position in source space for this destination row.
        let centre = (dy as f64 + 0.5) * y_ratio - 0.5;
        let upper = centre.floor().max(0.0) as usize;
        let lower = (upper + 1).min(src_h - 1);
        // Blend factor between the two rows; zero when `centre` is negative.
        let blend = (centre - upper as f64).max(0.0) as f32;

        bilinear_row_simd(src, out_row, src_w, dst_w, upper, lower, blend, x_ratio);
    }

    out
}
/// Produces one destination row of a bilinear resize.
///
/// * `src` - interleaved source image, three bytes per pixel, `src_w` pixels per row.
/// * `dst_row` - output row; byte `dx * 3 + c` is written for every `dx < dst_w`
///   and every channel `c` in `0..3`.
/// * `sy0` / `sy1` - indices of the two source rows being blended.
/// * `fy` - vertical blend weight applied to row `sy1` (`1 - fy` goes to `sy0`).
/// * `scale_x` - source-pixels-per-destination-pixel ratio along x.
///
/// For each channel the four neighbouring samples are gathered into
/// contiguous `f32` buffers so that the blend itself can be expressed with
/// the whole-row `simd_*_f32_vec` helpers.
///
/// # Panics
///
/// Panics if an index computed from `sy0`, `sy1` or the horizontal taps lies
/// outside `src`, or if `dst_row` is shorter than `dst_w * 3`.
fn bilinear_row_simd(
    src: &[u8],
    dst_row: &mut [u8],
    src_w: usize,
    dst_w: usize,
    sy0: usize,
    sy1: usize,
    fy: f32,
    scale_x: f64,
) {
    let row0_offset = sy0 * src_w * 3;
    let row1_offset = sy1 * src_w * 3;

    // Horizontal taps and weights depend only on `dx`, not on the channel.
    let mut sx0_vec = Vec::with_capacity(dst_w);
    let mut sx1_vec = Vec::with_capacity(dst_w);
    let mut fx_vec = Vec::with_capacity(dst_w);
    for dx in 0..dst_w {
        let sx_f = (dx as f64 + 0.5) * scale_x - 0.5;
        let sx0 = sx_f.floor().max(0.0) as usize;
        let sx1 = (sx0 + 1).min(src_w - 1);
        let fx = (sx_f - sx0 as f64).max(0.0) as f32;
        sx0_vec.push(sx0);
        sx1_vec.push(sx1);
        fx_vec.push(fx);
    }

    // Weight vectors are identical for all three channels, so build them once
    // instead of re-allocating (and re-subtracting) inside the channel loop.
    let ones = vec![1.0f32; dst_w];
    let one_minus_fx = simd_sub_f32_vec(&ones, &fx_vec);
    let fy_arr = vec![fy; dst_w];
    let one_minus_fy = vec![1.0 - fy; dst_w];

    for c in 0..3usize {
        // Gather one channel of the samples at `cols` from the row starting at `row_offset`.
        let gather = |row_offset: usize, cols: &[usize]| -> Vec<f32> {
            cols.iter()
                .map(|&sx| f32::from(src[row_offset + sx * 3 + c]))
                .collect()
        };
        let p00 = gather(row0_offset, &sx0_vec);
        let p10 = gather(row0_offset, &sx1_vec);
        let p01 = gather(row1_offset, &sx0_vec);
        let p11 = gather(row1_offset, &sx1_vec);

        // Horizontal blend on the upper and lower source rows.
        let top_left = simd_mul_f32_vec(&p00, &one_minus_fx);
        let top_right = simd_mul_f32_vec(&p10, &fx_vec);
        let top = simd_add_f32_vec(&top_left, &top_right);

        let bot_left = simd_mul_f32_vec(&p01, &one_minus_fx);
        let bot_right = simd_mul_f32_vec(&p11, &fx_vec);
        let bot = simd_add_f32_vec(&bot_left, &bot_right);

        // Vertical blend between the two horizontally interpolated rows.
        let val_top = simd_mul_f32_vec(&top, &one_minus_fy);
        let val_bot = simd_mul_f32_vec(&bot, &fy_arr);
        let val = simd_add_f32_vec(&val_top, &val_bot);

        // Round, then limit to the representable byte range before narrowing.
        let rounded = simd_round_f32_vec(&val);
        let clamped = simd_clip_f32_vec(&rounded, 0.0, 255.0);

        for dx in 0..dst_w {
            dst_row[dx * 3 + c] = clamped[dx] as u8;
        }
    }
}
/// Slice-to-`Vec` adapter around `simd_sub_f32`.
///
/// Both slices are borrowed as one-dimensional array views, handed to
/// `simd_sub_f32` in the order `(a, b)`, and the result is copied into a
/// fresh `Vec<f32>`. Callers in this file use it as element-wise `a - b`.
fn simd_sub_f32_vec(a: &[f32], b: &[f32]) -> Vec<f32> {
    use scirs2_core::ndarray::ArrayView1;
    let (lhs, rhs) = (ArrayView1::from(a), ArrayView1::from(b));
    let difference = simd_sub_f32(&lhs, &rhs);
    difference.to_vec()
}
/// Slice-to-`Vec` adapter around `simd_mul_f32`.
///
/// Both slices are borrowed as one-dimensional array views, handed to
/// `simd_mul_f32` in the order `(a, b)`, and the result is copied into a
/// fresh `Vec<f32>`. Callers in this file use it as an element-wise product.
fn simd_mul_f32_vec(a: &[f32], b: &[f32]) -> Vec<f32> {
    use scirs2_core::ndarray::ArrayView1;
    let (lhs, rhs) = (ArrayView1::from(a), ArrayView1::from(b));
    let product = simd_mul_f32(&lhs, &rhs);
    product.to_vec()
}
/// Slice-to-`Vec` adapter around `simd_add_f32`.
///
/// Both slices are borrowed as one-dimensional array views, handed to
/// `simd_add_f32` in the order `(a, b)`, and the result is copied into a
/// fresh `Vec<f32>`. Callers in this file use it as an element-wise sum.
fn simd_add_f32_vec(a: &[f32], b: &[f32]) -> Vec<f32> {
    use scirs2_core::ndarray::ArrayView1;
    let (lhs, rhs) = (ArrayView1::from(a), ArrayView1::from(b));
    let sum = simd_add_f32(&lhs, &rhs);
    sum.to_vec()
}
/// Slice-to-`Vec` adapter around `simd_round_f32`.
///
/// The slice is borrowed as a one-dimensional array view, passed to
/// `simd_round_f32`, and the result is copied into a fresh `Vec<f32>`.
/// The tie-breaking rule is whatever `simd_round_f32` implements.
fn simd_round_f32_vec(a: &[f32]) -> Vec<f32> {
    use scirs2_core::ndarray::ArrayView1;
    let input = ArrayView1::from(a);
    let rounded = simd_round_f32(&input);
    rounded.to_vec()
}
/// Slice-to-`Vec` adapter around `simd_clip_f32`.
///
/// The slice is borrowed as a one-dimensional array view and passed to
/// `simd_clip_f32` together with the bounds `lo` and `hi`; the result is
/// copied into a fresh `Vec<f32>`. The resize code uses it to keep values
/// inside `0.0..=255.0` before narrowing to `u8`.
fn simd_clip_f32_vec(a: &[f32], lo: f32, hi: f32) -> Vec<f32> {
    use scirs2_core::ndarray::ArrayView1;
    let input = ArrayView1::from(a);
    let limited = simd_clip_f32(&input, lo, hi);
    limited.to_vec()
}
/// Applies a 1-D kernel along `src_row`, producing one output per input sample.
///
/// Returns an empty vector when either `src_row` or `kernel` is empty;
/// otherwise the work is delegated to `horizontal_convolve_scirs2`.
#[must_use]
pub fn horizontal_convolve_simd(src_row: &[f32], kernel: &[f32]) -> Vec<f32> {
    match (src_row.is_empty(), kernel.is_empty()) {
        (false, false) => horizontal_convolve_scirs2(src_row, kernel),
        _ => Vec::new(),
    }
}
/// Sliding dot product of `kernel` against `src_row` with clamped edges.
///
/// Output `i` is `simd_dot_f32(window, kernel)`, where
/// `window[k] = src_row[clamp(i + k - kernel.len() / 2, 0, len - 1)]`.
/// The kernel is applied in the order given (it is not flipped), and
/// samples requested beyond either end of the row repeat the nearest edge
/// value. The result has the same length as `src_row`.
///
/// A single window buffer is reused for every output sample and both
/// operands are passed to `simd_dot_f32` as borrowed views, so the loop
/// performs no per-sample heap allocation.
fn horizontal_convolve_scirs2(src_row: &[f32], kernel: &[f32]) -> Vec<f32> {
    use scirs2_core::ndarray::ArrayView1;

    let len = src_row.len();
    let half = (kernel.len() / 2) as isize;
    // Only used inside the loop, which does not run when `len == 0`.
    let last = len as isize - 1;

    let kernel_view = ArrayView1::from(kernel);
    let mut window = vec![0.0f32; kernel.len()];
    let mut out = Vec::with_capacity(len);

    for i in 0..len {
        // Source index of the first tap; may be negative near the left edge.
        let first = i as isize - half;
        for (ki, slot) in window.iter_mut().enumerate() {
            let src_idx = (first + ki as isize).clamp(0, last) as usize;
            *slot = src_row[src_idx];
        }
        out.push(simd_dot_f32(
            &ArrayView1::from(window.as_slice()),
            &kernel_view,
        ));
    }

    out
}
/// Runs one pass of a separable resampling filter over `src_row`.
///
/// Output `j` is the weighted sum
/// `sum_k weights[j][k] * src_row[min(src_offsets[j] + k, src_row.len() - 1)]`.
/// When `src_offsets` has no entry for `j`, an offset of `0` is used.
///
/// Returns an empty vector when `weights` or `src_row` is empty; otherwise
/// the result has `weights.len()` elements.
///
/// On x86/x86_64 the AVX2+FMA kernel is selected at runtime when the CPU
/// supports both features; every other configuration uses the scalar
/// kernel. The two kernels accumulate in a different order (and the vector
/// one uses fused multiply-add), so results can differ in the last bits.
#[must_use]
pub fn separable_filter_pass_simd(
    src_row: &[f32],
    weights: &[Vec<f32>],
    src_offsets: &[usize],
) -> Vec<f32> {
    if weights.is_empty() || src_row.is_empty() {
        return Vec::new();
    }
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        // The vector kernel is compiled with both `avx2` and `fma` enabled,
        // so both must be present; checking `avx2` alone is not sufficient.
        if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") {
            // SAFETY: the runtime checks above guarantee that every target
            // feature enabled on `separable_filter_pass_avx2` is available.
            return unsafe { separable_filter_pass_avx2(src_row, weights, src_offsets) };
        }
    }
    separable_filter_pass_scalar(src_row, weights, src_offsets)
}
/// Portable kernel for one separable-filter pass.
///
/// For every weight row `j`, taps are read from `src_row` starting at
/// `src_offsets[j]` (or `0` when that entry is missing). Tap indices past
/// the end of the row are clamped to the last sample. Products are
/// accumulated left to right in `f32`.
///
/// # Panics
///
/// Panics if `src_row` is empty while some weight row is non-empty.
fn separable_filter_pass_scalar(
    src_row: &[f32],
    weights: &[Vec<f32>],
    src_offsets: &[usize],
) -> Vec<f32> {
    // Highest valid index; taps beyond it re-read the final sample.
    let last = src_row.len().saturating_sub(1);

    weights
        .iter()
        .enumerate()
        .map(|(j, taps)| {
            let start = src_offsets.get(j).copied().unwrap_or(0);
            taps.iter()
                .enumerate()
                .fold(0.0f32, |acc, (k, &w)| acc + src_row[(start + k).min(last)] * w)
        })
        .collect()
}
/// AVX2 + FMA kernel for one separable-filter pass.
///
/// Computes the same clamped weighted sums as the scalar kernel, but
/// processes each weight row eight taps at a time: the eight source samples
/// are gathered (with end-of-row clamping) into a stack buffer, multiplied
/// by the weights and accumulated with a fused multiply-add. The eight
/// partial sums are then reduced horizontally and any remaining taps are
/// added one by one.
///
/// # Safety
///
/// The caller must ensure the running CPU supports the `avx2` and `fma`
/// target features.
///
/// # Panics
///
/// Panics if `src_row` is empty while some weight row is non-empty.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2,fma")]
unsafe fn separable_filter_pass_avx2(
    src_row: &[f32],
    weights: &[Vec<f32>],
    src_offsets: &[usize],
) -> Vec<f32> {
    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    const LANES: usize = 8;

    // Highest valid index; taps beyond it re-read the final sample.
    let last = src_row.len().saturating_sub(1);
    let mut out = Vec::with_capacity(weights.len());

    for (j, taps) in weights.iter().enumerate() {
        let start = src_offsets.get(j).copied().unwrap_or(0);

        let blocks = taps.chunks_exact(LANES);
        let tail = blocks.remainder();
        let tail_start = taps.len() - tail.len();

        let mut acc = _mm256_setzero_ps();
        for (b, w_block) in blocks.enumerate() {
            let first = start + b * LANES;
            // Gather through a stack buffer so clamped indices stay in bounds.
            let mut gathered = [0.0f32; LANES];
            for (k, slot) in gathered.iter_mut().enumerate() {
                *slot = src_row[(first + k).min(last)];
            }
            let w8 = _mm256_loadu_ps(w_block.as_ptr());
            let s8 = _mm256_loadu_ps(gathered.as_ptr());
            acc = _mm256_fmadd_ps(w8, s8, acc);
        }

        // Horizontal reduction: 8 lanes -> 4 -> 2 -> 1.
        let lo = _mm256_castps256_ps128(acc);
        let hi = _mm256_extractf128_ps(acc, 1);
        let sum4 = _mm_add_ps(lo, hi);
        let sum2 = _mm_add_ps(sum4, _mm_movehl_ps(sum4, sum4));
        let sum1 = _mm_add_ss(sum2, _mm_shuffle_ps(sum2, sum2, 1));
        let mut total = _mm_cvtss_f32(sum1);

        // Taps that did not fill a whole 8-lane block.
        for (k, &w) in tail.iter().enumerate() {
            total += src_row[(start + tail_start + k).min(last)] * w;
        }

        out.push(total);
    }

    out
}
// Unit tests for the bilinear resize, horizontal convolution and
// separable-filter routines defined in this file.
#[cfg(test)]
mod tests {
use super::*;
// A uniform 4x4 image resized to the same size must keep its length and
// stay within 1 of the original value in every byte.
#[test]
fn test_bilinear_resize_identity() {
let src = vec![100u8; 4 * 4 * 3];
let dst = bilinear_resize_simd(&src, 4, 4, 4, 4);
assert_eq!(dst.len(), 4 * 4 * 3);
for &v in &dst {
assert!((v as i32 - 100).abs() <= 1, "got {v}");
}
}
// Upscaling a uniform 4x4 image to 16x16 must produce the requested size
// and stay within 1 of the original value.
#[test]
fn test_bilinear_resize_upscale() {
let src = vec![128u8; 4 * 4 * 3];
let dst = bilinear_resize_simd(&src, 4, 4, 16, 16);
assert_eq!(dst.len(), 16 * 16 * 3);
for &v in &dst {
assert!((v as i32 - 128).abs() <= 1, "got {v}");
}
}
// Downscaling a uniform 8x8 image to 4x4 must produce the requested size
// and stay within 2 of the original value.
#[test]
fn test_bilinear_resize_downscale() {
let src = vec![200u8; 8 * 8 * 3];
let dst = bilinear_resize_simd(&src, 8, 8, 4, 4);
assert_eq!(dst.len(), 4 * 4 * 3);
for &v in &dst {
assert!((v as i32 - 200).abs() <= 2, "got {v}");
}
}
// A zero source width or a zero destination width yields an empty buffer.
#[test]
fn test_bilinear_resize_zero_dim() {
assert!(bilinear_resize_simd(&[], 0, 4, 4, 4).is_empty());
assert!(bilinear_resize_simd(&[0; 48], 4, 4, 0, 4).is_empty());
}
// Compares the resize of a patterned 8x8 image to 12x12 against an inline
// scalar reference that uses the same sample positions but the
// `a + t * (b - a)` form of interpolation; bytes may differ by at most 1.
#[test]
fn test_bilinear_simd_matches_scalar() {
let src: Vec<u8> = (0..64 * 3).map(|i| (i * 7 % 256) as u8).collect();
let src_w = 8;
let src_h = 8;
let dst_w = 12;
let dst_h = 12;
let result = bilinear_resize_simd(&src, src_w, src_h, dst_w, dst_h);
// Scalar reference implementation.
let scale_x = src_w as f64 / dst_w as f64;
let scale_y = src_h as f64 / dst_h as f64;
let mut scalar_result = vec![0u8; dst_w * dst_h * 3];
for dy in 0..dst_h {
let sy_f = (dy as f64 + 0.5) * scale_y - 0.5;
let sy0 = sy_f.floor().max(0.0) as usize;
let sy1 = (sy0 + 1).min(src_h - 1);
let fy = (sy_f - sy0 as f64).max(0.0) as f32;
let row0 = sy0 * src_w * 3;
let row1 = sy1 * src_w * 3;
for dx in 0..dst_w {
let sx_f = (dx as f64 + 0.5) * scale_x - 0.5;
let sx0 = sx_f.floor().max(0.0) as usize;
let sx1 = (sx0 + 1).min(src_w - 1);
let fx = (sx_f - sx0 as f64).max(0.0) as f32;
let base_dst = dy * dst_w * 3 + dx * 3;
for c in 0..3 {
let p00 = src[row0 + sx0 * 3 + c] as f32;
let p10 = src[row0 + sx1 * 3 + c] as f32;
let p01 = src[row1 + sx0 * 3 + c] as f32;
let p11 = src[row1 + sx1 * 3 + c] as f32;
let top = p00 + fx * (p10 - p00);
let bot = p01 + fx * (p11 - p01);
let val = top + fy * (bot - top);
scalar_result[base_dst + c] = val.round().clamp(0.0, 255.0) as u8;
}
}
}
// The two formulations round differently, hence the tolerance of 1.
for (i, (&s, &r)) in result.iter().zip(scalar_result.iter()).enumerate() {
assert!(
(s as i32 - r as i32).abs() <= 1,
"mismatch at byte {i}: simd={s}, scalar={r}"
);
}
}
// The kernel [0, 1, 0] must reproduce the input row.
#[test]
fn test_horizontal_convolve_identity() {
let src: Vec<f32> = (0..16).map(|i| i as f32).collect();
let kernel = vec![0.0, 1.0, 0.0];
let result = horizontal_convolve_simd(&src, &kernel);
assert_eq!(result.len(), 16);
for (i, (&got, &want)) in result.iter().zip(src.iter()).enumerate() {
assert!(
(got - want).abs() < 1e-5,
"mismatch at {i}: got {got}, want {want}"
);
}
}
// A 3-tap box filter over a linear ramp: only the centre sample is checked
// (mean of 3, 6 and 9), so edge handling is not covered here.
#[test]
fn test_horizontal_convolve_box3() {
let src = vec![0.0, 3.0, 6.0, 9.0, 12.0];
let kernel = vec![1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0];
let result = horizontal_convolve_simd(&src, &kernel);
assert_eq!(result.len(), 5);
assert!((result[2] - 6.0).abs() < 1e-4);
}
// Compares a 5-tap convolution over 33 samples against an inline scalar
// loop that clamps out-of-range indices to the row ends.
#[test]
fn test_horizontal_convolve_simd_matches_scalar() {
let src: Vec<f32> = (0..33).map(|i| (i * 7 % 100) as f32 * 0.1).collect();
let kernel = vec![0.1, 0.2, 0.4, 0.2, 0.1];
let simd_result = horizontal_convolve_simd(&src, &kernel);
// Scalar reference implementation.
let len = src.len();
let klen = kernel.len();
let half = klen / 2;
let mut scalar_result = vec![0.0f32; len];
for i in 0..len {
let mut acc = 0.0f32;
for (ki, &kw) in kernel.iter().enumerate() {
let src_idx = (i as isize + ki as isize - half as isize)
.clamp(0, (len - 1) as isize) as usize;
acc += src[src_idx] * kw;
}
scalar_result[i] = acc;
}
for (i, (&s, &r)) in simd_result.iter().zip(scalar_result.iter()).enumerate() {
assert!(
(s - r).abs() < 1e-4,
"mismatch at {i}: simd={s}, scalar={r}"
);
}
}
// An empty row or an empty kernel yields an empty result.
#[test]
fn test_horizontal_convolve_empty() {
assert!(horizontal_convolve_simd(&[], &[1.0]).is_empty());
assert!(horizontal_convolve_simd(&[1.0], &[]).is_empty());
}
// One tap of weight 1.0 per output, with offsets 0..8, must copy the input.
#[test]
fn test_separable_filter_pass_identity() {
let src: Vec<f32> = (0..8).map(|i| i as f32).collect();
let weights: Vec<Vec<f32>> = (0..8).map(|_| vec![1.0]).collect();
let offsets: Vec<usize> = (0..8).collect();
let out = separable_filter_pass_simd(&src, &weights, &offsets);
assert_eq!(out.len(), 8);
for (i, (&got, &want)) in out.iter().zip(src.iter()).enumerate() {
assert!(
(got - want).abs() < 1e-5,
"mismatch at {i}: {got} vs {want}"
);
}
}
// Two taps of 0.5 average each pair of adjacent samples.
#[test]
fn test_separable_filter_pass_average2() {
let src = vec![0.0f32, 2.0, 4.0, 6.0, 8.0];
let weights: Vec<Vec<f32>> = (0..4).map(|_| vec![0.5, 0.5]).collect();
let offsets: Vec<usize> = (0..4).collect();
let out = separable_filter_pass_simd(&src, &weights, &offsets);
assert_eq!(out.len(), 4);
let expected = [1.0f32, 3.0, 5.0, 7.0];
for (i, (&got, &want)) in out.iter().zip(expected.iter()).enumerate() {
assert!(
(got - want).abs() < 1e-5,
"mismatch at {i}: {got} vs {want}"
);
}
}
// Empty inputs yield an empty result.
#[test]
fn test_separable_filter_pass_empty() {
let out = separable_filter_pass_simd(&[], &[], &[]);
assert!(out.is_empty());
}
// Compares the scalar kernel with the dispatching entry point using 24
// outputs of 8 taps each, so the vector kernel's full-chunk path is used
// when it is selected. On hardware where the dispatcher falls back to the
// scalar kernel, this compares the scalar kernel with itself.
#[test]
fn test_separable_filter_pass_scalar_matches_avx2() {
let src: Vec<f32> = (0..32).map(|i| (i * 3 % 17) as f32 * 0.5).collect();
// Bell-shaped 8-tap weights whose width grows with the output index.
let weights: Vec<Vec<f32>> = (0..24)
.map(|j| {
let center = j as f32;
(0..8)
.map(|k| {
let x = (k as f32 - 3.5).abs();
(-(x * x) / (2.0 * center.max(1.0) * 0.5)).exp()
})
.collect()
})
.collect();
// Normalise each row to sum to 1 unless its sum is effectively zero.
let weights: Vec<Vec<f32>> = weights
.into_iter()
.map(|w| {
let sum: f32 = w.iter().sum();
if sum > 1e-6 {
w.iter().map(|&v| v / sum).collect()
} else {
w
}
})
.collect();
let offsets: Vec<usize> = (0..24).collect();
let scalar_out = separable_filter_pass_scalar(&src, &weights, &offsets);
let simd_out = separable_filter_pass_simd(&src, &weights, &offsets);
assert_eq!(scalar_out.len(), simd_out.len());
for (i, (&s, &v)) in scalar_out.iter().zip(simd_out.iter()).enumerate() {
assert!(
(s - v).abs() < 1e-4,
"scalar/simd mismatch at {i}: scalar={s} simd={v}"
);
}
}
}