use crate::image::{BufferPool, ImageF};
#[must_use]
pub fn compute_separable5_weights(sigma: f32) -> [f32; 3] {
let mut buf = [0.0f32; MAX_KERNEL_SIZE];
let kernel = compute_kernel_stack(sigma, &mut buf);
assert_eq!(kernel.len(), 5, "Separable5 requires kernel size 5");
let sum: f32 = kernel.iter().sum();
let scale = 1.0 / sum;
[
kernel[2] * scale, kernel[1] * scale, kernel[0] * scale, ]
}
#[must_use]
pub fn compute_kernel(sigma: f32) -> Vec<f32> {
let mut buf = [0.0f32; MAX_KERNEL_SIZE];
let slice = compute_kernel_stack(sigma, &mut buf);
slice.to_vec()
}
#[inline]
fn compute_kernel_stack(sigma: f32, buf: &mut [f32; MAX_KERNEL_SIZE]) -> &[f32] {
const M: f32 = 2.25;
let scaler = -1.0 / (2.0 * sigma * sigma);
let diff = (M * sigma.abs()).max(1.0) as i32;
let size = (2 * diff + 1) as usize;
debug_assert!(size <= MAX_KERNEL_SIZE);
for i in -diff..=diff {
let weight = (scaler * (i * i) as f32).exp();
buf[(i + diff) as usize] = weight;
}
&buf[..size]
}
#[allow(clippy::inline_always)]
#[inline(always)]
fn convolve_horizontal_borders(
input: &ImageF,
kernel: &[f32],
scaled_kernel: &[f32],
border_ratio: f32,
output: &mut ImageF,
include_interior: bool,
) {
let width = input.width();
let height = input.height();
let half = kernel.len() / 2;
let weight_no_border: f32 = kernel.iter().sum();
let border1 = half.min(width);
let border2 = if width > half { width - half } else { 0 };
for y in 0..height {
let row_in = input.row(y);
let row_out = output.row_mut(y);
for x in 0..border1 {
let minx = x.saturating_sub(half);
let maxx = (x + half).min(width - 1);
let k_start = minx + half - x;
let k_end = maxx + half - x + 1;
let ks = &kernel[k_start..k_end];
let weight: f32 = ks.iter().sum();
let effective = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
let scale = 1.0 / effective;
let sum: f32 = row_in[minx..minx + ks.len()]
.iter()
.zip(ks)
.map(|(&r, &k)| r * k)
.sum::<f32>()
* scale;
row_out[x] = sum;
}
if include_interior {
for x in border1..border2 {
let d = x - half;
let base = &row_in[d..d + scaled_kernel.len()];
let sum: f32 = base.iter().zip(scaled_kernel).map(|(&r, &k)| r * k).sum();
row_out[x] = sum;
}
}
for x in border2..width {
let minx = x.saturating_sub(half);
let maxx = (x + half).min(width - 1);
let k_start = minx + half - x;
let k_end = maxx + half - x + 1;
let ks = &kernel[k_start..k_end];
let weight: f32 = ks.iter().sum();
let effective = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
let scale = 1.0 / effective;
let sum: f32 = row_in[minx..minx + ks.len()]
.iter()
.zip(ks)
.map(|(&r, &k)| r * k)
.sum::<f32>()
* scale;
row_out[x] = sum;
}
}
}
#[cfg(target_arch = "x86_64")]
#[archmage::rite]
fn convolve_horizontal_interior_v3(
token: archmage::X64V3Token,
input: &ImageF,
scaled_kernel: &[f32],
border1: usize,
border2: usize,
half: usize,
output: &mut ImageF,
) {
use magetypes::simd::f32x8;
let height = input.height();
let kernel_len = scaled_kernel.len();
let simd_chunks = (border2 - border1) / 8;
let simd_end = border1 + simd_chunks * 8;
for y in 0..height {
let row_in = input.row(y);
let row_out = output.row_mut(y);
for chunk_idx in 0..simd_chunks {
let x = border1 + chunk_idx * 8;
let d = x - half;
let base = &row_in[d..d + kernel_len + 7];
let mut sum = f32x8::zero(token);
for (j, &k) in scaled_kernel.iter().enumerate() {
let loaded = f32x8::from_slice(token, &base[j..]);
sum = loaded.mul_add(f32x8::splat(token, k), sum);
}
let results = sum.to_array();
row_out[x..x + 8].copy_from_slice(&results);
}
for x in simd_end..border2 {
let d = x - half;
let base = &row_in[d..d + kernel_len];
let sum: f32 = base
.iter()
.zip(scaled_kernel)
.fold(0.0f32, |acc, (&r, &k)| r.mul_add(k, acc));
row_out[x] = sum;
}
}
}
#[cfg(target_arch = "x86_64")]
#[archmage::rite]
fn convolve_horizontal_interior_v4(
token: archmage::X64V4Token,
input: &ImageF,
scaled_kernel: &[f32],
border1: usize,
border2: usize,
half: usize,
output: &mut ImageF,
) {
use magetypes::simd::v4::f32x16;
let height = input.height();
let kernel_len = scaled_kernel.len();
let simd_chunks = (border2 - border1) / 16;
let simd_end = border1 + simd_chunks * 16;
for y in 0..height {
let row_in = input.row(y);
let row_out = output.row_mut(y);
for chunk_idx in 0..simd_chunks {
let x = border1 + chunk_idx * 16;
let d = x - half;
let base = &row_in[d..d + kernel_len + 15];
let mut sum = f32x16::zero(token);
for (j, &k) in scaled_kernel.iter().enumerate() {
let loaded = f32x16::from_slice(token, &base[j..]);
sum = loaded.mul_add(f32x16::splat(token, k), sum);
}
let results = sum.to_array();
row_out[x..x + 16].copy_from_slice(&results);
}
for x in simd_end..border2 {
let d = x - half;
let base = &row_in[d..d + kernel_len];
let sum: f32 = base
.iter()
.zip(scaled_kernel)
.fold(0.0f32, |acc, (&r, &k)| r.mul_add(k, acc));
row_out[x] = sum;
}
}
}
#[cfg(target_arch = "aarch64")]
#[archmage::rite]
fn convolve_horizontal_interior_neon(
token: archmage::NeonToken,
input: &ImageF,
scaled_kernel: &[f32],
border1: usize,
border2: usize,
half: usize,
output: &mut ImageF,
) {
use magetypes::simd::f32x8;
let height = input.height();
let kernel_len = scaled_kernel.len();
let simd_chunks = (border2 - border1) / 8;
let simd_end = border1 + simd_chunks * 8;
for y in 0..height {
let row_in = input.row(y);
let row_out = output.row_mut(y);
for chunk_idx in 0..simd_chunks {
let x = border1 + chunk_idx * 8;
let d = x - half;
let base = &row_in[d..d + kernel_len + 7];
let mut sum = f32x8::zero(token);
for (j, &k) in scaled_kernel.iter().enumerate() {
let loaded = f32x8::load(token, (&base[j..j + 8]).try_into().unwrap());
sum = loaded.mul_add(f32x8::splat(token, k), sum);
}
sum.store((&mut row_out[x..x + 8]).try_into().unwrap());
}
for x in simd_end..border2 {
let d = x - half;
let base = &row_in[d..d + kernel_len];
let sum: f32 = base
.iter()
.zip(scaled_kernel)
.fold(0.0f32, |acc, (&r, &k)| r.mul_add(k, acc));
row_out[x] = sum;
}
}
}
#[cfg(target_arch = "wasm32")]
#[archmage::rite]
fn convolve_horizontal_interior_wasm128(
token: archmage::Wasm128Token,
input: &ImageF,
scaled_kernel: &[f32],
border1: usize,
border2: usize,
half: usize,
output: &mut ImageF,
) {
use magetypes::simd::f32x8;
let height = input.height();
let kernel_len = scaled_kernel.len();
let simd_chunks = (border2 - border1) / 8;
let simd_end = border1 + simd_chunks * 8;
for y in 0..height {
let row_in = input.row(y);
let row_out = output.row_mut(y);
for chunk_idx in 0..simd_chunks {
let x = border1 + chunk_idx * 8;
let d = x - half;
let base = &row_in[d..d + kernel_len + 7];
let mut sum = f32x8::zero(token);
for (j, &k) in scaled_kernel.iter().enumerate() {
let loaded = f32x8::load(token, (&base[j..j + 8]).try_into().unwrap());
sum = loaded.mul_add(f32x8::splat(token, k), sum);
}
sum.store((&mut row_out[x..x + 8]).try_into().unwrap());
}
for x in simd_end..border2 {
let d = x - half;
let base = &row_in[d..d + kernel_len];
let sum: f32 = base
.iter()
.zip(scaled_kernel)
.fold(0.0f32, |acc, (&r, &k)| r.mul_add(k, acc));
row_out[x] = sum;
}
}
}
#[cfg(target_arch = "x86_64")]
#[archmage::rite]
fn convolve_vertical_v3(
token: archmage::X64V3Token,
input: &ImageF,
kernel: &[f32],
scaled_kernel: &[f32],
border_ratio: f32,
output: &mut ImageF,
) {
use magetypes::simd::f32x8;
let width = input.width();
let height = input.height();
let half = kernel.len() / 2;
let weight_no_border: f32 = kernel.iter().sum();
let border_top = half.min(height);
let border_bottom = if height > half { height - half } else { 0 };
let simd_width = (width / 8) * 8;
for y in 0..border_top {
let miny = y.saturating_sub(half);
let maxy = (y + half).min(height - 1);
let k_start = miny + half - y;
let k_end = maxy + half - y + 1;
let ks = &kernel[k_start..k_end];
let weight: f32 = ks.iter().sum();
let effective = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
let scale = 1.0 / effective;
let row_out = output.row_mut(y);
let mut x = 0;
while x + 8 <= simd_width {
let mut sum = f32x8::zero(token);
for (ki, &kw) in ks.iter().enumerate() {
let src_y = miny + ki;
let row_in = input.row(src_y);
let loaded = f32x8::load(token, (&row_in[x..x + 8]).try_into().unwrap());
sum = loaded.mul_add(f32x8::splat(token, kw * scale), sum);
}
sum.store((&mut row_out[x..x + 8]).try_into().unwrap());
x += 8;
}
while x < width {
let mut sum = 0.0f32;
for (ki, &kw) in ks.iter().enumerate() {
sum += input.row(miny + ki)[x] * kw;
}
row_out[x] = sum * scale;
x += 1;
}
}
let in_data = input.data();
let in_stride = input.stride();
for y in border_top..border_bottom {
let start_y = y - half;
let row_out = output.row_mut(y);
let mut x = 0;
while x + 8 <= simd_width {
let base0 = start_y * in_stride + x;
let mut sum = f32x8::load(token, (&in_data[base0..base0 + 8]).try_into().unwrap())
* f32x8::splat(token, scaled_kernel[0]);
for (ki, &kw) in scaled_kernel.iter().enumerate().skip(1) {
let base = base0 + ki * in_stride;
let loaded = f32x8::load(token, (&in_data[base..base + 8]).try_into().unwrap());
sum = loaded.mul_add(f32x8::splat(token, kw), sum);
}
sum.store((&mut row_out[x..x + 8]).try_into().unwrap());
x += 8;
}
for x in simd_width..width {
let mut sum = in_data[start_y * in_stride + x] * scaled_kernel[0];
for (ki, &kw) in scaled_kernel.iter().enumerate().skip(1) {
sum = in_data[(start_y + ki) * in_stride + x].mul_add(kw, sum);
}
row_out[x] = sum;
}
}
for y in border_bottom..height {
let miny = y.saturating_sub(half);
let maxy = (y + half).min(height - 1);
let k_start = miny + half - y;
let k_end = maxy + half - y + 1;
let ks = &kernel[k_start..k_end];
let weight: f32 = ks.iter().sum();
let effective = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
let scale = 1.0 / effective;
let row_out = output.row_mut(y);
let mut x = 0;
while x + 8 <= simd_width {
let mut sum = f32x8::zero(token);
for (ki, &kw) in ks.iter().enumerate() {
let src_y = miny + ki;
let row_in = input.row(src_y);
let loaded = f32x8::load(token, (&row_in[x..x + 8]).try_into().unwrap());
sum = loaded.mul_add(f32x8::splat(token, kw * scale), sum);
}
sum.store((&mut row_out[x..x + 8]).try_into().unwrap());
x += 8;
}
while x < width {
let mut sum = 0.0f32;
for (ki, &kw) in ks.iter().enumerate() {
sum += input.row(miny + ki)[x] * kw;
}
row_out[x] = sum * scale;
x += 1;
}
}
}
#[cfg(target_arch = "x86_64")]
#[archmage::rite]
fn convolve_vertical_v4(
token: archmage::X64V4Token,
input: &ImageF,
kernel: &[f32],
scaled_kernel: &[f32],
border_ratio: f32,
output: &mut ImageF,
) {
use magetypes::simd::v4::f32x16;
let width = input.width();
let height = input.height();
let half = kernel.len() / 2;
let weight_no_border: f32 = kernel.iter().sum();
let border_top = half.min(height);
let border_bottom = if height > half { height - half } else { 0 };
let simd_width = (width / 16) * 16;
let process_border_row =
|y: usize, miny: usize, ks: &[f32], scale: f32, row_out: &mut [f32]| {
let mut x = 0;
while x + 16 <= simd_width {
let mut sum = f32x16::zero(token);
for (ki, &kw) in ks.iter().enumerate() {
let row_in = input.row(miny + ki);
let loaded = f32x16::from_slice(token, &row_in[x..]);
sum = loaded.mul_add(f32x16::splat(token, kw * scale), sum);
}
let results = sum.to_array();
row_out[x..x + 16].copy_from_slice(&results);
x += 16;
}
while x < width {
let mut sum = 0.0f32;
for (ki, &kw) in ks.iter().enumerate() {
sum += input.row(miny + ki)[x] * kw;
}
row_out[x] = sum * scale;
x += 1;
}
let _ = y; };
for y in 0..border_top {
let miny = y.saturating_sub(half);
let maxy = (y + half).min(height - 1);
let k_start = miny + half - y;
let k_end = maxy + half - y + 1;
let ks = &kernel[k_start..k_end];
let weight: f32 = ks.iter().sum();
let effective = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
let scale = 1.0 / effective;
let row_out = output.row_mut(y);
process_border_row(y, miny, ks, scale, row_out);
}
let simd16_width = (width / 16) * 16;
let in_data = input.data();
let in_stride = input.stride();
for y in border_top..border_bottom {
let start_y = y - half;
let row_out = output.row_mut(y);
let mut x = 0;
while x + 16 <= simd16_width {
let base0 = start_y * in_stride + x;
let mut sum = f32x16::from_slice(token, &in_data[base0..])
* f32x16::splat(token, scaled_kernel[0]);
for (ki, &kw) in scaled_kernel.iter().enumerate().skip(1) {
let base = base0 + ki * in_stride;
let loaded = f32x16::from_slice(token, &in_data[base..]);
sum = loaded.mul_add(f32x16::splat(token, kw), sum);
}
row_out[x..x + 16].copy_from_slice(&sum.to_array());
x += 16;
}
for x in simd16_width..width {
let mut sum = in_data[start_y * in_stride + x] * scaled_kernel[0];
for (ki, &kw) in scaled_kernel.iter().enumerate().skip(1) {
sum = in_data[(start_y + ki) * in_stride + x].mul_add(kw, sum);
}
row_out[x] = sum;
}
}
for y in border_bottom..height {
let miny = y.saturating_sub(half);
let maxy = (y + half).min(height - 1);
let k_start = miny + half - y;
let k_end = maxy + half - y + 1;
let ks = &kernel[k_start..k_end];
let weight: f32 = ks.iter().sum();
let effective = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
let scale = 1.0 / effective;
let row_out = output.row_mut(y);
process_border_row(y, miny, ks, scale, row_out);
}
}
#[cfg(target_arch = "aarch64")]
#[archmage::rite]
fn convolve_vertical_neon(
token: archmage::NeonToken,
input: &ImageF,
kernel: &[f32],
scaled_kernel: &[f32],
border_ratio: f32,
output: &mut ImageF,
) {
use magetypes::simd::f32x8;
let width = input.width();
let height = input.height();
let half = kernel.len() / 2;
let weight_no_border: f32 = kernel.iter().sum();
let border_top = half.min(height);
let border_bottom = if height > half { height - half } else { 0 };
let simd_width = (width / 8) * 8;
let process_border_row = |miny: usize, ks: &[f32], scale: f32, row_out: &mut [f32]| {
let mut x = 0;
while x + 8 <= simd_width {
let mut sum = f32x8::zero(token);
for (ki, &kw) in ks.iter().enumerate() {
let row_in = input.row(miny + ki);
let loaded = f32x8::load(token, (&row_in[x..x + 8]).try_into().unwrap());
sum = loaded.mul_add(f32x8::splat(token, kw * scale), sum);
}
sum.store((&mut row_out[x..x + 8]).try_into().unwrap());
x += 8;
}
while x < width {
let mut sum = 0.0f32;
for (ki, &kw) in ks.iter().enumerate() {
sum += input.row(miny + ki)[x] * kw;
}
row_out[x] = sum * scale;
x += 1;
}
};
for y in 0..border_top {
let miny = y.saturating_sub(half);
let maxy = (y + half).min(height - 1);
let k_start = miny + half - y;
let k_end = maxy + half - y + 1;
let ks = &kernel[k_start..k_end];
let weight: f32 = ks.iter().sum();
let effective = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
let scale = 1.0 / effective;
process_border_row(miny, ks, scale, output.row_mut(y));
}
let in_data = input.data();
let in_stride = input.stride();
for y in border_top..border_bottom {
let start_y = y - half;
let row_out = output.row_mut(y);
let mut x = 0;
while x + 8 <= simd_width {
let base0 = start_y * in_stride + x;
let mut sum = f32x8::load(token, (&in_data[base0..base0 + 8]).try_into().unwrap())
* f32x8::splat(token, scaled_kernel[0]);
for (ki, &kw) in scaled_kernel.iter().enumerate().skip(1) {
let base = base0 + ki * in_stride;
let loaded = f32x8::load(token, (&in_data[base..base + 8]).try_into().unwrap());
sum = loaded.mul_add(f32x8::splat(token, kw), sum);
}
sum.store((&mut row_out[x..x + 8]).try_into().unwrap());
x += 8;
}
for x in simd_width..width {
let mut sum = in_data[start_y * in_stride + x] * scaled_kernel[0];
for (ki, &kw) in scaled_kernel.iter().enumerate().skip(1) {
sum = in_data[(start_y + ki) * in_stride + x].mul_add(kw, sum);
}
row_out[x] = sum;
}
}
for y in border_bottom..height {
let miny = y.saturating_sub(half);
let maxy = (y + half).min(height - 1);
let k_start = miny + half - y;
let k_end = maxy + half - y + 1;
let ks = &kernel[k_start..k_end];
let weight: f32 = ks.iter().sum();
let effective = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
let scale = 1.0 / effective;
process_border_row(miny, ks, scale, output.row_mut(y));
}
}
#[cfg(target_arch = "wasm32")]
#[archmage::rite]
fn convolve_vertical_wasm128(
token: archmage::Wasm128Token,
input: &ImageF,
kernel: &[f32],
scaled_kernel: &[f32],
border_ratio: f32,
output: &mut ImageF,
) {
use magetypes::simd::f32x8;
let width = input.width();
let height = input.height();
let half = kernel.len() / 2;
let weight_no_border: f32 = kernel.iter().sum();
let border_top = half.min(height);
let border_bottom = if height > half { height - half } else { 0 };
let simd_width = (width / 8) * 8;
let process_border_row = |miny: usize, ks: &[f32], scale: f32, row_out: &mut [f32]| {
let mut x = 0;
while x + 8 <= simd_width {
let mut sum = f32x8::zero(token);
for (ki, &kw) in ks.iter().enumerate() {
let row_in = input.row(miny + ki);
let loaded = f32x8::load(token, (&row_in[x..x + 8]).try_into().unwrap());
sum = loaded.mul_add(f32x8::splat(token, kw * scale), sum);
}
sum.store((&mut row_out[x..x + 8]).try_into().unwrap());
x += 8;
}
while x < width {
let mut sum = 0.0f32;
for (ki, &kw) in ks.iter().enumerate() {
sum += input.row(miny + ki)[x] * kw;
}
row_out[x] = sum * scale;
x += 1;
}
};
for y in 0..border_top {
let miny = y.saturating_sub(half);
let maxy = (y + half).min(height - 1);
let k_start = miny + half - y;
let k_end = maxy + half - y + 1;
let ks = &kernel[k_start..k_end];
let weight: f32 = ks.iter().sum();
let effective = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
let scale = 1.0 / effective;
process_border_row(miny, ks, scale, output.row_mut(y));
}
let in_data = input.data();
let in_stride = input.stride();
for y in border_top..border_bottom {
let start_y = y - half;
let row_out = output.row_mut(y);
let mut x = 0;
while x + 8 <= simd_width {
let base0 = start_y * in_stride + x;
let mut sum = f32x8::load(token, (&in_data[base0..base0 + 8]).try_into().unwrap())
* f32x8::splat(token, scaled_kernel[0]);
for (ki, &kw) in scaled_kernel.iter().enumerate().skip(1) {
let base = base0 + ki * in_stride;
let loaded = f32x8::load(token, (&in_data[base..base + 8]).try_into().unwrap());
sum = loaded.mul_add(f32x8::splat(token, kw), sum);
}
sum.store((&mut row_out[x..x + 8]).try_into().unwrap());
x += 8;
}
for x in simd_width..width {
let mut sum = in_data[start_y * in_stride + x] * scaled_kernel[0];
for (ki, &kw) in scaled_kernel.iter().enumerate().skip(1) {
sum = in_data[(start_y + ki) * in_stride + x].mul_add(kw, sum);
}
row_out[x] = sum;
}
}
for y in border_bottom..height {
let miny = y.saturating_sub(half);
let maxy = (y + half).min(height - 1);
let k_start = miny + half - y;
let k_end = maxy + half - y + 1;
let ks = &kernel[k_start..k_end];
let weight: f32 = ks.iter().sum();
let effective = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
let scale = 1.0 / effective;
process_border_row(miny, ks, scale, output.row_mut(y));
}
}
fn convolve_vertical_scalar(
input: &ImageF,
kernel: &[f32],
scaled_kernel: &[f32],
border_ratio: f32,
output: &mut ImageF,
) {
let width = input.width();
let height = input.height();
let half = kernel.len() / 2;
let weight_no_border: f32 = kernel.iter().sum();
let border_top = half.min(height);
let border_bottom = if height > half { height - half } else { 0 };
let process_border_row = |miny: usize, ks: &[f32], scale: f32, row_out: &mut [f32]| {
for x in 0..width {
let mut sum = 0.0f32;
for (ki, &kw) in ks.iter().enumerate() {
sum += input.row(miny + ki)[x] * kw;
}
row_out[x] = sum * scale;
}
};
for y in 0..border_top {
let miny = y.saturating_sub(half);
let maxy = (y + half).min(height - 1);
let k_start = miny + half - y;
let k_end = maxy + half - y + 1;
let ks = &kernel[k_start..k_end];
let weight: f32 = ks.iter().sum();
let effective = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
let scale = 1.0 / effective;
process_border_row(miny, ks, scale, output.row_mut(y));
}
let in_data = input.data();
let in_stride = input.stride();
for y in border_top..border_bottom {
let start_y = y - half;
let row_out = output.row_mut(y);
for x in 0..width {
let mut sum = in_data[start_y * in_stride + x] * scaled_kernel[0];
for (ki, &kw) in scaled_kernel.iter().enumerate().skip(1) {
sum = in_data[(start_y + ki) * in_stride + x].mul_add(kw, sum);
}
row_out[x] = sum;
}
}
for y in border_bottom..height {
let miny = y.saturating_sub(half);
let maxy = (y + half).min(height - 1);
let k_start = miny + half - y;
let k_end = maxy + half - y + 1;
let ks = &kernel[k_start..k_end];
let weight: f32 = ks.iter().sum();
let effective = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
let scale = 1.0 / effective;
process_border_row(miny, ks, scale, output.row_mut(y));
}
}
pub fn gaussian_blur(input: &ImageF, sigma: f32, pool: &BufferPool) -> ImageF {
if sigma <= 0.0 {
return input.clone();
}
#[cfg(feature = "iir-blur")]
{
return crate::blur_iir::gaussian_blur_iir(input, sigma, pool);
}
#[cfg(not(feature = "iir-blur"))]
archmage::incant!(
gaussian_blur_dispatch(input, sigma, pool),
[v4, v3, neon, wasm128]
)
}
const MAX_KERNEL_SIZE: usize = 64;
#[inline]
fn compute_scaled_kernel<'a>(kernel: &[f32], buf: &'a mut [f32; MAX_KERNEL_SIZE]) -> &'a [f32] {
debug_assert!(kernel.len() <= MAX_KERNEL_SIZE);
let weight: f32 = kernel.iter().sum();
let inv_weight = 1.0 / weight;
for (i, &k) in kernel.iter().enumerate() {
buf[i] = k * inv_weight;
}
&buf[..kernel.len()]
}
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
fn gaussian_blur_dispatch_v4(
token: archmage::X64V4Token,
input: &ImageF,
sigma: f32,
pool: &BufferPool,
) -> ImageF {
let mut kernel_buf = [0.0f32; MAX_KERNEL_SIZE];
let kernel = compute_kernel_stack(sigma, &mut kernel_buf);
let half = kernel.len() / 2;
let mut scaled_buf = [0.0f32; MAX_KERNEL_SIZE];
let scaled = compute_scaled_kernel(kernel, &mut scaled_buf);
let width = input.width();
let height = input.height();
let border1 = half.min(width);
let border2 = if width > half { width - half } else { 0 };
let mut temp = ImageF::from_pool_dirty(width, height, pool);
convolve_horizontal_borders(input, kernel, scaled, 0.0, &mut temp, false);
if border2 > border1 {
convolve_horizontal_interior_v4(token, input, scaled, border1, border2, half, &mut temp);
}
let mut output = ImageF::from_pool_dirty(width, height, pool);
convolve_vertical_v4(token, &temp, kernel, scaled, 0.0, &mut output);
temp.recycle(pool);
output
}
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
fn gaussian_blur_dispatch_v3(
token: archmage::X64V3Token,
input: &ImageF,
sigma: f32,
pool: &BufferPool,
) -> ImageF {
let mut kernel_buf = [0.0f32; MAX_KERNEL_SIZE];
let kernel = compute_kernel_stack(sigma, &mut kernel_buf);
let half = kernel.len() / 2;
let mut scaled_buf = [0.0f32; MAX_KERNEL_SIZE];
let scaled = compute_scaled_kernel(kernel, &mut scaled_buf);
let width = input.width();
let height = input.height();
let border1 = half.min(width);
let border2 = if width > half { width - half } else { 0 };
let mut temp = ImageF::from_pool_dirty(width, height, pool);
convolve_horizontal_borders(input, kernel, scaled, 0.0, &mut temp, false);
if border2 > border1 {
convolve_horizontal_interior_v3(token, input, scaled, border1, border2, half, &mut temp);
}
let mut output = ImageF::from_pool_dirty(width, height, pool);
convolve_vertical_v3(token, &temp, kernel, scaled, 0.0, &mut output);
temp.recycle(pool);
output
}
#[cfg(target_arch = "aarch64")]
#[archmage::arcane]
fn gaussian_blur_dispatch_neon(
token: archmage::NeonToken,
input: &ImageF,
sigma: f32,
pool: &BufferPool,
) -> ImageF {
let mut kernel_buf = [0.0f32; MAX_KERNEL_SIZE];
let kernel = compute_kernel_stack(sigma, &mut kernel_buf);
let half = kernel.len() / 2;
let mut scaled_buf = [0.0f32; MAX_KERNEL_SIZE];
let scaled = compute_scaled_kernel(&kernel, &mut scaled_buf);
let width = input.width();
let height = input.height();
let border1 = half.min(width);
let border2 = if width > half { width - half } else { 0 };
let mut temp = ImageF::from_pool_dirty(width, height, pool);
convolve_horizontal_borders(input, &kernel, &scaled, 0.0, &mut temp, false);
if border2 > border1 {
convolve_horizontal_interior_neon(token, input, &scaled, border1, border2, half, &mut temp);
}
let mut output = ImageF::from_pool_dirty(width, height, pool);
convolve_vertical_neon(token, &temp, &kernel, &scaled, 0.0, &mut output);
temp.recycle(pool);
output
}
#[cfg(target_arch = "wasm32")]
#[archmage::arcane]
fn gaussian_blur_dispatch_wasm128(
token: archmage::Wasm128Token,
input: &ImageF,
sigma: f32,
pool: &BufferPool,
) -> ImageF {
let mut kernel_buf = [0.0f32; MAX_KERNEL_SIZE];
let kernel = compute_kernel_stack(sigma, &mut kernel_buf);
let half = kernel.len() / 2;
let mut scaled_buf = [0.0f32; MAX_KERNEL_SIZE];
let scaled = compute_scaled_kernel(&kernel, &mut scaled_buf);
let width = input.width();
let height = input.height();
let border1 = half.min(width);
let border2 = if width > half { width - half } else { 0 };
let mut temp = ImageF::from_pool_dirty(width, height, pool);
convolve_horizontal_borders(input, &kernel, &scaled, 0.0, &mut temp, false);
if border2 > border1 {
convolve_horizontal_interior_wasm128(
token, input, &scaled, border1, border2, half, &mut temp,
);
}
let mut output = ImageF::from_pool_dirty(width, height, pool);
convolve_vertical_wasm128(token, &temp, &kernel, &scaled, 0.0, &mut output);
temp.recycle(pool);
output
}
fn gaussian_blur_dispatch_scalar(
_token: archmage::ScalarToken,
input: &ImageF,
sigma: f32,
pool: &BufferPool,
) -> ImageF {
let mut kernel_buf = [0.0f32; MAX_KERNEL_SIZE];
let kernel = compute_kernel_stack(sigma, &mut kernel_buf);
let mut scaled_buf = [0.0f32; MAX_KERNEL_SIZE];
let scaled = compute_scaled_kernel(kernel, &mut scaled_buf);
let width = input.width();
let height = input.height();
let mut temp = ImageF::from_pool_dirty(width, height, pool);
convolve_horizontal_borders(input, kernel, scaled, 0.0, &mut temp, true);
let mut output = ImageF::from_pool_dirty(width, height, pool);
convolve_vertical_scalar(&temp, kernel, scaled, 0.0, &mut output);
temp.recycle(pool);
output
}
pub fn blur_with_border(
input: &ImageF,
sigma: f32,
border_ratio: f32,
pool: &BufferPool,
) -> ImageF {
if sigma <= 0.0 {
return input.clone();
}
archmage::incant!(
blur_with_border_dispatch(input, sigma, border_ratio, pool),
[v4, v3, neon, wasm128]
)
}
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
fn blur_with_border_dispatch_v4(
token: archmage::X64V4Token,
input: &ImageF,
sigma: f32,
border_ratio: f32,
pool: &BufferPool,
) -> ImageF {
let mut kernel_buf = [0.0f32; MAX_KERNEL_SIZE];
let kernel = compute_kernel_stack(sigma, &mut kernel_buf);
let half = kernel.len() / 2;
let mut scaled_buf = [0.0f32; MAX_KERNEL_SIZE];
let scaled = compute_scaled_kernel(kernel, &mut scaled_buf);
let width = input.width();
let height = input.height();
let border1 = half.min(width);
let border2 = if width > half { width - half } else { 0 };
let mut temp = ImageF::from_pool_dirty(width, height, pool);
convolve_horizontal_borders(input, kernel, scaled, border_ratio, &mut temp, false);
if border2 > border1 {
convolve_horizontal_interior_v4(token, input, scaled, border1, border2, half, &mut temp);
}
let mut output = ImageF::from_pool_dirty(width, height, pool);
convolve_vertical_v4(token, &temp, kernel, scaled, border_ratio, &mut output);
temp.recycle(pool);
output
}
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
fn blur_with_border_dispatch_v3(
token: archmage::X64V3Token,
input: &ImageF,
sigma: f32,
border_ratio: f32,
pool: &BufferPool,
) -> ImageF {
let mut kernel_buf = [0.0f32; MAX_KERNEL_SIZE];
let kernel = compute_kernel_stack(sigma, &mut kernel_buf);
let half = kernel.len() / 2;
let mut scaled_buf = [0.0f32; MAX_KERNEL_SIZE];
let scaled = compute_scaled_kernel(kernel, &mut scaled_buf);
let width = input.width();
let height = input.height();
let border1 = half.min(width);
let border2 = if width > half { width - half } else { 0 };
let mut temp = ImageF::from_pool_dirty(width, height, pool);
convolve_horizontal_borders(input, kernel, scaled, border_ratio, &mut temp, false);
if border2 > border1 {
convolve_horizontal_interior_v3(token, input, scaled, border1, border2, half, &mut temp);
}
let mut output = ImageF::from_pool_dirty(width, height, pool);
convolve_vertical_v3(token, &temp, kernel, scaled, border_ratio, &mut output);
temp.recycle(pool);
output
}
#[cfg(target_arch = "aarch64")]
#[archmage::arcane]
fn blur_with_border_dispatch_neon(
token: archmage::NeonToken,
input: &ImageF,
sigma: f32,
border_ratio: f32,
pool: &BufferPool,
) -> ImageF {
let mut kernel_buf = [0.0f32; MAX_KERNEL_SIZE];
let kernel = compute_kernel_stack(sigma, &mut kernel_buf);
let half = kernel.len() / 2;
let mut scaled_buf = [0.0f32; MAX_KERNEL_SIZE];
let scaled = compute_scaled_kernel(&kernel, &mut scaled_buf);
let width = input.width();
let height = input.height();
let border1 = half.min(width);
let border2 = if width > half { width - half } else { 0 };
let mut temp = ImageF::from_pool_dirty(width, height, pool);
convolve_horizontal_borders(input, &kernel, &scaled, border_ratio, &mut temp, false);
if border2 > border1 {
convolve_horizontal_interior_neon(token, input, &scaled, border1, border2, half, &mut temp);
}
let mut output = ImageF::from_pool_dirty(width, height, pool);
convolve_vertical_neon(token, &temp, &kernel, &scaled, border_ratio, &mut output);
temp.recycle(pool);
output
}
#[cfg(target_arch = "wasm32")]
#[archmage::arcane]
fn blur_with_border_dispatch_wasm128(
token: archmage::Wasm128Token,
input: &ImageF,
sigma: f32,
border_ratio: f32,
pool: &BufferPool,
) -> ImageF {
let mut kernel_buf = [0.0f32; MAX_KERNEL_SIZE];
let kernel = compute_kernel_stack(sigma, &mut kernel_buf);
let half = kernel.len() / 2;
let mut scaled_buf = [0.0f32; MAX_KERNEL_SIZE];
let scaled = compute_scaled_kernel(&kernel, &mut scaled_buf);
let width = input.width();
let height = input.height();
let border1 = half.min(width);
let border2 = if width > half { width - half } else { 0 };
let mut temp = ImageF::from_pool_dirty(width, height, pool);
convolve_horizontal_borders(input, &kernel, &scaled, border_ratio, &mut temp, false);
if border2 > border1 {
convolve_horizontal_interior_wasm128(
token, input, &scaled, border1, border2, half, &mut temp,
);
}
let mut output = ImageF::from_pool_dirty(width, height, pool);
convolve_vertical_wasm128(token, &temp, &kernel, &scaled, border_ratio, &mut output);
temp.recycle(pool);
output
}
fn blur_with_border_dispatch_scalar(
_token: archmage::ScalarToken,
input: &ImageF,
sigma: f32,
border_ratio: f32,
pool: &BufferPool,
) -> ImageF {
let mut kernel_buf = [0.0f32; MAX_KERNEL_SIZE];
let kernel = compute_kernel_stack(sigma, &mut kernel_buf);
let mut scaled_buf = [0.0f32; MAX_KERNEL_SIZE];
let scaled = compute_scaled_kernel(kernel, &mut scaled_buf);
let width = input.width();
let height = input.height();
let mut temp = ImageF::from_pool_dirty(width, height, pool);
convolve_horizontal_borders(input, kernel, scaled, border_ratio, &mut temp, true);
let mut output = ImageF::from_pool_dirty(width, height, pool);
convolve_vertical_scalar(&temp, kernel, scaled, border_ratio, &mut output);
temp.recycle(pool);
output
}
pub fn gaussian_blur_inplace(image: &mut ImageF, sigma: f32, pool: &BufferPool) {
if sigma <= 0.0 {
return;
}
let blurred = gaussian_blur(image, sigma, pool);
image.copy_from(&blurred);
blurred.recycle(pool);
}
#[inline]
fn mirror(mut x: i32, size: i32) -> usize {
while x < 0 || x >= size {
if x < 0 {
x = -x - 1;
} else {
x = 2 * size - 1 - x;
}
}
x as usize
}
pub fn blur_mirrored_5x5(input: &ImageF, weights: &[f32; 3], pool: &BufferPool) -> ImageF {
archmage::incant!(
blur_mirrored_5x5(input, weights, pool),
[v4, v3, neon, wasm128]
)
}
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
fn blur_mirrored_5x5_v4(
token: archmage::X64V4Token,
input: &ImageF,
weights: &[f32; 3],
pool: &BufferPool,
) -> ImageF {
use magetypes::simd::v4::f32x16;
let width = input.width();
let height = input.height();
let w0 = weights[0];
let w1 = weights[1];
let w2 = weights[2];
let w0_v = f32x16::splat(token, w0);
let w1_v = f32x16::splat(token, w1);
let w2_v = f32x16::splat(token, w2);
let iwidth = width as i32;
let iheight = height as i32;
let mut temp = ImageF::from_pool_dirty(width, height, pool);
let border = 2.min(width);
let interior_end = if width > 4 { width - 2 } else { 0 };
for y in 0..height {
let row = input.row(y);
let out_row = temp.row_mut(y);
for x in 0..border {
let ix = x as i32;
let v_m2 = row[mirror(ix - 2, iwidth)];
let v_m1 = row[mirror(ix - 1, iwidth)];
let v_0 = row[x];
let v_p1 = row[mirror(ix + 1, iwidth)];
let v_p2 = row[mirror(ix + 2, iwidth)];
out_row[x] = v_0 * w0 + (v_m1 + v_p1) * w1 + (v_m2 + v_p2) * w2;
}
let mut x = border;
while x + 16 <= interior_end {
let v_m2 = f32x16::load(token, (&row[x - 2..x + 14]).try_into().unwrap());
let v_m1 = f32x16::load(token, (&row[x - 1..x + 15]).try_into().unwrap());
let v_0 = f32x16::load(token, (&row[x..x + 16]).try_into().unwrap());
let v_p1 = f32x16::load(token, (&row[x + 1..x + 17]).try_into().unwrap());
let v_p2 = f32x16::load(token, (&row[x + 2..x + 18]).try_into().unwrap());
let sum = v_0 * w0_v + (v_m1 + v_p1) * w1_v + (v_m2 + v_p2) * w2_v;
sum.store((&mut out_row[x..x + 16]).try_into().unwrap());
x += 16;
}
while x < interior_end {
let v_m2 = row[x - 2];
let v_m1 = row[x - 1];
let v_0 = row[x];
let v_p1 = row[x + 1];
let v_p2 = row[x + 2];
out_row[x] = v_0 * w0 + (v_m1 + v_p1) * w1 + (v_m2 + v_p2) * w2;
x += 1;
}
for x in interior_end..width {
let ix = x as i32;
let v_m2 = row[mirror(ix - 2, iwidth)];
let v_m1 = row[mirror(ix - 1, iwidth)];
let v_0 = row[x];
let v_p1 = row[mirror(ix + 1, iwidth)];
let v_p2 = row[mirror(ix + 2, iwidth)];
out_row[x] = v_0 * w0 + (v_m1 + v_p1) * w1 + (v_m2 + v_p2) * w2;
}
}
let mut output = ImageF::from_pool_dirty(width, height, pool);
let v_border = 2.min(height);
let v_interior_end = if height > 4 { height - 2 } else { 0 };
for y in 0..v_border {
let iy = y as i32;
let rm2 = temp.row(mirror(iy - 2, iheight));
let rm1 = temp.row(mirror(iy - 1, iheight));
let r0 = temp.row(y);
let rp1 = temp.row(mirror(iy + 1, iheight));
let rp2 = temp.row(mirror(iy + 2, iheight));
let out = output.row_mut(y);
let mut x = 0;
while x + 16 <= width {
let vm2 = f32x16::load(token, (&rm2[x..x + 16]).try_into().unwrap());
let vm1 = f32x16::load(token, (&rm1[x..x + 16]).try_into().unwrap());
let v0 = f32x16::load(token, (&r0[x..x + 16]).try_into().unwrap());
let vp1 = f32x16::load(token, (&rp1[x..x + 16]).try_into().unwrap());
let vp2 = f32x16::load(token, (&rp2[x..x + 16]).try_into().unwrap());
let sum = v0 * w0_v + (vm1 + vp1) * w1_v + (vm2 + vp2) * w2_v;
sum.store((&mut out[x..x + 16]).try_into().unwrap());
x += 16;
}
while x < width {
out[x] = r0[x] * w0 + (rm1[x] + rp1[x]) * w1 + (rm2[x] + rp2[x]) * w2;
x += 1;
}
}
for y in v_border..v_interior_end {
let rm2 = temp.row(y - 2);
let rm1 = temp.row(y - 1);
let r0 = temp.row(y);
let rp1 = temp.row(y + 1);
let rp2 = temp.row(y + 2);
let out = output.row_mut(y);
let mut x = 0;
while x + 16 <= width {
let vm2 = f32x16::load(token, (&rm2[x..x + 16]).try_into().unwrap());
let vm1 = f32x16::load(token, (&rm1[x..x + 16]).try_into().unwrap());
let v0 = f32x16::load(token, (&r0[x..x + 16]).try_into().unwrap());
let vp1 = f32x16::load(token, (&rp1[x..x + 16]).try_into().unwrap());
let vp2 = f32x16::load(token, (&rp2[x..x + 16]).try_into().unwrap());
let sum = v0 * w0_v + (vm1 + vp1) * w1_v + (vm2 + vp2) * w2_v;
sum.store((&mut out[x..x + 16]).try_into().unwrap());
x += 16;
}
while x < width {
out[x] = r0[x] * w0 + (rm1[x] + rp1[x]) * w1 + (rm2[x] + rp2[x]) * w2;
x += 1;
}
}
for y in v_interior_end..height {
let iy = y as i32;
let rm2 = temp.row(mirror(iy - 2, iheight));
let rm1 = temp.row(mirror(iy - 1, iheight));
let r0 = temp.row(y);
let rp1 = temp.row(mirror(iy + 1, iheight));
let rp2 = temp.row(mirror(iy + 2, iheight));
let out = output.row_mut(y);
let mut x = 0;
while x + 16 <= width {
let vm2 = f32x16::load(token, (&rm2[x..x + 16]).try_into().unwrap());
let vm1 = f32x16::load(token, (&rm1[x..x + 16]).try_into().unwrap());
let v0 = f32x16::load(token, (&r0[x..x + 16]).try_into().unwrap());
let vp1 = f32x16::load(token, (&rp1[x..x + 16]).try_into().unwrap());
let vp2 = f32x16::load(token, (&rp2[x..x + 16]).try_into().unwrap());
let sum = v0 * w0_v + (vm1 + vp1) * w1_v + (vm2 + vp2) * w2_v;
sum.store((&mut out[x..x + 16]).try_into().unwrap());
x += 16;
}
while x < width {
out[x] = r0[x] * w0 + (rm1[x] + rp1[x]) * w1 + (rm2[x] + rp2[x]) * w2;
x += 1;
}
}
temp.recycle(pool);
output
}
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
fn blur_mirrored_5x5_v3(
token: archmage::X64V3Token,
input: &ImageF,
weights: &[f32; 3],
pool: &BufferPool,
) -> ImageF {
use magetypes::simd::f32x8;
let width = input.width();
let height = input.height();
let w0 = weights[0];
let w1 = weights[1];
let w2 = weights[2];
let w0_v = f32x8::splat(token, w0);
let w1_v = f32x8::splat(token, w1);
let w2_v = f32x8::splat(token, w2);
let iwidth = width as i32;
let iheight = height as i32;
let mut temp = ImageF::from_pool_dirty(width, height, pool);
let border = 2.min(width);
let interior_end = if width > 4 { width - 2 } else { 0 };
for y in 0..height {
let row = input.row(y);
let out_row = temp.row_mut(y);
for x in 0..border {
let ix = x as i32;
let v_m2 = row[mirror(ix - 2, iwidth)];
let v_m1 = row[mirror(ix - 1, iwidth)];
let v_0 = row[x];
let v_p1 = row[mirror(ix + 1, iwidth)];
let v_p2 = row[mirror(ix + 2, iwidth)];
out_row[x] = v_0 * w0 + (v_m1 + v_p1) * w1 + (v_m2 + v_p2) * w2;
}
let mut x = border;
while x + 8 <= interior_end {
let v_m2 = f32x8::load(token, (&row[x - 2..x + 6]).try_into().unwrap());
let v_m1 = f32x8::load(token, (&row[x - 1..x + 7]).try_into().unwrap());
let v_0 = f32x8::load(token, (&row[x..x + 8]).try_into().unwrap());
let v_p1 = f32x8::load(token, (&row[x + 1..x + 9]).try_into().unwrap());
let v_p2 = f32x8::load(token, (&row[x + 2..x + 10]).try_into().unwrap());
let sum = v_0 * w0_v + (v_m1 + v_p1) * w1_v + (v_m2 + v_p2) * w2_v;
sum.store((&mut out_row[x..x + 8]).try_into().unwrap());
x += 8;
}
while x < interior_end {
let v_m2 = row[x - 2];
let v_m1 = row[x - 1];
let v_0 = row[x];
let v_p1 = row[x + 1];
let v_p2 = row[x + 2];
out_row[x] = v_0 * w0 + (v_m1 + v_p1) * w1 + (v_m2 + v_p2) * w2;
x += 1;
}
for x in interior_end..width {
let ix = x as i32;
let v_m2 = row[mirror(ix - 2, iwidth)];
let v_m1 = row[mirror(ix - 1, iwidth)];
let v_0 = row[x];
let v_p1 = row[mirror(ix + 1, iwidth)];
let v_p2 = row[mirror(ix + 2, iwidth)];
out_row[x] = v_0 * w0 + (v_m1 + v_p1) * w1 + (v_m2 + v_p2) * w2;
}
}
let mut output = ImageF::from_pool_dirty(width, height, pool);
let v_border = 2.min(height);
let v_interior_end = if height > 4 { height - 2 } else { 0 };
for y in 0..v_border {
let iy = y as i32;
let rm2 = temp.row(mirror(iy - 2, iheight));
let rm1 = temp.row(mirror(iy - 1, iheight));
let r0 = temp.row(y);
let rp1 = temp.row(mirror(iy + 1, iheight));
let rp2 = temp.row(mirror(iy + 2, iheight));
let out = output.row_mut(y);
let mut x = 0;
while x + 8 <= width {
let vm2 = f32x8::load(token, (&rm2[x..x + 8]).try_into().unwrap());
let vm1 = f32x8::load(token, (&rm1[x..x + 8]).try_into().unwrap());
let v0 = f32x8::load(token, (&r0[x..x + 8]).try_into().unwrap());
let vp1 = f32x8::load(token, (&rp1[x..x + 8]).try_into().unwrap());
let vp2 = f32x8::load(token, (&rp2[x..x + 8]).try_into().unwrap());
let sum = v0 * w0_v + (vm1 + vp1) * w1_v + (vm2 + vp2) * w2_v;
sum.store((&mut out[x..x + 8]).try_into().unwrap());
x += 8;
}
while x < width {
out[x] = r0[x] * w0 + (rm1[x] + rp1[x]) * w1 + (rm2[x] + rp2[x]) * w2;
x += 1;
}
}
for y in v_border..v_interior_end {
let rm2 = temp.row(y - 2);
let rm1 = temp.row(y - 1);
let r0 = temp.row(y);
let rp1 = temp.row(y + 1);
let rp2 = temp.row(y + 2);
let out = output.row_mut(y);
let mut x = 0;
while x + 8 <= width {
let vm2 = f32x8::load(token, (&rm2[x..x + 8]).try_into().unwrap());
let vm1 = f32x8::load(token, (&rm1[x..x + 8]).try_into().unwrap());
let v0 = f32x8::load(token, (&r0[x..x + 8]).try_into().unwrap());
let vp1 = f32x8::load(token, (&rp1[x..x + 8]).try_into().unwrap());
let vp2 = f32x8::load(token, (&rp2[x..x + 8]).try_into().unwrap());
let sum = v0 * w0_v + (vm1 + vp1) * w1_v + (vm2 + vp2) * w2_v;
sum.store((&mut out[x..x + 8]).try_into().unwrap());
x += 8;
}
while x < width {
out[x] = r0[x] * w0 + (rm1[x] + rp1[x]) * w1 + (rm2[x] + rp2[x]) * w2;
x += 1;
}
}
for y in v_interior_end..height {
let iy = y as i32;
let rm2 = temp.row(mirror(iy - 2, iheight));
let rm1 = temp.row(mirror(iy - 1, iheight));
let r0 = temp.row(y);
let rp1 = temp.row(mirror(iy + 1, iheight));
let rp2 = temp.row(mirror(iy + 2, iheight));
let out = output.row_mut(y);
let mut x = 0;
while x + 8 <= width {
let vm2 = f32x8::load(token, (&rm2[x..x + 8]).try_into().unwrap());
let vm1 = f32x8::load(token, (&rm1[x..x + 8]).try_into().unwrap());
let v0 = f32x8::load(token, (&r0[x..x + 8]).try_into().unwrap());
let vp1 = f32x8::load(token, (&rp1[x..x + 8]).try_into().unwrap());
let vp2 = f32x8::load(token, (&rp2[x..x + 8]).try_into().unwrap());
let sum = v0 * w0_v + (vm1 + vp1) * w1_v + (vm2 + vp2) * w2_v;
sum.store((&mut out[x..x + 8]).try_into().unwrap());
x += 8;
}
while x < width {
out[x] = r0[x] * w0 + (rm1[x] + rp1[x]) * w1 + (rm2[x] + rp2[x]) * w2;
x += 1;
}
}
temp.recycle(pool);
output
}
#[cfg(target_arch = "aarch64")]
#[archmage::arcane]
fn blur_mirrored_5x5_neon(
token: archmage::NeonToken,
input: &ImageF,
weights: &[f32; 3],
pool: &BufferPool,
) -> ImageF {
use magetypes::simd::f32x8;
let width = input.width();
let height = input.height();
let w0 = weights[0];
let w1 = weights[1];
let w2 = weights[2];
let w0_v = f32x8::splat(token, w0);
let w1_v = f32x8::splat(token, w1);
let w2_v = f32x8::splat(token, w2);
let iwidth = width as i32;
let iheight = height as i32;
let mut temp = ImageF::from_pool_dirty(width, height, pool);
let border = 2.min(width);
let interior_end = if width > 4 { width - 2 } else { 0 };
for y in 0..height {
let row = input.row(y);
let out_row = temp.row_mut(y);
for x in 0..border {
let ix = x as i32;
let v_m2 = row[mirror(ix - 2, iwidth)];
let v_m1 = row[mirror(ix - 1, iwidth)];
let v_0 = row[x];
let v_p1 = row[mirror(ix + 1, iwidth)];
let v_p2 = row[mirror(ix + 2, iwidth)];
out_row[x] = v_0 * w0 + (v_m1 + v_p1) * w1 + (v_m2 + v_p2) * w2;
}
let mut x = border;
while x + 8 <= interior_end {
let v_m2 = f32x8::load(token, (&row[x - 2..x + 6]).try_into().unwrap());
let v_m1 = f32x8::load(token, (&row[x - 1..x + 7]).try_into().unwrap());
let v_0 = f32x8::load(token, (&row[x..x + 8]).try_into().unwrap());
let v_p1 = f32x8::load(token, (&row[x + 1..x + 9]).try_into().unwrap());
let v_p2 = f32x8::load(token, (&row[x + 2..x + 10]).try_into().unwrap());
let sum = v_0 * w0_v + (v_m1 + v_p1) * w1_v + (v_m2 + v_p2) * w2_v;
sum.store((&mut out_row[x..x + 8]).try_into().unwrap());
x += 8;
}
while x < interior_end {
let v_m2 = row[x - 2];
let v_m1 = row[x - 1];
let v_0 = row[x];
let v_p1 = row[x + 1];
let v_p2 = row[x + 2];
out_row[x] = v_0 * w0 + (v_m1 + v_p1) * w1 + (v_m2 + v_p2) * w2;
x += 1;
}
for x in interior_end..width {
let ix = x as i32;
let v_m2 = row[mirror(ix - 2, iwidth)];
let v_m1 = row[mirror(ix - 1, iwidth)];
let v_0 = row[x];
let v_p1 = row[mirror(ix + 1, iwidth)];
let v_p2 = row[mirror(ix + 2, iwidth)];
out_row[x] = v_0 * w0 + (v_m1 + v_p1) * w1 + (v_m2 + v_p2) * w2;
}
}
let mut output = ImageF::from_pool_dirty(width, height, pool);
let v_border = 2.min(height);
let v_interior_end = if height > 4 { height - 2 } else { 0 };
for y in 0..v_border {
let iy = y as i32;
let rm2 = temp.row(mirror(iy - 2, iheight));
let rm1 = temp.row(mirror(iy - 1, iheight));
let r0 = temp.row(y);
let rp1 = temp.row(mirror(iy + 1, iheight));
let rp2 = temp.row(mirror(iy + 2, iheight));
let out = output.row_mut(y);
let mut x = 0;
while x + 8 <= width {
let vm2 = f32x8::load(token, (&rm2[x..x + 8]).try_into().unwrap());
let vm1 = f32x8::load(token, (&rm1[x..x + 8]).try_into().unwrap());
let v0 = f32x8::load(token, (&r0[x..x + 8]).try_into().unwrap());
let vp1 = f32x8::load(token, (&rp1[x..x + 8]).try_into().unwrap());
let vp2 = f32x8::load(token, (&rp2[x..x + 8]).try_into().unwrap());
let sum = v0 * w0_v + (vm1 + vp1) * w1_v + (vm2 + vp2) * w2_v;
sum.store((&mut out[x..x + 8]).try_into().unwrap());
x += 8;
}
while x < width {
out[x] = r0[x] * w0 + (rm1[x] + rp1[x]) * w1 + (rm2[x] + rp2[x]) * w2;
x += 1;
}
}
for y in v_border..v_interior_end {
let rm2 = temp.row(y - 2);
let rm1 = temp.row(y - 1);
let r0 = temp.row(y);
let rp1 = temp.row(y + 1);
let rp2 = temp.row(y + 2);
let out = output.row_mut(y);
let mut x = 0;
while x + 8 <= width {
let vm2 = f32x8::load(token, (&rm2[x..x + 8]).try_into().unwrap());
let vm1 = f32x8::load(token, (&rm1[x..x + 8]).try_into().unwrap());
let v0 = f32x8::load(token, (&r0[x..x + 8]).try_into().unwrap());
let vp1 = f32x8::load(token, (&rp1[x..x + 8]).try_into().unwrap());
let vp2 = f32x8::load(token, (&rp2[x..x + 8]).try_into().unwrap());
let sum = v0 * w0_v + (vm1 + vp1) * w1_v + (vm2 + vp2) * w2_v;
sum.store((&mut out[x..x + 8]).try_into().unwrap());
x += 8;
}
while x < width {
out[x] = r0[x] * w0 + (rm1[x] + rp1[x]) * w1 + (rm2[x] + rp2[x]) * w2;
x += 1;
}
}
for y in v_interior_end..height {
let iy = y as i32;
let rm2 = temp.row(mirror(iy - 2, iheight));
let rm1 = temp.row(mirror(iy - 1, iheight));
let r0 = temp.row(y);
let rp1 = temp.row(mirror(iy + 1, iheight));
let rp2 = temp.row(mirror(iy + 2, iheight));
let out = output.row_mut(y);
let mut x = 0;
while x + 8 <= width {
let vm2 = f32x8::load(token, (&rm2[x..x + 8]).try_into().unwrap());
let vm1 = f32x8::load(token, (&rm1[x..x + 8]).try_into().unwrap());
let v0 = f32x8::load(token, (&r0[x..x + 8]).try_into().unwrap());
let vp1 = f32x8::load(token, (&rp1[x..x + 8]).try_into().unwrap());
let vp2 = f32x8::load(token, (&rp2[x..x + 8]).try_into().unwrap());
let sum = v0 * w0_v + (vm1 + vp1) * w1_v + (vm2 + vp2) * w2_v;
sum.store((&mut out[x..x + 8]).try_into().unwrap());
x += 8;
}
while x < width {
out[x] = r0[x] * w0 + (rm1[x] + rp1[x]) * w1 + (rm2[x] + rp2[x]) * w2;
x += 1;
}
}
temp.recycle(pool);
output
}
#[cfg(target_arch = "wasm32")]
#[archmage::arcane]
fn blur_mirrored_5x5_wasm128(
token: archmage::Wasm128Token,
input: &ImageF,
weights: &[f32; 3],
pool: &BufferPool,
) -> ImageF {
use magetypes::simd::f32x8;
let width = input.width();
let height = input.height();
let w0 = weights[0];
let w1 = weights[1];
let w2 = weights[2];
let w0_v = f32x8::splat(token, w0);
let w1_v = f32x8::splat(token, w1);
let w2_v = f32x8::splat(token, w2);
let iwidth = width as i32;
let iheight = height as i32;
let mut temp = ImageF::from_pool_dirty(width, height, pool);
let border = 2.min(width);
let interior_end = if width > 4 { width - 2 } else { 0 };
for y in 0..height {
let row = input.row(y);
let out_row = temp.row_mut(y);
for x in 0..border {
let ix = x as i32;
let v_m2 = row[mirror(ix - 2, iwidth)];
let v_m1 = row[mirror(ix - 1, iwidth)];
let v_0 = row[x];
let v_p1 = row[mirror(ix + 1, iwidth)];
let v_p2 = row[mirror(ix + 2, iwidth)];
out_row[x] = v_0 * w0 + (v_m1 + v_p1) * w1 + (v_m2 + v_p2) * w2;
}
let mut x = border;
while x + 8 <= interior_end {
let v_m2 = f32x8::load(token, (&row[x - 2..x + 6]).try_into().unwrap());
let v_m1 = f32x8::load(token, (&row[x - 1..x + 7]).try_into().unwrap());
let v_0 = f32x8::load(token, (&row[x..x + 8]).try_into().unwrap());
let v_p1 = f32x8::load(token, (&row[x + 1..x + 9]).try_into().unwrap());
let v_p2 = f32x8::load(token, (&row[x + 2..x + 10]).try_into().unwrap());
let sum = v_0 * w0_v + (v_m1 + v_p1) * w1_v + (v_m2 + v_p2) * w2_v;
sum.store((&mut out_row[x..x + 8]).try_into().unwrap());
x += 8;
}
while x < interior_end {
let v_m2 = row[x - 2];
let v_m1 = row[x - 1];
let v_0 = row[x];
let v_p1 = row[x + 1];
let v_p2 = row[x + 2];
out_row[x] = v_0 * w0 + (v_m1 + v_p1) * w1 + (v_m2 + v_p2) * w2;
x += 1;
}
for x in interior_end..width {
let ix = x as i32;
let v_m2 = row[mirror(ix - 2, iwidth)];
let v_m1 = row[mirror(ix - 1, iwidth)];
let v_0 = row[x];
let v_p1 = row[mirror(ix + 1, iwidth)];
let v_p2 = row[mirror(ix + 2, iwidth)];
out_row[x] = v_0 * w0 + (v_m1 + v_p1) * w1 + (v_m2 + v_p2) * w2;
}
}
let mut output = ImageF::from_pool_dirty(width, height, pool);
let v_border = 2.min(height);
let v_interior_end = if height > 4 { height - 2 } else { 0 };
for y in 0..v_border {
let iy = y as i32;
let rm2 = temp.row(mirror(iy - 2, iheight));
let rm1 = temp.row(mirror(iy - 1, iheight));
let r0 = temp.row(y);
let rp1 = temp.row(mirror(iy + 1, iheight));
let rp2 = temp.row(mirror(iy + 2, iheight));
let out = output.row_mut(y);
let mut x = 0;
while x + 8 <= width {
let vm2 = f32x8::load(token, (&rm2[x..x + 8]).try_into().unwrap());
let vm1 = f32x8::load(token, (&rm1[x..x + 8]).try_into().unwrap());
let v0 = f32x8::load(token, (&r0[x..x + 8]).try_into().unwrap());
let vp1 = f32x8::load(token, (&rp1[x..x + 8]).try_into().unwrap());
let vp2 = f32x8::load(token, (&rp2[x..x + 8]).try_into().unwrap());
let sum = v0 * w0_v + (vm1 + vp1) * w1_v + (vm2 + vp2) * w2_v;
sum.store((&mut out[x..x + 8]).try_into().unwrap());
x += 8;
}
while x < width {
out[x] = r0[x] * w0 + (rm1[x] + rp1[x]) * w1 + (rm2[x] + rp2[x]) * w2;
x += 1;
}
}
for y in v_border..v_interior_end {
let rm2 = temp.row(y - 2);
let rm1 = temp.row(y - 1);
let r0 = temp.row(y);
let rp1 = temp.row(y + 1);
let rp2 = temp.row(y + 2);
let out = output.row_mut(y);
let mut x = 0;
while x + 8 <= width {
let vm2 = f32x8::load(token, (&rm2[x..x + 8]).try_into().unwrap());
let vm1 = f32x8::load(token, (&rm1[x..x + 8]).try_into().unwrap());
let v0 = f32x8::load(token, (&r0[x..x + 8]).try_into().unwrap());
let vp1 = f32x8::load(token, (&rp1[x..x + 8]).try_into().unwrap());
let vp2 = f32x8::load(token, (&rp2[x..x + 8]).try_into().unwrap());
let sum = v0 * w0_v + (vm1 + vp1) * w1_v + (vm2 + vp2) * w2_v;
sum.store((&mut out[x..x + 8]).try_into().unwrap());
x += 8;
}
while x < width {
out[x] = r0[x] * w0 + (rm1[x] + rp1[x]) * w1 + (rm2[x] + rp2[x]) * w2;
x += 1;
}
}
for y in v_interior_end..height {
let iy = y as i32;
let rm2 = temp.row(mirror(iy - 2, iheight));
let rm1 = temp.row(mirror(iy - 1, iheight));
let r0 = temp.row(y);
let rp1 = temp.row(mirror(iy + 1, iheight));
let rp2 = temp.row(mirror(iy + 2, iheight));
let out = output.row_mut(y);
let mut x = 0;
while x + 8 <= width {
let vm2 = f32x8::load(token, (&rm2[x..x + 8]).try_into().unwrap());
let vm1 = f32x8::load(token, (&rm1[x..x + 8]).try_into().unwrap());
let v0 = f32x8::load(token, (&r0[x..x + 8]).try_into().unwrap());
let vp1 = f32x8::load(token, (&rp1[x..x + 8]).try_into().unwrap());
let vp2 = f32x8::load(token, (&rp2[x..x + 8]).try_into().unwrap());
let sum = v0 * w0_v + (vm1 + vp1) * w1_v + (vm2 + vp2) * w2_v;
sum.store((&mut out[x..x + 8]).try_into().unwrap());
x += 8;
}
while x < width {
out[x] = r0[x] * w0 + (rm1[x] + rp1[x]) * w1 + (rm2[x] + rp2[x]) * w2;
x += 1;
}
}
temp.recycle(pool);
output
}
fn blur_mirrored_5x5_scalar(
_token: archmage::ScalarToken,
input: &ImageF,
weights: &[f32; 3],
pool: &BufferPool,
) -> ImageF {
let width = input.width();
let height = input.height();
let w0 = weights[0];
let w1 = weights[1];
let w2 = weights[2];
let iwidth = width as i32;
let iheight = height as i32;
let mut temp = ImageF::from_pool_dirty(height, width, pool);
for y in 0..height {
let row = input.row(y);
for x in 0..width {
let ix = x as i32;
let v_m2 = row[mirror(ix - 2, iwidth)];
let v_m1 = row[mirror(ix - 1, iwidth)];
let v_0 = row[x];
let v_p1 = row[mirror(ix + 1, iwidth)];
let v_p2 = row[mirror(ix + 2, iwidth)];
let sum = v_0 * w0 + (v_m1 + v_p1) * w1 + (v_m2 + v_p2) * w2;
temp.set(y, x, sum);
}
}
let mut output = ImageF::from_pool_dirty(width, height, pool);
for x in 0..width {
let col = temp.row(x);
for y in 0..height {
let iy = y as i32;
let v_m2 = col[mirror(iy - 2, iheight)];
let v_m1 = col[mirror(iy - 1, iheight)];
let v_0 = col[y];
let v_p1 = col[mirror(iy + 1, iheight)];
let v_p2 = col[mirror(iy + 2, iheight)];
let sum = v_0 * w0 + (v_m1 + v_p1) * w1 + (v_m2 + v_p2) * w2;
output.set(x, y, sum);
}
}
temp.recycle(pool);
output
}
pub fn blur_5x5(input: &ImageF, weights: &[f32; 3], pool: &BufferPool) -> ImageF {
let width = input.width();
let height = input.height();
let w0 = weights[0];
let w1 = weights[1];
let w2 = weights[2];
let kernel = [w2, w1, w0, w1, w2];
let weight_sum: f32 = kernel.iter().sum();
let scale = 1.0 / weight_sum;
let scaled_kernel: [f32; 5] = [
kernel[0] * scale,
kernel[1] * scale,
kernel[2] * scale,
kernel[3] * scale,
kernel[4] * scale,
];
let mut temp = ImageF::from_pool_dirty(height, width, pool);
let border = 2.min(width);
let interior_end = if width > 2 { width - 2 } else { 0 };
for x in 0..border {
for y in 0..height {
let row = input.row(y);
let minx = x.saturating_sub(2);
let maxx = (x + 2).min(width - 1);
let mut sum = 0.0f32;
let mut wsum = 0.0f32;
for j in minx..=maxx {
let k_idx = j + 2 - x;
let k_val = kernel[k_idx];
wsum += k_val;
sum += row[j] * k_val;
}
temp.set(y, x, if wsum > 0.0 { sum / wsum } else { 0.0 });
}
}
if interior_end > border {
for y in 0..height {
let row = input.row(y);
for x in border..interior_end {
let sum = row[x - 2] * scaled_kernel[0]
+ row[x - 1] * scaled_kernel[1]
+ row[x] * scaled_kernel[2]
+ row[x + 1] * scaled_kernel[3]
+ row[x + 2] * scaled_kernel[4];
temp.set(y, x, sum);
}
}
}
for x in interior_end..width {
for y in 0..height {
let row = input.row(y);
let minx = x.saturating_sub(2);
let maxx = (x + 2).min(width - 1);
let mut sum = 0.0f32;
let mut wsum = 0.0f32;
for j in minx..=maxx {
let k_idx = j + 2 - x;
let k_val = kernel[k_idx];
wsum += k_val;
sum += row[j] * k_val;
}
temp.set(y, x, if wsum > 0.0 { sum / wsum } else { 0.0 });
}
}
let mut output = ImageF::from_pool_dirty(width, height, pool);
let h_border = 2.min(height);
let h_interior_end = if height > 2 { height - 2 } else { 0 };
for y in 0..h_border {
for x in 0..width {
let col = temp.row(x);
let miny = y.saturating_sub(2);
let maxy = (y + 2).min(height - 1);
let mut sum = 0.0f32;
let mut wsum = 0.0f32;
for j in miny..=maxy {
let k_idx = j + 2 - y;
let k_val = kernel[k_idx];
wsum += k_val;
sum += col[j] * k_val;
}
output.set(x, y, if wsum > 0.0 { sum / wsum } else { 0.0 });
}
}
if h_interior_end > h_border {
for x in 0..width {
let col = temp.row(x);
for y in h_border..h_interior_end {
let sum = col[y - 2] * scaled_kernel[0]
+ col[y - 1] * scaled_kernel[1]
+ col[y] * scaled_kernel[2]
+ col[y + 1] * scaled_kernel[3]
+ col[y + 2] * scaled_kernel[4];
output.set(x, y, sum);
}
}
}
for y in h_interior_end..height {
for x in 0..width {
let col = temp.row(x);
let miny = y.saturating_sub(2);
let maxy = (y + 2).min(height - 1);
let mut sum = 0.0f32;
let mut wsum = 0.0f32;
for j in miny..=maxy {
let k_idx = j + 2 - y;
let k_val = kernel[k_idx];
wsum += k_val;
sum += col[j] * k_val;
}
output.set(x, y, if wsum > 0.0 { sum / wsum } else { 0.0 });
}
}
temp.recycle(pool);
output
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_kernel_generation() {
let kernel = compute_kernel(1.0);
assert!(!kernel.is_empty());
assert_eq!(kernel.len() % 2, 1);
let center = kernel.len() / 2;
for (i, &v) in kernel.iter().enumerate() {
if i != center {
assert!(v <= kernel[center]);
}
}
let sum: f32 = kernel.iter().sum();
assert!(sum > 0.0);
}
#[cfg(not(feature = "iir-blur"))]
#[test]
fn test_blur_constant_image() {
let pool = BufferPool::new();
let img = ImageF::filled(32, 32, 0.5);
let blurred = gaussian_blur(&img, 2.0, &pool);
for y in 2..30 {
for x in 2..30 {
assert!(
(blurred.get(x, y) - 0.5).abs() < 0.01,
"Expected 0.5, got {} at ({}, {})",
blurred.get(x, y),
x,
y
);
}
}
}
#[test]
fn test_blur_reduces_delta() {
let pool = BufferPool::new();
let mut img = ImageF::new(32, 32);
img.set(16, 16, 1.0);
let blurred = gaussian_blur(&img, 2.0, &pool);
assert!(blurred.get(16, 16) < 1.0);
assert!(blurred.get(15, 16) > 0.0);
assert!(blurred.get(17, 16) > 0.0);
}
#[test]
fn test_blur_5x5_constant() {
let pool = BufferPool::new();
let img = ImageF::filled(32, 32, 0.5);
let weights = [1.0f32, 0.5, 0.25]; let blurred = blur_5x5(&img, &weights, &pool);
for y in 4..28 {
for x in 4..28 {
assert!(
(blurred.get(x, y) - 0.5).abs() < 0.01,
"Expected 0.5, got {} at ({}, {})",
blurred.get(x, y),
x,
y
);
}
}
}
}