#![allow(clippy::too_many_arguments)]
use crate::image::{BufferPool, ImageF};
/// Reads `data[idx]` without a bounds check (`unsafe-performance` builds only).
///
/// The caller must guarantee `idx < data.len()`. The scalar kernels in this file assert
/// `center >= reach && center + reach < data.len()` before their first read, and every
/// offset they use stays within `reach` of `center`.
#[cfg(feature = "unsafe-performance")]
#[allow(clippy::inline_always)]
#[inline(always)]
fn data_at(data: &[f32], idx: usize) -> f32 {
    // Debug builds still catch a violated precondition instead of reading out of bounds.
    debug_assert!(
        idx < data.len(),
        "data_at: index {} out of bounds for slice of length {}",
        idx,
        data.len()
    );
    // SAFETY: `idx < data.len()` is the caller's precondition (verified above in debug
    // builds), so the unchecked read stays inside `data`.
    unsafe { *data.get_unchecked(idx) }
}
/// Bounds-checked read of `data[idx]`, compiled when the `unsafe-performance` feature is off.
///
/// Panics if `idx >= data.len()`.
#[cfg(not(feature = "unsafe-performance"))]
#[inline]
fn data_at(data: &[f32], idx: usize) -> f32 {
data[idx]
}
/// Borrows `data[start..start + 8]` as a fixed-size array without a bounds check
/// (`unsafe-performance` builds only).
///
/// The caller must guarantee `start + 8 <= data.len()`. The eight-lane kernels in this file
/// assert `center >= reach && center + reach + 8 <= data.len()` before their first load.
#[cfg(feature = "unsafe-performance")]
#[allow(clippy::inline_always)]
#[inline(always)]
fn load_8(data: &[f32], start: usize) -> &[f32; 8] {
    // Debug builds still catch a violated precondition instead of reading out of bounds.
    debug_assert!(
        data.len() >= 8 && start <= data.len() - 8,
        "load_8: range starting at {} does not fit in slice of length {}",
        start,
        data.len()
    );
    // SAFETY: the caller guarantees `start + 8 <= data.len()`, so the eight `f32`s behind
    // the pointer lie inside `data`; `[f32; 8]` has the same alignment as `f32`, and the
    // returned reference inherits the lifetime of `data`.
    unsafe { &*data.as_ptr().add(start).cast::<[f32; 8]>() }
}
/// Borrows the eight consecutive samples starting at `start` as a fixed-size array.
///
/// Bounds-checked variant compiled when the `unsafe-performance` feature is off; panics if
/// `start..start + 8` is not a valid range of `data`.
#[cfg(not(feature = "unsafe-performance"))]
#[inline]
fn load_8(data: &[f32], start: usize) -> &[f32; 8] {
    let lanes: &[f32] = &data[start..start + 8];
    // `lanes` is exactly eight elements long, so the array conversion cannot fail.
    lanes.try_into().unwrap()
}
/// Borrows `data[start..start + 16]` as a fixed-size array without a bounds check
/// (`unsafe-performance` builds only).
///
/// The caller must guarantee `start + 16 <= data.len()`. The sixteen-lane kernels in this
/// file assert `center >= reach && center + reach + 16 <= data.len()` before their first load.
#[cfg(feature = "unsafe-performance")]
#[allow(clippy::inline_always)]
#[inline(always)]
fn load_16(data: &[f32], start: usize) -> &[f32; 16] {
    // Debug builds still catch a violated precondition instead of reading out of bounds.
    debug_assert!(
        data.len() >= 16 && start <= data.len() - 16,
        "load_16: range starting at {} does not fit in slice of length {}",
        start,
        data.len()
    );
    // SAFETY: the caller guarantees `start + 16 <= data.len()`, so the sixteen `f32`s behind
    // the pointer lie inside `data`; `[f32; 16]` has the same alignment as `f32`, and the
    // returned reference inherits the lifetime of `data`.
    unsafe { &*data.as_ptr().add(start).cast::<[f32; 16]>() }
}
/// Borrows the sixteen consecutive samples starting at `start` as a fixed-size array.
///
/// Bounds-checked variant compiled when the `unsafe-performance` feature is off; panics if
/// `start..start + 16` is not a valid range of `data`.
#[cfg(not(feature = "unsafe-performance"))]
#[inline]
fn load_16(data: &[f32], start: usize) -> &[f32; 16] {
    let lanes: &[f32] = &data[start..start + 16];
    // `lanes` is exactly sixteen elements long, so the array conversion cannot fail.
    lanes.try_into().unwrap()
}
// Test-only helper: reads the sample at offset (`$dx`, `$dy`) from the centre of a 9x9
// row-major window. The centre sits at row 4, column 4, so the flat index is
// `(4 + dy) * 9 + (4 + dx)`.
#[cfg(test)]
macro_rules! w {
($window:expr, $dx:expr, $dy:expr) => {
$window[((4 + $dy) * 9 + (4 + $dx)) as usize]
};
}
/// Test-only dense Malta response for one 9x9 row-major window.
///
/// Sixteen line-shaped sample sets through the window centre are each summed, squared and
/// accumulated. Offsets are written as `(dx, dy)` relative to the centre. The last four
/// lines repeat lines 8, 7, 6 and 5, so those directions contribute twice.
#[cfg(test)]
#[inline]
fn malta_unit_window(window: &[f32; 81]) -> f32 {
    let mut total = 0.0f32;
    // Sums the samples at the listed `(dx, dy)` offsets left to right, squares the sum and
    // adds it to `total`.
    macro_rules! accumulate {
        (($dx0:expr, $dy0:expr) $(, ($dx:expr, $dy:expr))*) => {{
            let line_sum = w!(window, $dx0, $dy0) $(+ w!(window, $dx, $dy))*;
            total += line_sum * line_sum;
        }};
    }
    // 1-2: horizontal and vertical.
    accumulate!((-4, 0), (-3, 0), (-2, 0), (-1, 0), (0, 0), (1, 0), (2, 0), (3, 0), (4, 0));
    accumulate!((0, -4), (0, -3), (0, -2), (0, -1), (0, 0), (0, 1), (0, 2), (0, 3), (0, 4));
    // 3-4: the two diagonals.
    accumulate!((-3, -3), (-2, -2), (-1, -1), (0, 0), (1, 1), (2, 2), (3, 3));
    accumulate!((3, -3), (2, -2), (1, -1), (0, 0), (-1, 1), (-2, 2), (-3, 3));
    // 5-6: near-vertical lines drifting one column.
    accumulate!((1, -4), (1, -3), (1, -2), (0, -1), (0, 0), (0, 1), (-1, 2), (-1, 3), (-1, 4));
    accumulate!((-1, -4), (-1, -3), (-1, -2), (0, -1), (0, 0), (0, 1), (1, 2), (1, 3), (1, 4));
    // 7-8: near-horizontal lines drifting one row.
    accumulate!((-4, -1), (-3, -1), (-2, -1), (-1, 0), (0, 0), (1, 0), (2, 1), (3, 1), (4, 1));
    accumulate!((-4, 1), (-3, 1), (-2, 1), (-1, 0), (0, 0), (1, 0), (2, -1), (3, -1), (4, -1));
    // 9-12: seven-sample lines between the axes and the diagonals.
    accumulate!((-2, -3), (-1, -2), (-1, -1), (0, 0), (1, 1), (1, 2), (2, 3));
    accumulate!((2, -3), (1, -2), (1, -1), (0, 0), (-1, 1), (-1, 2), (-2, 3));
    accumulate!((-3, -2), (-2, -1), (-1, -1), (0, 0), (1, 1), (2, 1), (3, 2));
    accumulate!((3, -2), (2, -1), (1, -1), (0, 0), (-1, 1), (-2, 1), (-3, 2));
    // 13-16: same sample sets as lines 8, 7, 6 and 5 (in that order).
    accumulate!((-4, 1), (-3, 1), (-2, 1), (-1, 0), (0, 0), (1, 0), (2, -1), (3, -1), (4, -1));
    accumulate!((-4, -1), (-3, -1), (-2, -1), (-1, 0), (0, 0), (1, 0), (2, 1), (3, 1), (4, 1));
    accumulate!((-1, -4), (-1, -3), (-1, -2), (0, -1), (0, 0), (0, 1), (1, 2), (1, 3), (1, 4));
    accumulate!((1, -4), (1, -3), (1, -2), (0, -1), (0, 0), (0, 1), (-1, 2), (-1, 3), (-1, 4));
    total
}
/// Test-only `lf` Malta response for one 9x9 row-major window.
///
/// Sixteen five-sample sets through the window centre are each summed, squared and
/// accumulated. Offsets are written as `(dx, dy)` relative to the centre.
#[cfg(test)]
#[inline]
fn malta_unit_lf_window(window: &[f32; 81]) -> f32 {
    let mut total = 0.0f32;
    // Sums the samples at the listed `(dx, dy)` offsets left to right, squares the sum and
    // adds it to `total`.
    macro_rules! accumulate {
        (($dx0:expr, $dy0:expr) $(, ($dx:expr, $dy:expr))*) => {{
            let line_sum = w!(window, $dx0, $dy0) $(+ w!(window, $dx, $dy))*;
            total += line_sum * line_sum;
        }};
    }
    // 1-2: horizontal and vertical, every second sample.
    accumulate!((-4, 0), (-2, 0), (0, 0), (2, 0), (4, 0));
    accumulate!((0, -4), (0, -2), (0, 0), (0, 2), (0, 4));
    // 3-4: the two diagonals.
    accumulate!((-3, -3), (-2, -2), (0, 0), (2, 2), (3, 3));
    accumulate!((3, -3), (2, -2), (0, 0), (-2, 2), (-3, 3));
    // 5-6: near-vertical lines drifting one column.
    accumulate!((1, -4), (1, -2), (0, 0), (-1, 2), (-1, 4));
    accumulate!((-1, -4), (-1, -2), (0, 0), (1, 2), (1, 4));
    // 7-8: near-horizontal lines drifting one row.
    accumulate!((-4, -1), (-2, -1), (0, 0), (2, 1), (4, 1));
    accumulate!((-4, 1), (-2, 1), (0, 0), (2, -1), (4, -1));
    // 9-12: lines between the axes and the diagonals.
    accumulate!((-2, -3), (-1, -2), (0, 0), (1, 2), (2, 3));
    accumulate!((2, -3), (1, -2), (0, 0), (-1, 2), (-2, 3));
    accumulate!((-3, -2), (-2, -1), (0, 0), (2, 1), (3, 2));
    accumulate!((3, -2), (2, -1), (0, 0), (-2, 1), (-3, 2));
    // 13-16: lines with a 2:1 / 1:2 slope reaching the window edge.
    accumulate!((-4, 2), (-2, 1), (0, 0), (2, -1), (4, -2));
    accumulate!((-4, -2), (-2, -1), (0, 0), (2, 1), (4, 2));
    accumulate!((-2, -4), (-1, -2), (0, 0), (1, 2), (2, 4));
    accumulate!((2, -4), (1, -2), (0, 0), (-1, 2), (-2, 4));
    total
}
/// Test-only helper that copies the 9x9 neighbourhood centred on (`x`, `y`) into a row-major
/// array, leaving zeros wherever the neighbourhood extends past the image.
#[inline]
#[cfg(test)]
fn extract_window(data: &ImageF, x: usize, y: usize) -> [f32; 81] {
    let (width, height) = (data.width(), data.height());
    let mut window = [0.0f32; 81];
    let fully_inside = x >= 4 && y >= 4 && x < width - 4 && y < height - 4;
    if fully_inside {
        // Fast path: every window row is a straight nine-sample copy.
        for (dy, dst) in window.chunks_exact_mut(9).enumerate() {
            let row = data.row(y + dy - 4);
            dst.copy_from_slice(&row[x - 4..x + 5]);
        }
        return window;
    }
    // Border path: clip the window to the image; clipped cells keep their zero fill.
    let first_dy = if y >= 4 { 0 } else { 4 - y };
    let end_dy = usize::min(9, height.wrapping_sub(y).wrapping_add(4));
    let first_dx = if x >= 4 { 0 } else { 4 - x };
    let end_dx = usize::min(9, width.wrapping_sub(x).wrapping_add(4));
    for dy in first_dy..end_dy {
        let row = data.row(y + dy - 4);
        let src_x = x + first_dx - 4;
        let count = end_dx - first_dx;
        let dst_start = dy * 9 + first_dx;
        window[dst_start..dst_start + count].copy_from_slice(&row[src_x..src_x + count]);
    }
    window
}
#[inline]
fn malta_unit_interior(data: &[f32], center: usize, stride: usize) -> f32 {
let xs = stride;
let xs2 = xs + xs;
let xs3 = xs2 + xs;
let xs4 = xs3 + xs;
let reach = xs4 + 4;
assert!(center >= reach && center + reach < data.len());
macro_rules! at {
($off:expr) => {
data_at(data, ($off) as usize)
};
}
let c = center;
let mut retval = 0.0f32;
{
let sum = at!(c - 4)
+ at!(c - 3)
+ at!(c - 2)
+ at!(c - 1)
+ at!(c)
+ at!(c + 1)
+ at!(c + 2)
+ at!(c + 3)
+ at!(c + 4);
retval += sum * sum;
}
{
let sum = at!(c - xs4)
+ at!(c - xs3)
+ at!(c - xs2)
+ at!(c - xs)
+ at!(c)
+ at!(c + xs)
+ at!(c + xs2)
+ at!(c + xs3)
+ at!(c + xs4);
retval += sum * sum;
}
{
let sum = at!(c - xs3 - 3)
+ at!(c - xs2 - 2)
+ at!(c - xs - 1)
+ at!(c)
+ at!(c + xs + 1)
+ at!(c + xs2 + 2)
+ at!(c + xs3 + 3);
retval += sum * sum;
}
{
let sum = at!(c - xs3 + 3)
+ at!(c - xs2 + 2)
+ at!(c - xs + 1)
+ at!(c)
+ at!(c + xs - 1)
+ at!(c + xs2 - 2)
+ at!(c + xs3 - 3);
retval += sum * sum;
}
{
let sum = at!(c - xs4 + 1)
+ at!(c - xs3 + 1)
+ at!(c - xs2 + 1)
+ at!(c - xs)
+ at!(c)
+ at!(c + xs)
+ at!(c + xs2 - 1)
+ at!(c + xs3 - 1)
+ at!(c + xs4 - 1);
retval += sum * sum;
}
{
let sum = at!(c - xs4 - 1)
+ at!(c - xs3 - 1)
+ at!(c - xs2 - 1)
+ at!(c - xs)
+ at!(c)
+ at!(c + xs)
+ at!(c + xs2 + 1)
+ at!(c + xs3 + 1)
+ at!(c + xs4 + 1);
retval += sum * sum;
}
{
let sum = at!(c - 4 - xs)
+ at!(c - 3 - xs)
+ at!(c - 2 - xs)
+ at!(c - 1)
+ at!(c)
+ at!(c + 1)
+ at!(c + 2 + xs)
+ at!(c + 3 + xs)
+ at!(c + 4 + xs);
retval += sum * sum;
}
{
let sum = at!(c - 4 + xs)
+ at!(c - 3 + xs)
+ at!(c - 2 + xs)
+ at!(c - 1)
+ at!(c)
+ at!(c + 1)
+ at!(c + 2 - xs)
+ at!(c + 3 - xs)
+ at!(c + 4 - xs);
retval += sum * sum;
}
{
let sum = at!(c - xs3 - 2)
+ at!(c - xs2 - 1)
+ at!(c - xs - 1)
+ at!(c)
+ at!(c + xs + 1)
+ at!(c + xs2 + 1)
+ at!(c + xs3 + 2);
retval += sum * sum;
}
{
let sum = at!(c - xs3 + 2)
+ at!(c - xs2 + 1)
+ at!(c - xs + 1)
+ at!(c)
+ at!(c + xs - 1)
+ at!(c + xs2 - 1)
+ at!(c + xs3 - 2);
retval += sum * sum;
}
{
let sum = at!(c - xs2 - 3)
+ at!(c - xs - 2)
+ at!(c - xs - 1)
+ at!(c)
+ at!(c + xs + 1)
+ at!(c + xs + 2)
+ at!(c + xs2 + 3);
retval += sum * sum;
}
{
let sum = at!(c - xs2 + 3)
+ at!(c - xs + 2)
+ at!(c - xs + 1)
+ at!(c)
+ at!(c + xs - 1)
+ at!(c + xs - 2)
+ at!(c + xs2 - 3);
retval += sum * sum;
}
{
let sum = at!(c - 4 + xs)
+ at!(c - 3 + xs)
+ at!(c - 2 + xs)
+ at!(c - 1)
+ at!(c)
+ at!(c + 1)
+ at!(c + 2 - xs)
+ at!(c + 3 - xs)
+ at!(c + 4 - xs);
retval += sum * sum;
}
{
let sum = at!(c - 4 - xs)
+ at!(c - 3 - xs)
+ at!(c - 2 - xs)
+ at!(c - 1)
+ at!(c)
+ at!(c + 1)
+ at!(c + 2 + xs)
+ at!(c + 3 + xs)
+ at!(c + 4 + xs);
retval += sum * sum;
}
{
let sum = at!(c - xs4 - 1)
+ at!(c - xs3 - 1)
+ at!(c - xs2 - 1)
+ at!(c - xs)
+ at!(c)
+ at!(c + xs)
+ at!(c + xs2 + 1)
+ at!(c + xs3 + 1)
+ at!(c + xs4 + 1);
retval += sum * sum;
}
{
let sum = at!(c - xs4 + 1)
+ at!(c - xs3 + 1)
+ at!(c - xs2 + 1)
+ at!(c - xs)
+ at!(c)
+ at!(c + xs)
+ at!(c + xs2 - 1)
+ at!(c + xs3 - 1)
+ at!(c + xs4 - 1);
retval += sum * sum;
}
retval
}
#[inline]
fn malta_unit_lf_interior(data: &[f32], center: usize, stride: usize) -> f32 {
let xs = stride;
let xs2 = xs + xs;
let xs3 = xs2 + xs;
let xs4 = xs3 + xs;
let reach = xs4 + 4;
assert!(center >= reach && center + reach < data.len());
macro_rules! at {
($off:expr) => {
data_at(data, ($off) as usize)
};
}
let c = center;
let mut retval = 0.0f32;
{
let sum = at!(c - 4) + at!(c - 2) + at!(c) + at!(c + 2) + at!(c + 4);
retval += sum * sum;
}
{
let sum = at!(c - xs4) + at!(c - xs2) + at!(c) + at!(c + xs2) + at!(c + xs4);
retval += sum * sum;
}
{
let sum =
at!(c - xs3 - 3) + at!(c - xs2 - 2) + at!(c) + at!(c + xs2 + 2) + at!(c + xs3 + 3);
retval += sum * sum;
}
{
let sum =
at!(c - xs3 + 3) + at!(c - xs2 + 2) + at!(c) + at!(c + xs2 - 2) + at!(c + xs3 - 3);
retval += sum * sum;
}
{
let sum =
at!(c - xs4 + 1) + at!(c - xs2 + 1) + at!(c) + at!(c + xs2 - 1) + at!(c + xs4 - 1);
retval += sum * sum;
}
{
let sum =
at!(c - xs4 - 1) + at!(c - xs2 - 1) + at!(c) + at!(c + xs2 + 1) + at!(c + xs4 + 1);
retval += sum * sum;
}
{
let sum = at!(c - 4 - xs) + at!(c - 2 - xs) + at!(c) + at!(c + 2 + xs) + at!(c + 4 + xs);
retval += sum * sum;
}
{
let sum = at!(c - 4 + xs) + at!(c - 2 + xs) + at!(c) + at!(c + 2 - xs) + at!(c + 4 - xs);
retval += sum * sum;
}
{
let sum =
at!(c - xs3 - 2) + at!(c - xs2 - 1) + at!(c) + at!(c + xs2 + 1) + at!(c + xs3 + 2);
retval += sum * sum;
}
{
let sum =
at!(c - xs3 + 2) + at!(c - xs2 + 1) + at!(c) + at!(c + xs2 - 1) + at!(c + xs3 - 2);
retval += sum * sum;
}
{
let sum = at!(c - xs2 - 3) + at!(c - xs - 2) + at!(c) + at!(c + xs + 2) + at!(c + xs2 + 3);
retval += sum * sum;
}
{
let sum = at!(c - xs2 + 3) + at!(c - xs + 2) + at!(c) + at!(c + xs - 2) + at!(c + xs2 - 3);
retval += sum * sum;
}
{
let sum = at!(c - 4 + xs2) + at!(c - 2 + xs) + at!(c) + at!(c + 2 - xs) + at!(c + 4 - xs2);
retval += sum * sum;
}
{
let sum = at!(c - 4 - xs2) + at!(c - 2 - xs) + at!(c) + at!(c + 2 + xs) + at!(c + 4 + xs2);
retval += sum * sum;
}
{
let sum =
at!(c - xs4 - 2) + at!(c - xs2 - 1) + at!(c) + at!(c + xs2 + 1) + at!(c + xs4 + 2);
retval += sum * sum;
}
{
let sum =
at!(c - xs4 + 2) + at!(c - xs2 + 1) + at!(c) + at!(c + xs2 - 1) + at!(c + xs4 - 2);
retval += sum * sum;
}
retval
}
#[cfg(target_arch = "x86_64")]
#[archmage::rite]
fn malta_unit_interior_8x_v3(
token: archmage::X64V3Token,
data: &[f32],
center: usize,
stride: usize,
) -> magetypes::simd::f32x8 {
use magetypes::simd::f32x8;
let xs = stride as isize;
let xs2 = xs * 2;
let xs3 = xs * 3;
let xs4 = xs * 4;
let reach = 4 * stride + 4;
assert!(center >= reach && center + reach + 8 <= data.len());
macro_rules! ld {
($off:expr) => {{
let o: isize = $off;
let start = (center as isize + o) as usize;
f32x8::load(token, load_8(data, start))
}};
}
let mut r = f32x8::splat(token, 0.0);
{
let s = ld!(-4) + ld!(-3) + ld!(-2) + ld!(-1) + ld!(0) + ld!(1) + ld!(2) + ld!(3) + ld!(4);
r += s * s;
}
{
let s = ld!(-xs4)
+ ld!(-xs3)
+ ld!(-xs2)
+ ld!(-xs)
+ ld!(0)
+ ld!(xs)
+ ld!(xs2)
+ ld!(xs3)
+ ld!(xs4);
r += s * s;
}
{
let s = ld!(-xs3 - 3)
+ ld!(-xs2 - 2)
+ ld!(-xs - 1)
+ ld!(0)
+ ld!(xs + 1)
+ ld!(xs2 + 2)
+ ld!(xs3 + 3);
r += s * s;
}
{
let s = ld!(-xs3 + 3)
+ ld!(-xs2 + 2)
+ ld!(-xs + 1)
+ ld!(0)
+ ld!(xs - 1)
+ ld!(xs2 - 2)
+ ld!(xs3 - 3);
r += s * s;
}
{
let s = ld!(-xs4 + 1)
+ ld!(-xs3 + 1)
+ ld!(-xs2 + 1)
+ ld!(-xs)
+ ld!(0)
+ ld!(xs)
+ ld!(xs2 - 1)
+ ld!(xs3 - 1)
+ ld!(xs4 - 1);
r += s * s;
}
{
let s = ld!(-xs4 - 1)
+ ld!(-xs3 - 1)
+ ld!(-xs2 - 1)
+ ld!(-xs)
+ ld!(0)
+ ld!(xs)
+ ld!(xs2 + 1)
+ ld!(xs3 + 1)
+ ld!(xs4 + 1);
r += s * s;
}
{
let s = ld!(-4 - xs)
+ ld!(-3 - xs)
+ ld!(-2 - xs)
+ ld!(-1)
+ ld!(0)
+ ld!(1)
+ ld!(2 + xs)
+ ld!(3 + xs)
+ ld!(4 + xs);
r += s * s;
}
{
let s = ld!(-4 + xs)
+ ld!(-3 + xs)
+ ld!(-2 + xs)
+ ld!(-1)
+ ld!(0)
+ ld!(1)
+ ld!(2 - xs)
+ ld!(3 - xs)
+ ld!(4 - xs);
r += s * s;
}
{
let s = ld!(-xs3 - 2)
+ ld!(-xs2 - 1)
+ ld!(-xs - 1)
+ ld!(0)
+ ld!(xs + 1)
+ ld!(xs2 + 1)
+ ld!(xs3 + 2);
r += s * s;
}
{
let s = ld!(-xs3 + 2)
+ ld!(-xs2 + 1)
+ ld!(-xs + 1)
+ ld!(0)
+ ld!(xs - 1)
+ ld!(xs2 - 1)
+ ld!(xs3 - 2);
r += s * s;
}
{
let s = ld!(-xs2 - 3)
+ ld!(-xs - 2)
+ ld!(-xs - 1)
+ ld!(0)
+ ld!(xs + 1)
+ ld!(xs + 2)
+ ld!(xs2 + 3);
r += s * s;
}
{
let s = ld!(-xs2 + 3)
+ ld!(-xs + 2)
+ ld!(-xs + 1)
+ ld!(0)
+ ld!(xs - 1)
+ ld!(xs - 2)
+ ld!(xs2 - 3);
r += s * s;
}
{
let s = ld!(-4 + xs)
+ ld!(-3 + xs)
+ ld!(-2 + xs)
+ ld!(-1)
+ ld!(0)
+ ld!(1)
+ ld!(2 - xs)
+ ld!(3 - xs)
+ ld!(4 - xs);
r += s * s;
}
{
let s = ld!(-4 - xs)
+ ld!(-3 - xs)
+ ld!(-2 - xs)
+ ld!(-1)
+ ld!(0)
+ ld!(1)
+ ld!(2 + xs)
+ ld!(3 + xs)
+ ld!(4 + xs);
r += s * s;
}
{
let s = ld!(-xs4 - 1)
+ ld!(-xs3 - 1)
+ ld!(-xs2 - 1)
+ ld!(-xs)
+ ld!(0)
+ ld!(xs)
+ ld!(xs2 + 1)
+ ld!(xs3 + 1)
+ ld!(xs4 + 1);
r += s * s;
}
{
let s = ld!(-xs4 + 1)
+ ld!(-xs3 + 1)
+ ld!(-xs2 + 1)
+ ld!(-xs)
+ ld!(0)
+ ld!(xs)
+ ld!(xs2 - 1)
+ ld!(xs3 - 1)
+ ld!(xs4 - 1);
r += s * s;
}
r
}
#[cfg(target_arch = "x86_64")]
#[archmage::rite]
fn malta_unit_lf_interior_8x_v3(
token: archmage::X64V3Token,
data: &[f32],
center: usize,
stride: usize,
) -> magetypes::simd::f32x8 {
use magetypes::simd::f32x8;
let xs = stride as isize;
let xs2 = xs * 2;
let xs3 = xs * 3;
let xs4 = xs * 4;
let reach = 4 * stride + 4;
assert!(center >= reach && center + reach + 8 <= data.len());
macro_rules! ld {
($off:expr) => {{
let o: isize = $off;
let start = (center as isize + o) as usize;
f32x8::load(token, load_8(data, start))
}};
}
let mut r = f32x8::splat(token, 0.0);
{
let s = ld!(-4) + ld!(-2) + ld!(0) + ld!(2) + ld!(4);
r += s * s;
}
{
let s = ld!(-xs4) + ld!(-xs2) + ld!(0) + ld!(xs2) + ld!(xs4);
r += s * s;
}
{
let s = ld!(-xs3 - 3) + ld!(-xs2 - 2) + ld!(0) + ld!(xs2 + 2) + ld!(xs3 + 3);
r += s * s;
}
{
let s = ld!(-xs3 + 3) + ld!(-xs2 + 2) + ld!(0) + ld!(xs2 - 2) + ld!(xs3 - 3);
r += s * s;
}
{
let s = ld!(-xs4 + 1) + ld!(-xs2 + 1) + ld!(0) + ld!(xs2 - 1) + ld!(xs4 - 1);
r += s * s;
}
{
let s = ld!(-xs4 - 1) + ld!(-xs2 - 1) + ld!(0) + ld!(xs2 + 1) + ld!(xs4 + 1);
r += s * s;
}
{
let s = ld!(-4 - xs) + ld!(-2 - xs) + ld!(0) + ld!(2 + xs) + ld!(4 + xs);
r += s * s;
}
{
let s = ld!(-4 + xs) + ld!(-2 + xs) + ld!(0) + ld!(2 - xs) + ld!(4 - xs);
r += s * s;
}
{
let s = ld!(-xs3 - 2) + ld!(-xs2 - 1) + ld!(0) + ld!(xs2 + 1) + ld!(xs3 + 2);
r += s * s;
}
{
let s = ld!(-xs3 + 2) + ld!(-xs2 + 1) + ld!(0) + ld!(xs2 - 1) + ld!(xs3 - 2);
r += s * s;
}
{
let s = ld!(-xs2 - 3) + ld!(-xs - 2) + ld!(0) + ld!(xs + 2) + ld!(xs2 + 3);
r += s * s;
}
{
let s = ld!(-xs2 + 3) + ld!(-xs + 2) + ld!(0) + ld!(xs - 2) + ld!(xs2 - 3);
r += s * s;
}
{
let s = ld!(-4 + xs2) + ld!(-2 + xs) + ld!(0) + ld!(2 - xs) + ld!(4 - xs2);
r += s * s;
}
{
let s = ld!(-4 - xs2) + ld!(-2 - xs) + ld!(0) + ld!(2 + xs) + ld!(4 + xs2);
r += s * s;
}
{
let s = ld!(-xs4 - 2) + ld!(-xs2 - 1) + ld!(0) + ld!(xs2 + 1) + ld!(xs4 + 2);
r += s * s;
}
{
let s = ld!(-xs4 + 2) + ld!(-xs2 + 1) + ld!(0) + ld!(xs2 - 1) + ld!(xs4 - 2);
r += s * s;
}
r
}
/// Test-only dense Malta response at pixel (`x`, `y`) of `data`.
///
/// Extracts the 9x9 neighbourhood with `extract_window` and evaluates it with
/// `malta_unit_window`.
#[cfg(test)]
pub fn malta_unit(data: &ImageF, x: usize, y: usize) -> f32 {
    malta_unit_window(&extract_window(data, x, y))
}
/// Test-only `lf` Malta response at pixel (`x`, `y`) of `data`.
///
/// Extracts the 9x9 neighbourhood with `extract_window` and evaluates it with
/// `malta_unit_lf_window`.
#[cfg(test)]
pub fn malta_unit_lf(data: &ImageF, x: usize, y: usize) -> f32 {
    malta_unit_lf_window(&extract_window(data, x, y))
}
/// Public entry point for the Malta difference map of `lum0` versus `lum1`.
///
/// All arguments are forwarded unchanged through `archmage::incant!`, which is given the
/// base name `malta_diff_map_dispatch` and the tier list `[v4, v3, neon, wasm128]`.
/// This file defines matching `malta_diff_map_dispatch_v3`, `_v4` and `_neon` functions
/// that take a token as their first argument.
///
/// NOTE(review): how `incant!` picks a tier at run time, and what it falls back to when
/// none is available, is defined by the macro and not visible here — confirm against the
/// archmage documentation.
#[allow(clippy::too_many_arguments)]
pub fn malta_diff_map(
lum0: &ImageF,
lum1: &ImageF,
w_0gt1: f64,
w_0lt1: f64,
norm1: f64,
use_lf: bool,
pool: &BufferPool,
) -> ImageF {
archmage::incant!(
malta_diff_map_dispatch(lum0, lum1, w_0gt1, w_0lt1, norm1, use_lf, pool),
[v4, v3, neon, wasm128]
)
}
/// Writes the per-pixel scaled difference of `lum0` and `lum1` into `diffs`.
///
/// For each pixel pair `(v0, v1)` the output is the sum of two terms, both divided by
/// `norm1_f32 + 0.5 * (|v0| + |v1|)`:
/// * a symmetric term, `norm2_0gt1 * (v0 - v1)`;
/// * an asymmetric term scaled by `norm2_0lt1` that is non-zero only when `v1`, measured
///   along the sign of `v0`, is below `0.55 * |v0|` or above `1.05 * |v0|`.
///
/// Rows are iterated over `lum0.height()`; within a row, iteration stops at the shortest of
/// the three rows.
#[archmage::autoversion]
fn malta_compute_scaled_diffs(
    _token: archmage::SimdToken,
    lum0: &ImageF,
    lum1: &ImageF,
    norm2_0gt1: f32,
    norm2_0lt1: f32,
    norm1_f32: f32,
    diffs: &mut ImageF,
) {
    for y in 0..lum0.height() {
        let row0 = lum0.row(y);
        let row1 = lum1.row(y);
        let out = diffs.row_mut(y);
        let pixels = row0.iter().zip(row1.iter());
        for (slot, (&v0, &v1)) in out.iter_mut().zip(pixels) {
            let mag0 = v0.abs();
            // Shared normalisation for both terms.
            let inv_norm = 1.0 / (norm1_f32 + 0.5 * (mag0 + v1.abs()));
            let symmetric = norm2_0gt1 * inv_norm * (v0 - v1);
            // Measure `v1` along the direction of `v0`.
            let sign = 1.0f32.copysign(v0);
            let aligned = v1 * sign;
            let below = (0.55 * mag0 - aligned).max(0.0);
            let above = (aligned - 1.05 * mag0).max(0.0);
            let asymmetric = norm2_0lt1 * inv_norm * (below - above);
            *slot = symmetric + sign * asymmetric;
        }
    }
}
/// Shared driver behind the `malta_diff_map_dispatch_*` variants.
///
/// Steps, in order:
/// 1. derive two scale factors from `w_0gt1`, `w_0lt1`, `norm1` and `use_lf`;
/// 2. fill a pooled image with per-pixel scaled differences via
///    `malta_compute_scaled_diffs`;
/// 3. copy it into a pooled image padded by four samples on every side, zeroing every
///    padding row and column up to the padded stride (both images come from
///    `from_pool_dirty`, so nothing is assumed about their initial contents);
/// 4. call `interior_row` once per output row with the padded data, the index of the row's
///    first centre sample, the padded stride, the row width, `use_lf` and the output row.
///
/// Both temporaries are recycled into `pool`; the result image is returned to the caller.
#[allow(clippy::inline_always, clippy::too_many_arguments)]
#[inline(always)]
fn malta_diff_map_impl<F>(
    lum0: &ImageF,
    lum1: &ImageF,
    w_0gt1: f64,
    w_0lt1: f64,
    norm1: f64,
    use_lf: bool,
    pool: &BufferPool,
    interior_row: F,
) -> ImageF
where
    F: Fn(&[f32], usize, usize, usize, bool, &mut [f32]),
{
    const K_WEIGHT0: f64 = 0.5;
    const K_WEIGHT1: f64 = 0.33;
    const LEN: f64 = 3.75;
    // Border width; matches the four-sample reach of the kernels.
    const PAD: usize = 4;

    let width = lum0.width();
    let height = lum0.height();

    // Scale factors for the symmetric and asymmetric difference terms.
    let mulli = if use_lf {
        0.611612573796
    } else {
        0.39905817637
    };
    let w_pre0gt1 = mulli * (K_WEIGHT0 * w_0gt1).sqrt() / (LEN * 2.0 + 1.0);
    let w_pre0lt1 = mulli * (K_WEIGHT1 * w_0lt1).sqrt() / (LEN * 2.0 + 1.0);

    let mut diffs = ImageF::from_pool_dirty(width, height, pool);
    malta_compute_scaled_diffs(
        lum0,
        lum1,
        (w_pre0gt1 * norm1) as f32,
        (w_pre0lt1 * norm1) as f32,
        norm1 as f32,
        &mut diffs,
    );

    let pad_h = height + 2 * PAD;
    let mut padded = ImageF::from_pool_dirty(width + 2 * PAD, pad_h, pool);
    let pad_stride = padded.stride();
    // Top and bottom border rows are zero across the whole stride.
    for y in (0..PAD).chain(PAD + height..pad_h) {
        padded.row_full_mut(y)[..pad_stride].fill(0.0);
    }
    // Payload rows: zero left border, copied samples, zero right border and stride slack.
    let payload_end = PAD + width;
    for y in 0..height {
        let src = diffs.row(y);
        let dst = padded.row_full_mut(y + PAD);
        dst[..PAD].fill(0.0);
        dst[PAD..payload_end].copy_from_slice(src);
        dst[payload_end..pad_stride].fill(0.0);
    }
    // Hand the scratch image back before the result image is taken from the pool.
    diffs.recycle(pool);

    let mut block_diff_ac = ImageF::from_pool_dirty(width, height, pool);
    let pad_data = padded.data();
    for y in 0..height {
        let out = block_diff_ac.row_mut(y);
        // Index of the padded sample that corresponds to output pixel (0, y).
        let center_base = (y + PAD) * pad_stride + PAD;
        interior_row(pad_data, center_base, pad_stride, width, use_lf, out);
    }
    padded.recycle(pool);
    block_diff_ac
}
/// `X64V3Token` variant of the diff-map driver.
///
/// Hands `malta_diff_map_impl` a row kernel that evaluates eight output columns per step
/// with the `_8x_v3` vector kernels and finishes the remaining `count % 8` columns with the
/// scalar kernels.
#[allow(clippy::too_many_arguments)]
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
fn malta_diff_map_dispatch_v3(
    token: archmage::X64V3Token,
    lum0: &ImageF,
    lum1: &ImageF,
    w_0gt1: f64,
    w_0lt1: f64,
    norm1: f64,
    use_lf: bool,
    pool: &BufferPool,
) -> ImageF {
    let interior = |data: &[f32],
                    center_base: usize,
                    stride: usize,
                    count: usize,
                    use_lf: bool,
                    out: &mut [f32]| {
        // Columns `[0, vector_end)` are covered by whole eight-lane steps.
        let vector_end = count - count % 8;
        for x in (0..vector_end).step_by(8) {
            let center = center_base + x;
            let lanes = if use_lf {
                malta_unit_lf_interior_8x_v3(token, data, center, stride)
            } else {
                malta_unit_interior_8x_v3(token, data, center, stride)
            };
            lanes.store((&mut out[x..x + 8]).try_into().unwrap());
        }
        // Scalar tail for the columns that do not fill a whole vector.
        let mut x = vector_end;
        while x < count {
            let center = center_base + x;
            out[x] = if use_lf {
                malta_unit_lf_interior(data, center, stride)
            } else {
                malta_unit_interior(data, center, stride)
            };
            x += 1;
        }
    };
    malta_diff_map_impl(lum0, lum1, w_0gt1, w_0lt1, norm1, use_lf, pool, interior)
}
#[cfg(target_arch = "x86_64")]
#[archmage::rite]
fn malta_unit_interior_16x_v4(
token: archmage::X64V4Token,
data: &[f32],
center: usize,
stride: usize,
) -> magetypes::simd::v4::f32x16 {
use magetypes::simd::v4::f32x16;
let xs = stride as isize;
let xs2 = xs * 2;
let xs3 = xs * 3;
let xs4 = xs * 4;
let reach = 4 * stride + 4;
assert!(center >= reach && center + reach + 16 <= data.len());
macro_rules! ld {
($off:expr) => {{
let o: isize = $off;
let start = (center as isize + o) as usize;
f32x16::load(token, load_16(data, start))
}};
}
let mut r = f32x16::splat(token, 0.0);
{
let s = ld!(-4) + ld!(-3) + ld!(-2) + ld!(-1) + ld!(0) + ld!(1) + ld!(2) + ld!(3) + ld!(4);
r += s * s;
}
{
let s = ld!(-xs4)
+ ld!(-xs3)
+ ld!(-xs2)
+ ld!(-xs)
+ ld!(0)
+ ld!(xs)
+ ld!(xs2)
+ ld!(xs3)
+ ld!(xs4);
r += s * s;
}
{
let s = ld!(-xs3 - 3)
+ ld!(-xs2 - 2)
+ ld!(-xs - 1)
+ ld!(0)
+ ld!(xs + 1)
+ ld!(xs2 + 2)
+ ld!(xs3 + 3);
r += s * s;
}
{
let s = ld!(-xs3 + 3)
+ ld!(-xs2 + 2)
+ ld!(-xs + 1)
+ ld!(0)
+ ld!(xs - 1)
+ ld!(xs2 - 2)
+ ld!(xs3 - 3);
r += s * s;
}
{
let s = ld!(-xs4 + 1)
+ ld!(-xs3 + 1)
+ ld!(-xs2 + 1)
+ ld!(-xs)
+ ld!(0)
+ ld!(xs)
+ ld!(xs2 - 1)
+ ld!(xs3 - 1)
+ ld!(xs4 - 1);
r += s * s;
}
{
let s = ld!(-xs4 - 1)
+ ld!(-xs3 - 1)
+ ld!(-xs2 - 1)
+ ld!(-xs)
+ ld!(0)
+ ld!(xs)
+ ld!(xs2 + 1)
+ ld!(xs3 + 1)
+ ld!(xs4 + 1);
r += s * s;
}
{
let s = ld!(-4 - xs)
+ ld!(-3 - xs)
+ ld!(-2 - xs)
+ ld!(-1)
+ ld!(0)
+ ld!(1)
+ ld!(2 + xs)
+ ld!(3 + xs)
+ ld!(4 + xs);
r += s * s;
}
{
let s = ld!(-4 + xs)
+ ld!(-3 + xs)
+ ld!(-2 + xs)
+ ld!(-1)
+ ld!(0)
+ ld!(1)
+ ld!(2 - xs)
+ ld!(3 - xs)
+ ld!(4 - xs);
r += s * s;
}
{
let s = ld!(-xs3 - 2)
+ ld!(-xs2 - 1)
+ ld!(-xs - 1)
+ ld!(0)
+ ld!(xs + 1)
+ ld!(xs2 + 1)
+ ld!(xs3 + 2);
r += s * s;
}
{
let s = ld!(-xs3 + 2)
+ ld!(-xs2 + 1)
+ ld!(-xs + 1)
+ ld!(0)
+ ld!(xs - 1)
+ ld!(xs2 - 1)
+ ld!(xs3 - 2);
r += s * s;
}
{
let s = ld!(-xs2 - 3)
+ ld!(-xs - 2)
+ ld!(-xs - 1)
+ ld!(0)
+ ld!(xs + 1)
+ ld!(xs + 2)
+ ld!(xs2 + 3);
r += s * s;
}
{
let s = ld!(-xs2 + 3)
+ ld!(-xs + 2)
+ ld!(-xs + 1)
+ ld!(0)
+ ld!(xs - 1)
+ ld!(xs - 2)
+ ld!(xs2 - 3);
r += s * s;
}
{
let s = ld!(-4 + xs)
+ ld!(-3 + xs)
+ ld!(-2 + xs)
+ ld!(-1)
+ ld!(0)
+ ld!(1)
+ ld!(2 - xs)
+ ld!(3 - xs)
+ ld!(4 - xs);
r += s * s;
}
{
let s = ld!(-4 - xs)
+ ld!(-3 - xs)
+ ld!(-2 - xs)
+ ld!(-1)
+ ld!(0)
+ ld!(1)
+ ld!(2 + xs)
+ ld!(3 + xs)
+ ld!(4 + xs);
r += s * s;
}
{
let s = ld!(-xs4 - 1)
+ ld!(-xs3 - 1)
+ ld!(-xs2 - 1)
+ ld!(-xs)
+ ld!(0)
+ ld!(xs)
+ ld!(xs2 + 1)
+ ld!(xs3 + 1)
+ ld!(xs4 + 1);
r += s * s;
}
{
let s = ld!(-xs4 + 1)
+ ld!(-xs3 + 1)
+ ld!(-xs2 + 1)
+ ld!(-xs)
+ ld!(0)
+ ld!(xs)
+ ld!(xs2 - 1)
+ ld!(xs3 - 1)
+ ld!(xs4 - 1);
r += s * s;
}
r
}
#[cfg(target_arch = "x86_64")]
#[archmage::rite]
fn malta_unit_lf_interior_16x_v4(
token: archmage::X64V4Token,
data: &[f32],
center: usize,
stride: usize,
) -> magetypes::simd::v4::f32x16 {
use magetypes::simd::v4::f32x16;
let xs = stride as isize;
let xs2 = xs * 2;
let xs3 = xs * 3;
let xs4 = xs * 4;
let reach = 4 * stride + 4;
assert!(center >= reach && center + reach + 16 <= data.len());
macro_rules! ld {
($off:expr) => {{
let o: isize = $off;
let start = (center as isize + o) as usize;
f32x16::load(token, load_16(data, start))
}};
}
let mut r = f32x16::splat(token, 0.0);
{
let s = ld!(-4) + ld!(-2) + ld!(0) + ld!(2) + ld!(4);
r += s * s;
}
{
let s = ld!(-xs4) + ld!(-xs2) + ld!(0) + ld!(xs2) + ld!(xs4);
r += s * s;
}
{
let s = ld!(-xs3 - 3) + ld!(-xs2 - 2) + ld!(0) + ld!(xs2 + 2) + ld!(xs3 + 3);
r += s * s;
}
{
let s = ld!(-xs3 + 3) + ld!(-xs2 + 2) + ld!(0) + ld!(xs2 - 2) + ld!(xs3 - 3);
r += s * s;
}
{
let s = ld!(-xs4 + 1) + ld!(-xs2 + 1) + ld!(0) + ld!(xs2 - 1) + ld!(xs4 - 1);
r += s * s;
}
{
let s = ld!(-xs4 - 1) + ld!(-xs2 - 1) + ld!(0) + ld!(xs2 + 1) + ld!(xs4 + 1);
r += s * s;
}
{
let s = ld!(-4 - xs) + ld!(-2 - xs) + ld!(0) + ld!(2 + xs) + ld!(4 + xs);
r += s * s;
}
{
let s = ld!(-4 + xs) + ld!(-2 + xs) + ld!(0) + ld!(2 - xs) + ld!(4 - xs);
r += s * s;
}
{
let s = ld!(-xs3 - 2) + ld!(-xs2 - 1) + ld!(0) + ld!(xs2 + 1) + ld!(xs3 + 2);
r += s * s;
}
{
let s = ld!(-xs3 + 2) + ld!(-xs2 + 1) + ld!(0) + ld!(xs2 - 1) + ld!(xs3 - 2);
r += s * s;
}
{
let s = ld!(-xs2 - 3) + ld!(-xs - 2) + ld!(0) + ld!(xs + 2) + ld!(xs2 + 3);
r += s * s;
}
{
let s = ld!(-xs2 + 3) + ld!(-xs + 2) + ld!(0) + ld!(xs - 2) + ld!(xs2 - 3);
r += s * s;
}
{
let s = ld!(-4 + xs2) + ld!(-2 + xs) + ld!(0) + ld!(2 - xs) + ld!(4 - xs2);
r += s * s;
}
{
let s = ld!(-4 - xs2) + ld!(-2 - xs) + ld!(0) + ld!(2 + xs) + ld!(4 + xs2);
r += s * s;
}
{
let s = ld!(-xs4 - 2) + ld!(-xs2 - 1) + ld!(0) + ld!(xs2 + 1) + ld!(xs4 + 2);
r += s * s;
}
{
let s = ld!(-xs4 + 2) + ld!(-xs2 + 1) + ld!(0) + ld!(xs2 - 1) + ld!(xs4 - 2);
r += s * s;
}
r
}
/// `X64V4Token` variant of the diff-map driver.
///
/// Hands `malta_diff_map_impl` a row kernel that evaluates sixteen output columns per step
/// with the `_16x_v4` vector kernels and finishes the remaining `count % 16` columns with
/// the scalar kernels.
#[allow(clippy::too_many_arguments)]
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
fn malta_diff_map_dispatch_v4(
    token: archmage::X64V4Token,
    lum0: &ImageF,
    lum1: &ImageF,
    w_0gt1: f64,
    w_0lt1: f64,
    norm1: f64,
    use_lf: bool,
    pool: &BufferPool,
) -> ImageF {
    let interior = |data: &[f32],
                    center_base: usize,
                    stride: usize,
                    count: usize,
                    use_lf: bool,
                    out: &mut [f32]| {
        // Columns `[0, vector_end)` are covered by whole sixteen-lane steps.
        let vector_end = count - count % 16;
        for x in (0..vector_end).step_by(16) {
            let center = center_base + x;
            let lanes = if use_lf {
                malta_unit_lf_interior_16x_v4(token, data, center, stride)
            } else {
                malta_unit_interior_16x_v4(token, data, center, stride)
            };
            lanes.store((&mut out[x..x + 16]).try_into().unwrap());
        }
        // Scalar tail for the columns that do not fill a whole vector.
        let mut x = vector_end;
        while x < count {
            let center = center_base + x;
            out[x] = if use_lf {
                malta_unit_lf_interior(data, center, stride)
            } else {
                malta_unit_interior(data, center, stride)
            };
            x += 1;
        }
    };
    malta_diff_map_impl(lum0, lum1, w_0gt1, w_0lt1, norm1, use_lf, pool, interior)
}
/// Eight-lane dense Malta response; lane `i` is evaluated for the sample at
/// `data[center + i]`.
///
/// `data` is treated as a row-major buffer with `stride` elements per row and `token` is
/// handed to every vector constructor. Sixteen line-shaped sample sets are each summed,
/// squared and accumulated; the last four repeat sets 8, 7, 6 and 5.
///
/// # Panics
///
/// Panics unless `center >= 4 * stride + 4` and `center + 4 * stride + 4 + 8 <= data.len()`.
#[cfg(target_arch = "aarch64")]
#[archmage::rite]
fn malta_unit_interior_8x_neon(
    token: archmage::NeonToken,
    data: &[f32],
    center: usize,
    stride: usize,
) -> magetypes::simd::f32x8 {
    use magetypes::simd::f32x8;
    // Signed offsets of one to four rows.
    let s1 = stride as isize;
    let s2 = s1 * 2;
    let s3 = s1 * 3;
    let s4 = s1 * 4;
    // Farthest distance from `center` at which any load below can start.
    let reach = 4 * stride + 4;
    assert!(center >= reach && center + reach + 8 <= data.len());
    // Loads eight consecutive samples starting `$off` elements away from `center`.
    macro_rules! ld {
        ($off:expr) => {{
            let o: isize = $off;
            let start = (center as isize + o) as usize;
            f32x8::load(token, load_8(data, start))
        }};
    }
    let mut acc = f32x8::splat(token, 0.0);
    // Adds the loads at the listed offsets left to right, squares the sum per lane and
    // accumulates it into `acc`.
    macro_rules! accumulate {
        ($first:expr $(, $rest:expr)*) => {{
            let line_sum = ld!($first) $(+ ld!($rest))*;
            acc += line_sum * line_sum;
        }};
    }
    // 1-2: horizontal and vertical.
    accumulate!(-4, -3, -2, -1, 0, 1, 2, 3, 4);
    accumulate!(-s4, -s3, -s2, -s1, 0, s1, s2, s3, s4);
    // 3-4: the two diagonals.
    accumulate!(-s3 - 3, -s2 - 2, -s1 - 1, 0, s1 + 1, s2 + 2, s3 + 3);
    accumulate!(-s3 + 3, -s2 + 2, -s1 + 1, 0, s1 - 1, s2 - 2, s3 - 3);
    // 5-6: near-vertical lines drifting one column.
    accumulate!(-s4 + 1, -s3 + 1, -s2 + 1, -s1, 0, s1, s2 - 1, s3 - 1, s4 - 1);
    accumulate!(-s4 - 1, -s3 - 1, -s2 - 1, -s1, 0, s1, s2 + 1, s3 + 1, s4 + 1);
    // 7-8: near-horizontal lines drifting one row.
    accumulate!(-4 - s1, -3 - s1, -2 - s1, -1, 0, 1, 2 + s1, 3 + s1, 4 + s1);
    accumulate!(-4 + s1, -3 + s1, -2 + s1, -1, 0, 1, 2 - s1, 3 - s1, 4 - s1);
    // 9-12: seven-sample lines between the axes and the diagonals.
    accumulate!(-s3 - 2, -s2 - 1, -s1 - 1, 0, s1 + 1, s2 + 1, s3 + 2);
    accumulate!(-s3 + 2, -s2 + 1, -s1 + 1, 0, s1 - 1, s2 - 1, s3 - 2);
    accumulate!(-s2 - 3, -s1 - 2, -s1 - 1, 0, s1 + 1, s1 + 2, s2 + 3);
    accumulate!(-s2 + 3, -s1 + 2, -s1 + 1, 0, s1 - 1, s1 - 2, s2 - 3);
    // 13-16: same sample sets as lines 8, 7, 6 and 5 (in that order).
    accumulate!(-4 + s1, -3 + s1, -2 + s1, -1, 0, 1, 2 - s1, 3 - s1, 4 - s1);
    accumulate!(-4 - s1, -3 - s1, -2 - s1, -1, 0, 1, 2 + s1, 3 + s1, 4 + s1);
    accumulate!(-s4 - 1, -s3 - 1, -s2 - 1, -s1, 0, s1, s2 + 1, s3 + 1, s4 + 1);
    accumulate!(-s4 + 1, -s3 + 1, -s2 + 1, -s1, 0, s1, s2 - 1, s3 - 1, s4 - 1);
    acc
}
/// Eight-lane `lf` Malta response; lane `i` is evaluated for the sample at
/// `data[center + i]`.
///
/// `data` is treated as a row-major buffer with `stride` elements per row and `token` is
/// handed to every vector constructor. Sixteen five-sample sets are each summed, squared
/// and accumulated.
///
/// # Panics
///
/// Panics unless `center >= 4 * stride + 4` and `center + 4 * stride + 4 + 8 <= data.len()`.
#[cfg(target_arch = "aarch64")]
#[archmage::rite]
fn malta_unit_lf_interior_8x_neon(
    token: archmage::NeonToken,
    data: &[f32],
    center: usize,
    stride: usize,
) -> magetypes::simd::f32x8 {
    use magetypes::simd::f32x8;
    // Signed offsets of one to four rows.
    let s1 = stride as isize;
    let s2 = s1 * 2;
    let s3 = s1 * 3;
    let s4 = s1 * 4;
    // Farthest distance from `center` at which any load below can start.
    let reach = 4 * stride + 4;
    assert!(center >= reach && center + reach + 8 <= data.len());
    // Loads eight consecutive samples starting `$off` elements away from `center`.
    macro_rules! ld {
        ($off:expr) => {{
            let o: isize = $off;
            let start = (center as isize + o) as usize;
            f32x8::load(token, load_8(data, start))
        }};
    }
    let mut acc = f32x8::splat(token, 0.0);
    // Adds the loads at the listed offsets left to right, squares the sum per lane and
    // accumulates it into `acc`.
    macro_rules! accumulate {
        ($first:expr $(, $rest:expr)*) => {{
            let line_sum = ld!($first) $(+ ld!($rest))*;
            acc += line_sum * line_sum;
        }};
    }
    // 1-2: horizontal and vertical, every second sample.
    accumulate!(-4, -2, 0, 2, 4);
    accumulate!(-s4, -s2, 0, s2, s4);
    // 3-4: the two diagonals.
    accumulate!(-s3 - 3, -s2 - 2, 0, s2 + 2, s3 + 3);
    accumulate!(-s3 + 3, -s2 + 2, 0, s2 - 2, s3 - 3);
    // 5-6: near-vertical lines drifting one column.
    accumulate!(-s4 + 1, -s2 + 1, 0, s2 - 1, s4 - 1);
    accumulate!(-s4 - 1, -s2 - 1, 0, s2 + 1, s4 + 1);
    // 7-8: near-horizontal lines drifting one row.
    accumulate!(-4 - s1, -2 - s1, 0, 2 + s1, 4 + s1);
    accumulate!(-4 + s1, -2 + s1, 0, 2 - s1, 4 - s1);
    // 9-12: lines between the axes and the diagonals.
    accumulate!(-s3 - 2, -s2 - 1, 0, s2 + 1, s3 + 2);
    accumulate!(-s3 + 2, -s2 + 1, 0, s2 - 1, s3 - 2);
    accumulate!(-s2 - 3, -s1 - 2, 0, s1 + 2, s2 + 3);
    accumulate!(-s2 + 3, -s1 + 2, 0, s1 - 2, s2 - 3);
    // 13-16: lines with a 2:1 / 1:2 slope reaching the neighbourhood edge.
    accumulate!(-4 + s2, -2 + s1, 0, 2 - s1, 4 - s2);
    accumulate!(-4 - s2, -2 - s1, 0, 2 + s1, 4 + s2);
    accumulate!(-s4 - 2, -s2 - 1, 0, s2 + 1, s4 + 2);
    accumulate!(-s4 + 2, -s2 + 1, 0, s2 - 1, s4 - 2);
    acc
}
/// `NeonToken` variant of the diff-map driver.
///
/// Hands `malta_diff_map_impl` a row kernel that evaluates eight output columns per step
/// with the `_8x_neon` vector kernels and finishes the remaining `count % 8` columns with
/// the scalar kernels.
#[cfg(target_arch = "aarch64")]
#[archmage::arcane]
#[allow(clippy::too_many_arguments)]
fn malta_diff_map_dispatch_neon(
    token: archmage::NeonToken,
    lum0: &ImageF,
    lum1: &ImageF,
    w_0gt1: f64,
    w_0lt1: f64,
    norm1: f64,
    use_lf: bool,
    pool: &BufferPool,
) -> ImageF {
    let interior = |data: &[f32],
                    center_base: usize,
                    stride: usize,
                    count: usize,
                    use_lf: bool,
                    out: &mut [f32]| {
        // Columns `[0, vector_end)` are covered by whole eight-lane steps.
        let vector_end = count - count % 8;
        for x in (0..vector_end).step_by(8) {
            let center = center_base + x;
            let lanes = if use_lf {
                malta_unit_lf_interior_8x_neon(token, data, center, stride)
            } else {
                malta_unit_interior_8x_neon(token, data, center, stride)
            };
            lanes.store((&mut out[x..x + 8]).try_into().unwrap());
        }
        // Scalar tail for the columns that do not fill a whole vector.
        let mut x = vector_end;
        while x < count {
            let center = center_base + x;
            out[x] = if use_lf {
                malta_unit_lf_interior(data, center, stride)
            } else {
                malta_unit_interior(data, center, stride)
            };
            x += 1;
        }
    };
    malta_diff_map_impl(lum0, lum1, w_0gt1, w_0lt1, norm1, use_lf, pool, interior)
}
/// High-frequency Malta response for 8 adjacent pixels (WASM SIMD128).
///
/// Every load reads the 8 consecutive samples starting at `center + offset`,
/// so the returned vector holds one result per pixel starting at `center`.
/// For each of the 16 line kernels the samples along the line are summed
/// (left to right, in the order listed) and the square of that sum is
/// accumulated.
///
/// # Panics
///
/// Panics unless a margin of `4 * stride + 4` elements exists before
/// `center` and after `center + 8` inside `data`.
#[cfg(target_arch = "wasm32")]
#[archmage::rite]
fn malta_unit_interior_8x_wasm128(
    token: archmage::Wasm128Token,
    data: &[f32],
    center: usize,
    stride: usize,
) -> magetypes::simd::f32x8 {
    use magetypes::simd::f32x8;

    // Element offsets for moving 1..=4 rows down.
    let dy = stride as isize;
    let dy2 = dy * 2;
    let dy3 = dy * 3;
    let dy4 = dy * 4;

    // Farthest offset any kernel touches is +/- (4 rows + 4 columns).
    let reach = 4 * stride + 4;
    assert!(center >= reach && center + reach + 8 <= data.len());

    // Loads the 8 samples starting at `center + offset`.
    macro_rules! at {
        ($offset:expr) => {{
            let delta: isize = $offset;
            f32x8::load(token, load_8(data, (center as isize + delta) as usize))
        }};
    }

    let mut acc = f32x8::splat(token, 0.0);

    // Sums the samples at the given offsets (left to right) and adds the
    // squared sum to `acc`.
    macro_rules! kernel {
        ($head:expr $(, $tail:expr)*) => {{
            let sum = at!($head) $(+ at!($tail))*;
            acc += sum * sum;
        }};
    }

    // 1-2: horizontal and vertical lines.
    kernel!(-4, -3, -2, -1, 0, 1, 2, 3, 4);
    kernel!(-dy4, -dy3, -dy2, -dy, 0, dy, dy2, dy3, dy4);

    // 3-4: the two 45-degree diagonals.
    kernel!(-dy3 - 3, -dy2 - 2, -dy - 1, 0, dy + 1, dy2 + 2, dy3 + 3);
    kernel!(-dy3 + 3, -dy2 + 2, -dy + 1, 0, dy - 1, dy2 - 2, dy3 - 3);

    // 5-6: near-vertical lines drifting one column.
    kernel!(-dy4 + 1, -dy3 + 1, -dy2 + 1, -dy, 0, dy, dy2 - 1, dy3 - 1, dy4 - 1);
    kernel!(-dy4 - 1, -dy3 - 1, -dy2 - 1, -dy, 0, dy, dy2 + 1, dy3 + 1, dy4 + 1);

    // 7-8: near-horizontal lines drifting one row.
    kernel!(-4 - dy, -3 - dy, -2 - dy, -1, 0, 1, 2 + dy, 3 + dy, 4 + dy);
    kernel!(-4 + dy, -3 + dy, -2 + dy, -1, 0, 1, 2 - dy, 3 - dy, 4 - dy);

    // 9-10: steep lines spanning 3 rows and 2 columns each way.
    kernel!(-dy3 - 2, -dy2 - 1, -dy - 1, 0, dy + 1, dy2 + 1, dy3 + 2);
    kernel!(-dy3 + 2, -dy2 + 1, -dy + 1, 0, dy - 1, dy2 - 1, dy3 - 2);

    // 11-12: shallow lines spanning 2 rows and 3 columns each way.
    kernel!(-dy2 - 3, -dy - 2, -dy - 1, 0, dy + 1, dy + 2, dy2 + 3);
    kernel!(-dy2 + 3, -dy + 2, -dy + 1, 0, dy - 1, dy - 2, dy2 - 3);

    // 13-16 use exactly the same offsets as kernels 8, 7, 6 and 5.
    // NOTE(review): presumably kept to mirror the reference implementation;
    // confirm before deduplicating, since that would change the result.
    kernel!(-4 + dy, -3 + dy, -2 + dy, -1, 0, 1, 2 - dy, 3 - dy, 4 - dy);
    kernel!(-4 - dy, -3 - dy, -2 - dy, -1, 0, 1, 2 + dy, 3 + dy, 4 + dy);
    kernel!(-dy4 - 1, -dy3 - 1, -dy2 - 1, -dy, 0, dy, dy2 + 1, dy3 + 1, dy4 + 1);
    kernel!(-dy4 + 1, -dy3 + 1, -dy2 + 1, -dy, 0, dy, dy2 - 1, dy3 - 1, dy4 - 1);

    acc
}
/// Low-frequency Malta response for 8 adjacent pixels (WASM SIMD128).
///
/// Same scheme as the high-frequency kernel, but each of the 16 lines uses
/// only 5 samples. Every load reads the 8 consecutive samples starting at
/// `center + offset`, so the returned vector holds one result per pixel
/// starting at `center`.
///
/// # Panics
///
/// Panics unless a margin of `4 * stride + 4` elements exists before
/// `center` and after `center + 8` inside `data`.
#[cfg(target_arch = "wasm32")]
#[archmage::rite]
fn malta_unit_lf_interior_8x_wasm128(
    token: archmage::Wasm128Token,
    data: &[f32],
    center: usize,
    stride: usize,
) -> magetypes::simd::f32x8 {
    use magetypes::simd::f32x8;

    // Element offsets for moving 1..=4 rows down.
    let dy = stride as isize;
    let dy2 = dy * 2;
    let dy3 = dy * 3;
    let dy4 = dy * 4;

    // Farthest offset any kernel touches is +/- (4 rows + 4 columns).
    let reach = 4 * stride + 4;
    assert!(center >= reach && center + reach + 8 <= data.len());

    // Loads the 8 samples starting at `center + offset`.
    macro_rules! at {
        ($offset:expr) => {{
            let delta: isize = $offset;
            f32x8::load(token, load_8(data, (center as isize + delta) as usize))
        }};
    }

    let mut acc = f32x8::splat(token, 0.0);

    // Sums the samples at the given offsets (left to right) and adds the
    // squared sum to `acc`.
    macro_rules! kernel {
        ($head:expr $(, $tail:expr)*) => {{
            let sum = at!($head) $(+ at!($tail))*;
            acc += sum * sum;
        }};
    }

    // 1-2: horizontal and vertical lines, every second sample.
    kernel!(-4, -2, 0, 2, 4);
    kernel!(-dy4, -dy2, 0, dy2, dy4);

    // 3-4: the two 45-degree diagonals (center neighbours skipped).
    kernel!(-dy3 - 3, -dy2 - 2, 0, dy2 + 2, dy3 + 3);
    kernel!(-dy3 + 3, -dy2 + 2, 0, dy2 - 2, dy3 - 3);

    // 5-6: near-vertical lines drifting one column.
    kernel!(-dy4 + 1, -dy2 + 1, 0, dy2 - 1, dy4 - 1);
    kernel!(-dy4 - 1, -dy2 - 1, 0, dy2 + 1, dy4 + 1);

    // 7-8: near-horizontal lines drifting one row.
    kernel!(-4 - dy, -2 - dy, 0, 2 + dy, 4 + dy);
    kernel!(-4 + dy, -2 + dy, 0, 2 - dy, 4 - dy);

    // 9-10: steep lines spanning 3 rows and 2 columns each way.
    kernel!(-dy3 - 2, -dy2 - 1, 0, dy2 + 1, dy3 + 2);
    kernel!(-dy3 + 2, -dy2 + 1, 0, dy2 - 1, dy3 - 2);

    // 11-12: shallow lines spanning 2 rows and 3 columns each way.
    kernel!(-dy2 - 3, -dy - 2, 0, dy + 2, dy2 + 3);
    kernel!(-dy2 + 3, -dy + 2, 0, dy - 2, dy2 - 3);

    // 13-14: shallow lines spanning 2 rows and 4 columns each way.
    kernel!(-4 + dy2, -2 + dy, 0, 2 - dy, 4 - dy2);
    kernel!(-4 - dy2, -2 - dy, 0, 2 + dy, 4 + dy2);

    // 15-16: steep lines spanning 4 rows and 2 columns each way.
    kernel!(-dy4 - 2, -dy2 - 1, 0, dy2 + 1, dy4 + 2);
    kernel!(-dy4 + 2, -dy2 + 1, 0, dy2 - 1, dy4 - 2);

    acc
}
/// WASM SIMD128 entry point for the Malta diff map.
///
/// Builds an "interior" row kernel and hands it, together with all other
/// arguments (forwarded unchanged), to `malta_diff_map_impl`.
///
/// The kernel fills `out[0..count]` for the pixels whose flat indices are
/// `center_base + 0 .. center_base + count`:
/// * full groups of 8 pixels go through the 8-wide SIMD128 kernels,
/// * the remaining `count % 8` pixels use the scalar interior functions.
///
/// `use_lf` selects the low-frequency variant in both paths.
#[cfg(target_arch = "wasm32")]
#[archmage::arcane]
#[allow(clippy::too_many_arguments)]
fn malta_diff_map_dispatch_wasm128(
    token: archmage::Wasm128Token,
    lum0: &ImageF,
    lum1: &ImageF,
    w_0gt1: f64,
    w_0lt1: f64,
    norm1: f64,
    use_lf: bool,
    pool: &BufferPool,
) -> ImageF {
    let interior = |data: &[f32],
                    center_base: usize,
                    stride: usize,
                    count: usize,
                    use_lf: bool,
                    out: &mut [f32]| {
        // Largest multiple of 8 that does not exceed `count`.
        let simd_end = count - count % 8;

        // Vector part: one call produces 8 adjacent outputs.
        for x in (0..simd_end).step_by(8) {
            let center = center_base + x;
            let lanes = if use_lf {
                malta_unit_lf_interior_8x_wasm128(token, data, center, stride)
            } else {
                malta_unit_interior_8x_wasm128(token, data, center, stride)
            };
            lanes.store((&mut out[x..x + 8]).try_into().unwrap());
        }

        // Scalar tail for the leftover (< 8) pixels.
        for x in simd_end..count {
            let center = center_base + x;
            out[x] = if use_lf {
                malta_unit_lf_interior(data, center, stride)
            } else {
                malta_unit_interior(data, center, stride)
            };
        }
    };
    malta_diff_map_impl(lum0, lum1, w_0gt1, w_0lt1, norm1, use_lf, pool, interior)
}
/// Portable (non-SIMD) entry point for the Malta diff map.
///
/// Builds a purely scalar "interior" row kernel and hands it, together with
/// all other arguments (forwarded unchanged), to `malta_diff_map_impl`.
/// The kernel writes one value per pixel into `out[0..count]`, using the
/// low-frequency interior function when `use_lf` is set and the
/// high-frequency one otherwise.
///
/// `_token` is not read; it only selects this function at dispatch time.
#[allow(clippy::too_many_arguments)]
fn malta_diff_map_dispatch_scalar(
    _token: archmage::ScalarToken,
    lum0: &ImageF,
    lum1: &ImageF,
    w_0gt1: f64,
    w_0lt1: f64,
    norm1: f64,
    use_lf: bool,
    pool: &BufferPool,
) -> ImageF {
    let interior = |data: &[f32],
                    center_base: usize,
                    stride: usize,
                    count: usize,
                    use_lf: bool,
                    out: &mut [f32]| {
        // The variant is fixed for the whole row, so branch once up front.
        if use_lf {
            for x in 0..count {
                out[x] = malta_unit_lf_interior(data, center_base + x, stride);
            }
        } else {
            for x in 0..count {
                out[x] = malta_unit_interior(data, center_base + x, stride);
            }
        }
    };
    malta_diff_map_impl(lum0, lum1, w_0gt1, w_0lt1, norm1, use_lf, pool, interior)
}
#[cfg(test)]
mod tests {
    use super::*;

    // Checks that a constant image gives a strictly positive HF response.
    #[test]
    fn test_malta_uniform() {
        let flat = ImageF::filled(32, 32, 1.0);
        let response = malta_unit(&flat, 16, 16);
        assert!(response > 0.0, "Malta should be positive for uniform image");
    }

    // Checks that a vertical step edge at x == 16 changes the response
    // compared with a pixel well inside the flat left half.
    #[test]
    fn test_malta_edge() {
        let mut img = ImageF::new(32, 32);
        for y in 0..32 {
            for x in 0..32 {
                let value = if x < 16 { 0.0 } else { 1.0 };
                img.set(x, y, value);
            }
        }

        let edge = malta_unit(&img, 16, 16);
        let uniform = malta_unit(&img, 8, 16);
        assert!(
            (edge - uniform).abs() > 1e-6,
            "Malta should differ at edge vs uniform region"
        );
    }

    // Checks that diffing an image against itself sums to (almost) zero.
    #[test]
    fn test_malta_diff_map_identical() {
        let img = ImageF::filled(32, 32, 0.5);
        let pool = BufferPool::new();
        let diff_map = malta_diff_map(&img, &img, 1.0, 1.0, 1.0, false, &pool);

        let mut sum = 0.0;
        for y in 0..32 {
            for x in 0..32 {
                sum += diff_map.get(x, y);
            }
        }
        assert!(sum.abs() < 1e-6, "Identical images should have zero diff");
    }

    // Checks that on a constant image the LF response is positive but
    // smaller than the HF response.
    #[test]
    fn test_malta_lf_smaller() {
        let flat = ImageF::filled(32, 32, 1.0);
        let hf = malta_unit(&flat, 16, 16);
        let lf = malta_unit_lf(&flat, 16, 16);
        assert!(hf > 0.0);
        assert!(lf > 0.0);
        assert!(lf < hf, "LF should have smaller response for uniform image");
    }

    // Despite the name, this only checks that the response on a diagonal
    // sawtooth pattern is non-negative.
    #[test]
    fn test_malta_fast_vs_slow() {
        let mut img = ImageF::new(32, 32);
        for y in 0..32 {
            for x in 0..32 {
                let value = ((x + y) % 10) as f32 * 0.1;
                img.set(x, y, value);
            }
        }

        let fast_result = malta_unit(&img, 16, 16);
        assert!(fast_result >= 0.0, "Malta result should be non-negative");
    }

    // Checks that the coordinate-based functions and the flat-index interior
    // functions agree for x, y in 5..27, for both HF and LF variants.
    #[test]
    fn test_interior_vs_window() {
        let mut img = ImageF::new(32, 32);
        for y in 0..32 {
            for x in 0..32 {
                let value = ((x * 7 + y * 13) % 100) as f32 * 0.01;
                img.set(x, y, value);
            }
        }
        let (data, stride) = (img.data(), img.stride());

        // High-frequency variant.
        for y in 5..27 {
            for x in 5..27 {
                let center = y * stride + x;
                let window_result = malta_unit(&img, x, y);
                let interior_result = malta_unit_interior(data, center, stride);
                let diff = (window_result - interior_result).abs();
                assert!(
                    diff < 1e-6,
                    "HF mismatch at ({x}, {y}): window={window_result}, interior={interior_result}, diff={diff}"
                );
            }
        }

        // Low-frequency variant.
        for y in 5..27 {
            for x in 5..27 {
                let center = y * stride + x;
                let window_result = malta_unit_lf(&img, x, y);
                let interior_result = malta_unit_lf_interior(data, center, stride);
                let diff = (window_result - interior_result).abs();
                assert!(
                    diff < 1e-6,
                    "LF mismatch at ({x}, {y}): window={window_result}, interior={interior_result}, diff={diff}"
                );
            }
        }
    }
}