#![allow(dead_code)]
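//! SIMD kernels for the adaptive-quantization (AQ) field: pre-erosion
//! masking, fuzzy erosion, and per-block modulations, with generic
//! `magetypes` paths plus hand-written x86-64 intrinsics.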
use crate::foundation::aligned_alloc::{AlignedVec, AllocError, try_alloc_zeroed};
use archmage::prelude::*;
use magetypes::simd::backends::F32x8Backend;
use magetypes::simd::generic::f32x8 as GenericF32x8;
#[inline(always)]
fn load_f32x8<T: F32x8Backend>(token: T, slice: &[f32], offset: usize) -> GenericF32x8<T> {
GenericF32x8::<T>::load(token, slice[offset..offset + 8].try_into().unwrap())
}
#[inline(always)]
fn store_f32x8<T: F32x8Backend>(slice: &mut [f32], offset: usize, value: GenericF32x8<T>) {
value.store(<&mut [f32; 8]>::try_from(&mut slice[offset..offset + 8]).unwrap())
}
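// Constants for the ratio-of-derivatives masking curve. The *_RATIO values
// fold the 1/255 input scaling into the rational function so the kernels can
// operate directly on 0..255 sample values; the magic numbers mirror the
// libjxl-style adaptive-quantization model.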
const K_INPUT_SCALING: f32 = 1.0 / 255.0;
const K_EPSILON_RATIO: f32 = 1e-2;
const K_NUM_OFFSET_RATIO: f32 = K_EPSILON_RATIO / K_INPUT_SCALING / K_INPUT_SCALING;
const K_SG_MUL: f32 = 226.0480446705883;
const K_SG_MUL2: f32 = 1.0 / 73.377132366608819;
const K_INV_LOG2E: f32 = 0.6931471805599453;
const K_SG_RET_MUL: f32 = K_SG_MUL2 * 18.6580932135 * K_INV_LOG2E;
const K_NUM_MUL_RATIO: f32 = K_SG_RET_MUL * 3.0 * K_SG_MUL;
const K_SG_VOFFSET: f32 = 7.14672470003;
const K_VOFFSET_RATIO: f32 = (K_SG_VOFFSET * K_INV_LOG2E + K_EPSILON_RATIO) / K_INPUT_SCALING;
const K_DEN_MUL_RATIO: f32 = K_INV_LOG2E * K_SG_MUL * K_INPUT_SCALING * K_INPUT_SCALING;
const K_MASKING_LOG_OFFSET: f32 = 28.0;
const K_MASKING_MUL: f32 = 211.50759899638012;
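/// Scalar reference for the SIMD kernels below: evaluates the rational
/// masking function den(v)/num(v) for `v = max(val, 0)`, or num(v)/den(v)
/// when `invert` is set, guarding against a zero denominator.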
#[inline]
fn ratio_of_derivatives_scalar(val: f32, invert: bool) -> f32 {
let v = val.max(0.0);
let v2 = v * v;
let num = v2.mul_add(K_NUM_MUL_RATIO, K_NUM_OFFSET_RATIO);
let den = (v * K_DEN_MUL_RATIO).mul_add(v2, K_VOFFSET_RATIO);
let safe_den = if den == 0.0 { 1e-9 } else { den };
if invert {
num / safe_den
} else {
safe_den / num
}
}
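/// Scalar reference for [`masking_sqrt_x8_generic`]:
/// `0.25 * sqrt(v * k + offset)`.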
#[inline]
fn masking_sqrt_scalar(v: f32) -> f32 {
    // The method chain binds to `v`, so this computes
    // 0.25 * sqrt(v.mul_add(k, offset)).
    0.25 * v
        .mul_add((K_MASKING_MUL * 1e8_f32).sqrt(), K_MASKING_LOG_OFFSET)
        .sqrt()
}
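/// Eight-lane version of `ratio_of_derivatives_scalar(.., false)`.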
#[inline(always)]
fn ratio_of_derivatives_x8_generic<T: F32x8Backend>(
token: T,
vals: GenericF32x8<T>,
) -> GenericF32x8<T> {
let v = vals.max(GenericF32x8::<T>::zero(token));
let v2 = v * v;
let num = v2.mul_add(
GenericF32x8::<T>::splat(token, K_NUM_MUL_RATIO),
GenericF32x8::<T>::splat(token, K_NUM_OFFSET_RATIO),
);
let den = (v * GenericF32x8::<T>::splat(token, K_DEN_MUL_RATIO))
.mul_add(v2, GenericF32x8::<T>::splat(token, K_VOFFSET_RATIO));
den / num
}
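/// Eight-lane version of `ratio_of_derivatives_scalar(.., true)`.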
#[inline(always)]
fn ratio_of_derivatives_inv_x8_generic<T: F32x8Backend>(
token: T,
vals: GenericF32x8<T>,
) -> GenericF32x8<T> {
let v = vals.max(GenericF32x8::<T>::zero(token));
let v2 = v * v;
let num = v2.mul_add(
GenericF32x8::<T>::splat(token, K_NUM_MUL_RATIO),
GenericF32x8::<T>::splat(token, K_NUM_OFFSET_RATIO),
);
let den = (v * GenericF32x8::<T>::splat(token, K_DEN_MUL_RATIO))
.mul_add(v2, GenericF32x8::<T>::splat(token, K_VOFFSET_RATIO));
num / den
}
#[inline(always)]
fn masking_sqrt_x8_generic<T: F32x8Backend>(token: T, v: GenericF32x8<T>) -> GenericF32x8<T> {
let k_mul_sqrt = GenericF32x8::<T>::splat(token, (K_MASKING_MUL * 1e8_f32).sqrt());
let k_offset = GenericF32x8::<T>::splat(token, K_MASKING_LOG_OFFSET);
GenericF32x8::<T>::splat(token, 0.25) * v.mul_add(k_mul_sqrt, k_offset).sqrt()
}
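// Clamp for the squared masking difference, and the gamma offset (0.019 in
// the 0..1 domain, rescaled to the 0..255 input domain) applied before the
// ratio-of-derivatives curve.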
const LIMIT: f32 = 0.2;
const MATCH_GAMMA_OFFSET: f32 = 0.019;
const GAMMA_OFFSET: f32 = MATCH_GAMMA_OFFSET / K_INPUT_SCALING;
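/// One 8-pixel step of the pre-erosion masking term: the difference of each
/// pixel from the average of its four neighbors is weighted by the ratio of
/// derivatives, squared, clamped to `LIMIT`, and passed through the masking
/// square root.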
#[inline(always)]
fn pre_erosion_pixel_x8_generic<T: F32x8Backend>(
token: T,
pixels: GenericF32x8<T>,
left: GenericF32x8<T>,
right: GenericF32x8<T>,
top: GenericF32x8<T>,
bottom: GenericF32x8<T>,
) -> GenericF32x8<T> {
let base = GenericF32x8::<T>::splat(token, 0.25) * (left + right + top + bottom);
let ratio = ratio_of_derivatives_x8_generic(
token,
pixels + GenericF32x8::<T>::splat(token, GAMMA_OFFSET),
);
let diff = ratio * (pixels - base);
let diff_sq = (diff * diff).min(GenericF32x8::<T>::splat(token, LIMIT));
masking_sqrt_x8_generic(token, diff_sq)
}
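/// Computes the pre-erosion masking term for one row and accumulates it into
/// `output`. Edge pixels reuse their own value for the missing neighbor.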
pub fn pre_erosion_row(row: &[f32], row_above: &[f32], row_below: &[f32], output: &mut [f32]) {
incant!(pre_erosion_row_impl(row, row_above, row_below, output));
}
#[magetypes(v3, neon, wasm128, scalar)]
#[inline(always)]
fn pre_erosion_row_impl(
token: Token,
row: &[f32],
row_above: &[f32],
row_below: &[f32],
output: &mut [f32],
) {
#[allow(non_camel_case_types)]
type f32x8 = GenericF32x8<Token>;
let width = row.len();
assert_eq!(row_above.len(), width);
assert_eq!(row_below.len(), width);
assert_eq!(output.len(), width);
if width == 0 {
return;
}
let chunks = width / 8;
for chunk in 0..chunks {
let x = chunk * 8;
let pixels = load_f32x8(token, row, x);
let left = if x == 0 {
f32x8::from_array(
token,
[
row[0],
row[x],
row[x + 1],
row[x + 2],
row[x + 3],
row[x + 4],
row[x + 5],
row[x + 6],
],
)
} else {
load_f32x8(token, row, x - 1)
};
let right = if x + 8 >= width {
let last = width - 1;
f32x8::from_array(
token,
[
row[(x + 1).min(last)],
row[(x + 2).min(last)],
row[(x + 3).min(last)],
row[(x + 4).min(last)],
row[(x + 5).min(last)],
row[(x + 6).min(last)],
row[(x + 7).min(last)],
row[(x + 8).min(last)],
],
)
} else {
load_f32x8(token, row, x + 1)
};
let top = load_f32x8(token, row_above, x);
let bottom = load_f32x8(token, row_below, x);
let result = pre_erosion_pixel_x8_generic(token, pixels, left, right, top, bottom);
let existing = load_f32x8(token, output, x);
store_f32x8(output, x, existing + result);
}
for x in (chunks * 8)..width {
let pixel = row[x];
let left_val = if x == 0 { row[0] } else { row[x - 1] };
let right_val = if x == width - 1 {
row[width - 1]
} else {
row[x + 1]
};
let top_val = row_above[x];
let bottom_val = row_below[x];
let base = 0.25 * (left_val + right_val + top_val + bottom_val);
let ratio = ratio_of_derivatives_scalar(pixel + GAMMA_OFFSET, false);
let diff = ratio * (pixel - base);
let diff_sq = (diff * diff).min(LIMIT);
let masked = masking_sqrt_scalar(diff_sq);
output[x] += masked;
}
}
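/// Variant of [`pre_erosion_row`] for rows stored with one pixel of padding
/// on each side, which removes the edge special-casing from the inner loop.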
pub fn pre_erosion_row_padded(
row: &[f32],
row_above: &[f32],
row_below: &[f32],
width: usize,
output: &mut [f32],
) {
incant!(pre_erosion_row_padded_impl(
row, row_above, row_below, width, output
));
}
#[magetypes(v3, neon, wasm128, scalar)]
#[inline(always)]
fn pre_erosion_row_padded_impl(
token: Token,
row: &[f32],
row_above: &[f32],
row_below: &[f32],
width: usize,
output: &mut [f32],
) {
debug_assert_eq!(row.len(), width + 2);
debug_assert_eq!(row_above.len(), width + 2);
debug_assert_eq!(row_below.len(), width + 2);
debug_assert_eq!(output.len(), width);
if width == 0 {
return;
}
let chunks = width / 8;
for chunk in 0..chunks {
let x = chunk * 8;
let buf_x = x + 1;
let pixels = load_f32x8(token, row, buf_x);
        let left = load_f32x8(token, row, buf_x - 1);
        let right = load_f32x8(token, row, buf_x + 1);
        let top = load_f32x8(token, row_above, buf_x);
let bottom = load_f32x8(token, row_below, buf_x);
let result = pre_erosion_pixel_x8_generic(token, pixels, left, right, top, bottom);
let existing = load_f32x8(token, output, x);
store_f32x8(output, x, existing + result);
}
for x in (chunks * 8)..width {
let buf_x = x + 1;
let pixel = row[buf_x];
        let left_val = row[buf_x - 1];
        let right_val = row[buf_x + 1];
        let top_val = row_above[buf_x];
let bottom_val = row_below[buf_x];
let base = 0.25 * (left_val + right_val + top_val + bottom_val);
let ratio = ratio_of_derivatives_scalar(pixel + GAMMA_OFFSET, false);
let diff = ratio * (pixel - base);
let diff_sq = (diff * diff).min(LIMIT);
let masked = masking_sqrt_scalar(diff_sq);
output[x] += masked;
}
}
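/// Builds the 4x-downsampled pre-erosion map for a whole image: the per-pixel
/// masking term is accumulated over each 4x4 block and scaled by 0.25.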
pub fn compute_pre_erosion_simd(
input: &[f32],
width: usize,
height: usize,
) -> Result<AlignedVec<f32>, AllocError> {
let pre_erosion_w = (width + 3) / 4;
let pre_erosion_h = (height + 3) / 4;
let mut pre_erosion = try_alloc_zeroed(pre_erosion_w * pre_erosion_h)?;
if width == 0 || height == 0 {
return Ok(pre_erosion);
}
let mut diff_buffer = try_alloc_zeroed(width)?;
for y_block in 0..pre_erosion_h {
diff_buffer.fill(0.0);
for iy in 0..4 {
let y = y_block * 4 + iy;
if y >= height {
continue;
}
let row = &input[y * width..(y + 1) * width];
let row_above = if y == 0 {
row
} else {
&input[(y - 1) * width..y * width]
};
let row_below = if y + 1 >= height {
row
} else {
&input[(y + 1) * width..(y + 2) * width]
};
pre_erosion_row(row, row_above, row_below, &mut diff_buffer);
}
let out_row = &mut pre_erosion[y_block * pre_erosion_w..(y_block + 1) * pre_erosion_w];
downsample_4x_sum(&diff_buffer, out_row);
}
Ok(pre_erosion)
}
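// Sums each group of four inputs and scales by 0.25. The token is unused;
// this path is effectively scalar.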
fn downsample_4x_sum(input: &[f32], output: &mut [f32]) {
incant!(downsample_4x_sum_impl(input, output));
}
#[magetypes(v3, neon, wasm128, scalar)]
#[inline(always)]
fn downsample_4x_sum_impl(token: Token, input: &[f32], output: &mut [f32]) {
    let _ = token;
    let width = input.len();
let out_w = output.len();
let chunks = (width / 32).min(out_w / 8);
for chunk in 0..chunks {
let out_x = chunk * 8;
let in_x = out_x * 4;
let mut sums = [0.0f32; 8];
for i in 0..8 {
let base = in_x + i * 4;
sums[i] = (input[base] + input[base + 1] + input[base + 2] + input[base + 3]) * 0.25;
}
output[out_x..out_x + 8].copy_from_slice(&sums);
}
for out_x in (chunks * 8)..out_w {
let in_x = out_x * 4;
let mut sum = 0.0f32;
for i in 0..4 {
if in_x + i < width {
sum += input[in_x + i];
}
}
output[out_x] = sum * 0.25;
}
}
const K_BIAS: f32 = 0.16 / K_INPUT_SCALING;
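/// Fast 2^x approximation: splits `x` into integer and fractional parts,
/// builds 2^xi directly from the exponent bits, and evaluates 2^xf with a
/// degree-4 polynomial.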
#[inline(always)]
fn fast_exp2(x: f32) -> f32 {
let x = x.clamp(-126.0, 127.0);
let xi = x.floor();
let xf = x - xi;
let p = 1.0
+ xf * (0.6931471805599453 + xf * (0.24022650695910071 + xf * (0.055504108664821579 + xf * 0.009618129107628477)));
let xi_i32 = xi as i32;
let bits = ((xi_i32 + 127) as u32) << 23;
f32::from_bits(bits) * p
}
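/// Fast log2 approximation: extracts the exponent from the float bits and
/// evaluates log2 of the mantissa in [1, 2) with a small polynomial.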
#[inline(always)]
fn fast_log2(x: f32) -> f32 {
if x <= 0.0 {
return f32::NEG_INFINITY;
}
let bits = x.to_bits() as i32;
let e = (bits >> 23) - 127;
let f = f32::from_bits((bits & 0x007FFFFF) as u32 | 0x3F800000);
    // Mantissa in [1, 2); log2(1 + t) is evaluated as t * P(t), with P(t) in
    // Horner form (the `.mul_add` chain binds to the leading coefficient).
    let t = f - 1.0;
    let log2_f = t
        * 0.2885390082_f32
            .mul_add(t, -0.3606737602)
            .mul_add(t, 0.4808983470)
            .mul_add(t, -0.7213475204)
            .mul_add(t, 1.442695041);
e as f32 + log2_f
}
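/// Sums the inverse ratio-of-derivatives term over an (up to) 8x8 block;
/// rows falling outside the image are skipped.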
#[inline(always)]
fn gamma_modulation_sum_8x8_generic<T: F32x8Backend>(
token: T,
block: &[f32],
stride: usize,
_block_x: usize,
block_y: usize,
_img_width: usize,
img_height: usize,
) -> f32 {
let bias = GenericF32x8::<T>::splat(token, K_BIAS);
let mut sum = GenericF32x8::<T>::zero(token);
for dy in 0..8 {
let y = block_y + dy;
if y >= img_height {
continue;
}
let row_start = dy * stride;
if row_start + 8 <= block.len() {
let row = load_f32x8(token, block, row_start);
let ratio = ratio_of_derivatives_inv_x8_generic(token, row + bias);
sum += ratio;
}
}
sum.reduce_add()
}
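/// Sums absolute horizontal (seven per row) and vertical neighbor differences
/// over an 8x8 block; this is the high-frequency modulation term.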
#[inline(always)]
fn hf_modulation_sum_8x8_generic<T: F32x8Backend>(
token: T,
block: &[f32],
stride: usize,
_block_x: usize,
block_y: usize,
_img_width: usize,
img_height: usize,
) -> f32 {
let mask_first_7 =
GenericF32x8::<T>::from_array(token, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0]);
let mut h_sum = GenericF32x8::<T>::zero(token);
let mut v_sum = GenericF32x8::<T>::zero(token);
for dy in 0..8 {
let y = block_y + dy;
if y >= img_height {
continue;
}
let row_start = dy * stride;
if row_start + 9 <= block.len() {
let p = load_f32x8(token, block, row_start);
let p_right = load_f32x8(token, block, row_start + 1);
h_sum += (p - p_right).abs() * mask_first_7;
}
if dy < 7 && y + 1 < img_height {
let next_row_start = (dy + 1) * stride;
if row_start + 8 <= block.len() && next_row_start + 8 <= block.len() {
let p = load_f32x8(token, block, row_start);
let p_below = load_f32x8(token, block, next_row_start);
v_sum += (p - p_below).abs();
}
}
}
h_sum.reduce_add() + v_sum.reduce_add()
}
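/// Applies the per-block HF/gamma modulations to the fuzzy-eroded AQ map,
/// dampening the effect at high quantizer values via a linear ramp.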
pub fn per_block_modulations_simd(
y_quant_01: f32,
input: &[f32],
width: usize,
height: usize,
block_w: usize,
block_h: usize,
aq_map: &mut [f32],
) {
const K_AC_QUANT: f32 = 0.841;
const K_DAMPEN_RAMP_START: f32 = 9.0;
const K_DAMPEN_RAMP_END: f32 = 65.0;
let base_level = 0.48 * K_AC_QUANT;
let dampen = if y_quant_01 >= K_DAMPEN_RAMP_START {
let d =
1.0 - (y_quant_01 - K_DAMPEN_RAMP_START) / (K_DAMPEN_RAMP_END - K_DAMPEN_RAMP_START);
d.max(0.0)
} else {
1.0
};
let mul = K_AC_QUANT * dampen;
let add = (1.0 - dampen) * base_level;
for by in 0..block_h {
let row_start = by * block_w;
let row_end = row_start + block_w;
per_block_modulations_row(
input,
            width,
            width,
            height,
            by,
block_w,
&mut aq_map[row_start..row_end],
mul,
add,
);
}
}
pub fn per_block_modulations_row(
input: &[f32],
stride: usize,
img_width: usize,
img_height: usize,
by: usize,
block_w: usize,
aq_row: &mut [f32],
mul: f32,
add: f32,
) {
incant!(per_block_modulations_row_impl(
input, stride, img_width, img_height, by, block_w, aq_row, mul, add
));
}
#[magetypes(v3, neon, wasm128, scalar)]
#[inline(always)]
fn per_block_modulations_row_impl(
token: Token,
input: &[f32],
stride: usize,
img_width: usize,
img_height: usize,
by: usize,
block_w: usize,
aq_row: &mut [f32],
mul: f32,
add: f32,
) {
const K_SUM_COEFF: f32 = -2.0052193233688884 * K_INPUT_SCALING / 112.0;
const K_GAMMA: f32 = -0.15526878023684174 * K_INV_LOG2E;
const K_SCALE: f32 = K_INPUT_SCALING / 64.0;
const LOG2_E: f32 = 1.442695041;
const K_MASK_BASE: f32 = -0.74174993;
const K_MASK_MUL4: f32 = 3.2353257320940401;
const K_MASK_MUL2: f32 = 12.906028311180409;
const K_MASK_OFFSET2: f32 = 305.04035728311436;
const K_MASK_MUL3: f32 = 5.0220313103171232;
const K_MASK_OFFSET3: f32 = 2.1925739705298404;
const K_MASK_OFFSET4: f32 = 0.25 * K_MASK_OFFSET3;
const K_MASK_MUL0: f32 = 0.74760422233706747;
let y_start = by * 8;
for bx in 0..block_w {
let x_start = bx * 8;
let fuzzy_val = aq_row[bx];
let v1 = (fuzzy_val * K_MASK_MUL0).max(1e-3);
let v2 = 1.0 / (v1 + K_MASK_OFFSET2);
let v3 = 1.0 / (v1 * v1 + K_MASK_OFFSET3);
let v4 = 1.0 / (v1 * v1 + K_MASK_OFFSET4);
let mut out_val = K_MASK_MUL4.mul_add(
v4,
K_MASK_MUL2.mul_add(v2, K_MASK_MUL3.mul_add(v3, K_MASK_BASE)),
);
let block_offset = y_start * stride + x_start;
let block = &input[block_offset..];
let hf_sum = hf_modulation_sum_8x8_generic(
token, block, stride, x_start, y_start, img_width, img_height,
);
out_val += hf_sum * K_SUM_COEFF;
let gamma_sum = gamma_modulation_sum_8x8_generic(
token, block, stride, x_start, y_start, img_width, img_height,
);
let overall_ratio = gamma_sum * K_SCALE;
let log_ratio = if overall_ratio > 0.0 {
fast_log2(overall_ratio)
} else {
0.0
};
out_val += K_GAMMA * log_ratio;
let quant_field = fast_exp2(out_val * LOG2_E).mul_add(mul, add);
aq_row[bx] = quant_field;
}
}
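// Weights applied to the four smallest values of each 3x3 neighborhood
// during fuzzy erosion.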
const FUZZY_MUL0: f32 = 0.125;
const FUZZY_MUL1: f32 = 0.075;
const FUZZY_MUL2: f32 = 0.06;
const FUZZY_MUL3: f32 = 0.05;
#[inline(always)]
fn cas<T: F32x8Backend>(
a: GenericF32x8<T>,
b: GenericF32x8<T>,
) -> (GenericF32x8<T>, GenericF32x8<T>) {
(a.min(b), a.max(b))
}
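/// Compare-exchange network that moves the four smallest of the nine input
/// vectors (lane-wise) into `v[0]..=v[3]`, then combines them with the fuzzy
/// weights.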
#[inline(always)]
fn weighted_min4_of_9_simd<T: F32x8Backend>(
token: T,
mut v: [GenericF32x8<T>; 9],
) -> GenericF32x8<T> {
(v[0], v[1]) = cas(v[0], v[1]);
(v[2], v[3]) = cas(v[2], v[3]);
(v[4], v[5]) = cas(v[4], v[5]);
(v[6], v[7]) = cas(v[6], v[7]);
(v[0], v[2]) = cas(v[0], v[2]);
(v[1], v[3]) = cas(v[1], v[3]);
(v[4], v[6]) = cas(v[4], v[6]);
(v[5], v[7]) = cas(v[5], v[7]);
(v[0], v[4]) = cas(v[0], v[4]);
(v[1], v[5]) = cas(v[1], v[5]);
(v[2], v[6]) = cas(v[2], v[6]);
(v[3], v[7]) = cas(v[3], v[7]);
(v[0], v[8]) = cas(v[0], v[8]);
(v[4], v[8]) = cas(v[4], v[8]);
(v[1], v[2]) = cas(v[1], v[2]);
(v[1], v[4]) = cas(v[1], v[4]);
(v[2], v[4]) = cas(v[2], v[4]);
(v[3], v[4]) = cas(v[3], v[4]);
(v[5], v[8]) = cas(v[5], v[8]);
let mul0 = GenericF32x8::<T>::splat(token, FUZZY_MUL0);
let mul1 = GenericF32x8::<T>::splat(token, FUZZY_MUL1);
let mul2 = GenericF32x8::<T>::splat(token, FUZZY_MUL2);
let mul3 = GenericF32x8::<T>::splat(token, FUZZY_MUL3);
mul0 * v[0] + mul1 * v[1] + mul2 * v[2] + mul3 * v[3]
}
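/// Applies one row of the 3x3 weighted-min fuzzy erosion, with SIMD for the
/// interior and scalar handling for the borders. Returns the row width.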
#[inline]
pub fn fuzzy_erosion_row_simd(
pre_erosion: &[f32],
pre_erosion_w: usize,
y: usize,
max_y: usize,
out: &mut [f32],
) -> usize {
incant!(fuzzy_erosion_row_simd_impl(
pre_erosion,
pre_erosion_w,
y,
max_y,
out
))
}
#[magetypes(v3, neon, wasm128, scalar)]
#[inline(always)]
fn fuzzy_erosion_row_simd_impl(
token: Token,
pre_erosion: &[f32],
pre_erosion_w: usize,
y: usize,
max_y: usize,
out: &mut [f32],
) -> usize {
#[allow(non_camel_case_types)]
type f32x8 = GenericF32x8<Token>;
let row_above_y = (y as isize - 1).clamp(0, max_y as isize) as usize;
let row_below_y = (y + 1).min(max_y);
let row_above = &pre_erosion[row_above_y * pre_erosion_w..];
let row_curr = &pre_erosion[y * pre_erosion_w..];
let row_below = &pre_erosion[row_below_y * pre_erosion_w..];
let mut x = 1;
while x + 8 <= pre_erosion_w.saturating_sub(1) {
        let v = [
            load_f32x8(token, row_above, x - 1),
            load_f32x8(token, row_above, x),
            load_f32x8(token, row_above, x + 1),
            load_f32x8(token, row_curr, x - 1),
            load_f32x8(token, row_curr, x),
            load_f32x8(token, row_curr, x + 1),
            load_f32x8(token, row_below, x - 1),
            load_f32x8(token, row_below, x),
            load_f32x8(token, row_below, x + 1),
        ];
let result = weighted_min4_of_9_simd(token, v);
store_f32x8(out, x, result);
x += 8;
}
while x < pre_erosion_w.saturating_sub(1) {
let x_left = x - 1;
let x_right = x + 1;
let vals = [
row_above[x_left],
row_above[x],
row_above[x_right],
row_curr[x_left],
row_curr[x],
row_curr[x_right],
row_below[x_left],
row_below[x],
row_below[x_right],
];
out[x] = weighted_min4_of_9(vals);
x += 1;
}
{
let vals = [
row_above[0],
row_above[0],
row_above[1.min(pre_erosion_w - 1)],
row_curr[0],
row_curr[0],
row_curr[1.min(pre_erosion_w - 1)],
row_below[0],
row_below[0],
row_below[1.min(pre_erosion_w - 1)],
];
out[0] = weighted_min4_of_9(vals);
}
if pre_erosion_w > 1 {
let x = pre_erosion_w - 1;
let vals = [
row_above[x - 1],
row_above[x],
row_above[x],
row_curr[x - 1],
row_curr[x],
row_curr[x],
row_below[x - 1],
row_below[x],
row_below[x],
];
out[x] = weighted_min4_of_9(vals);
}
pre_erosion_w
}
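/// Scalar reference for [`weighted_min4_of_9_simd`]: partially
/// selection-sorts the four smallest of nine values and combines them with
/// the fuzzy weights.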
#[inline(always)]
fn weighted_min4_of_9(v: [f32; 9]) -> f32 {
let mut a = v;
for i in 0..4 {
for j in (i + 1)..9 {
if a[j] < a[i] {
a.swap(i, j);
}
}
}
FUZZY_MUL0.mul_add(
a[0],
FUZZY_MUL1.mul_add(a[1], FUZZY_MUL2.mul_add(a[2], FUZZY_MUL3 * a[3])),
)
}
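/// Gathers one 8-lane neighbor vector from a circular row buffer, clamping
/// both coordinates to the valid range.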
#[inline(always)]
fn gather_neighbor_circular<T: F32x8Backend>(
token: T,
buffer: &[f32],
pe_w: usize,
buffer_rows: usize,
base_cx: [isize; 8],
cy: isize,
nx: isize,
ny: isize,
max_x: isize,
max_y: isize,
) -> GenericF32x8<T> {
let py = (cy + ny).clamp(0, max_y) as usize;
let buffer_row = py % buffer_rows;
let row_offset = buffer_row * pe_w;
let vals: [f32; 8] = std::array::from_fn(|i| {
let px = (base_cx[i] + nx).clamp(0, max_x) as usize;
let idx = row_offset + px;
if idx < buffer.len() { buffer[idx] } else { 0.0 }
});
GenericF32x8::<T>::from_array(token, vals)
}
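/// Fuzzy erosion over 2x2 blocks of the circular pre-erosion buffer, eight
/// blocks at a time. Returns how many blocks were processed; the caller
/// handles any remaining tail.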
#[inline]
pub fn compute_fuzzy_erosion_blocks_simd(
pre_erosion_buffer: &[f32],
pe_w: usize,
buffer_rows: usize,
pe_y_base: isize,
max_filled_row: isize,
start: usize,
end: usize,
out: &mut [f32],
) -> usize {
incant!(compute_fuzzy_erosion_blocks_simd_impl(
pre_erosion_buffer,
pe_w,
buffer_rows,
pe_y_base,
max_filled_row,
start,
end,
out
))
}
#[magetypes(v3, neon, wasm128, scalar)]
#[inline(always)]
fn compute_fuzzy_erosion_blocks_simd_impl(
token: Token,
pre_erosion_buffer: &[f32],
pe_w: usize,
buffer_rows: usize,
pe_y_base: isize,
max_filled_row: isize,
start: usize,
end: usize,
out: &mut [f32],
) -> usize {
#[allow(non_camel_case_types)]
type f32x8 = GenericF32x8<Token>;
let max_x = pe_w as isize - 1;
let max_y = max_filled_row.max(0);
let mut processed = 0;
let mut bx = start;
while bx + 8 <= end {
let base_cx: [isize; 8] = std::array::from_fn(|i| ((bx + i - start) * 2) as isize);
let mut sum = f32x8::zero(token);
for dy in 0..2isize {
for dx in 0..2isize {
let cy = pe_y_base + dy;
let cx: [isize; 8] = std::array::from_fn(|i| base_cx[i] + dx);
let v = [
gather_neighbor_circular(
token,
pre_erosion_buffer,
pe_w,
buffer_rows,
cx,
cy,
-1,
-1,
max_x,
max_y,
),
gather_neighbor_circular(
token,
pre_erosion_buffer,
pe_w,
buffer_rows,
cx,
cy,
0,
-1,
max_x,
max_y,
),
gather_neighbor_circular(
token,
pre_erosion_buffer,
pe_w,
buffer_rows,
cx,
cy,
1,
-1,
max_x,
max_y,
),
gather_neighbor_circular(
token,
pre_erosion_buffer,
pe_w,
buffer_rows,
cx,
cy,
-1,
0,
max_x,
max_y,
),
gather_neighbor_circular(
token,
pre_erosion_buffer,
pe_w,
buffer_rows,
cx,
cy,
0,
0,
max_x,
max_y,
),
gather_neighbor_circular(
token,
pre_erosion_buffer,
pe_w,
buffer_rows,
cx,
cy,
1,
0,
max_x,
max_y,
),
gather_neighbor_circular(
token,
pre_erosion_buffer,
pe_w,
buffer_rows,
cx,
cy,
-1,
1,
max_x,
max_y,
),
gather_neighbor_circular(
token,
pre_erosion_buffer,
pe_w,
buffer_rows,
cx,
cy,
0,
1,
max_x,
max_y,
),
gather_neighbor_circular(
token,
pre_erosion_buffer,
pe_w,
buffer_rows,
cx,
cy,
1,
1,
max_x,
max_y,
),
];
sum += weighted_min4_of_9_simd(token, v);
}
}
let sum_arr = sum.to_array();
out[bx..bx + 8].copy_from_slice(&sum_arr);
bx += 8;
processed += 8;
}
processed
}
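/// Whole-image fuzzy erosion: a 3x3 weighted-min filter over the pre-erosion
/// map followed by 2x2 block sums into the AQ map.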
pub fn fuzzy_erosion_simd(
pre_erosion: &[f32],
pre_erosion_w: usize,
pre_erosion_h: usize,
block_w: usize,
block_h: usize,
aq_map: &mut [f32],
) -> Result<(), AllocError> {
assert_eq!(aq_map.len(), block_w * block_h);
let mut tmp = try_alloc_zeroed(pre_erosion_w * pre_erosion_h)?;
for y in 0..pre_erosion_h {
let row_above_y = (y as isize - 1).clamp(0, pre_erosion_h as isize - 1) as usize;
let row_below_y = (y as isize + 1).clamp(0, pre_erosion_h as isize - 1) as usize;
let row_above =
&pre_erosion[row_above_y * pre_erosion_w..(row_above_y + 1) * pre_erosion_w];
let row_curr = &pre_erosion[y * pre_erosion_w..(y + 1) * pre_erosion_w];
let row_below =
&pre_erosion[row_below_y * pre_erosion_w..(row_below_y + 1) * pre_erosion_w];
for x in 0..pre_erosion_w {
let ix = x as isize;
let x_left = (ix - 1).clamp(0, pre_erosion_w as isize - 1) as usize;
let x_right = (ix + 1).clamp(0, pre_erosion_w as isize - 1) as usize;
            let vals = [
                row_above[x_left],
                row_above[x],
                row_above[x_right],
                row_curr[x_left],
                row_curr[x],
                row_curr[x_right],
                row_below[x_left],
                row_below[x],
                row_below[x_right],
            ];
tmp[y * pre_erosion_w + x] = weighted_min4_of_9(vals);
}
}
sum_2x2_blocks_simd(&tmp, pre_erosion_w, pre_erosion_h, block_w, block_h, aq_map);
Ok(())
}
#[inline(always)]
fn sum_2x2_blocks_simd(
tmp: &[f32],
tmp_w: usize,
tmp_h: usize,
block_w: usize,
block_h: usize,
aq_map: &mut [f32],
) {
let max_x = tmp_w.saturating_sub(1);
let max_y = tmp_h.saturating_sub(1);
for by in 0..block_h {
let py0 = by * 2;
let py1 = (py0 + 1).min(max_y);
let row0 = &tmp[py0 * tmp_w..];
let row1 = &tmp[py1 * tmp_w..];
let out_row = &mut aq_map[by * block_w..];
for bx in 0..block_w {
let px0 = bx * 2;
let px1 = (px0 + 1).min(max_x);
out_row[bx] = row0[px0] + row0[px1] + row1[px0] + row1[px1];
}
}
}
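// Hand-written AVX2/AVX-512 paths, mirroring the generic kernels above.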
#[cfg(target_arch = "x86_64")]
pub(crate) mod archmage_impl {
use archmage::{SimdToken, X64V3Token, X64V4Token, arcane, rite};
use core::arch::x86_64::*;
#[allow(unused_imports)]
use safe_unaligned_simd::x86_64::{
_mm256_loadu_ps, _mm256_storeu_ps, _mm512_loadu_ps, _mm512_storeu_ps,
};
use super::{
GAMMA_OFFSET, K_DEN_MUL_RATIO, K_MASKING_LOG_OFFSET, K_MASKING_MUL, K_NUM_MUL_RATIO,
K_NUM_OFFSET_RATIO, K_VOFFSET_RATIO, LIMIT,
};
#[rite]
fn mage_ratio_of_derivatives_x8(_token: X64V3Token, vals: __m256) -> __m256 {
let zero = _mm256_setzero_ps();
let v = _mm256_max_ps(vals, zero);
let v2 = _mm256_mul_ps(v, v);
let k_num_mul = _mm256_set1_ps(K_NUM_MUL_RATIO);
let k_num_off = _mm256_set1_ps(K_NUM_OFFSET_RATIO);
let num = _mm256_fmadd_ps(v2, k_num_mul, k_num_off);
let k_den_mul = _mm256_set1_ps(K_DEN_MUL_RATIO);
let k_voff = _mm256_set1_ps(K_VOFFSET_RATIO);
let v_scaled = _mm256_mul_ps(v, k_den_mul);
let den = _mm256_fmadd_ps(v_scaled, v2, k_voff);
_mm256_div_ps(den, num)
}
#[rite]
fn mage_masking_sqrt_x8(_token: X64V3Token, v: __m256) -> __m256 {
let k_mul_sqrt = _mm256_set1_ps((K_MASKING_MUL * 1e8_f32).sqrt());
let k_offset = _mm256_set1_ps(K_MASKING_LOG_OFFSET);
let quarter = _mm256_set1_ps(0.25);
let inner = _mm256_fmadd_ps(v, k_mul_sqrt, k_offset);
let sqrt_inner = _mm256_sqrt_ps(inner);
_mm256_mul_ps(quarter, sqrt_inner)
}
#[rite]
fn mage_pre_erosion_pixel_x8(
token: X64V3Token,
pixels: __m256,
left: __m256,
right: __m256,
top: __m256,
bottom: __m256,
) -> __m256 {
let quarter = _mm256_set1_ps(0.25);
let gamma_offset = _mm256_set1_ps(GAMMA_OFFSET);
let limit = _mm256_set1_ps(LIMIT);
let sum_lr = _mm256_add_ps(left, right);
let sum_tb = _mm256_add_ps(top, bottom);
let sum_all = _mm256_add_ps(sum_lr, sum_tb);
let base = _mm256_mul_ps(quarter, sum_all);
let pixel_gamma = _mm256_add_ps(pixels, gamma_offset);
let ratio = mage_ratio_of_derivatives_x8(token, pixel_gamma);
let pixel_minus_base = _mm256_sub_ps(pixels, base);
let diff = _mm256_mul_ps(ratio, pixel_minus_base);
let diff_sq = _mm256_mul_ps(diff, diff);
let diff_sq_clamped = _mm256_min_ps(diff_sq, limit);
mage_masking_sqrt_x8(token, diff_sq_clamped)
}
#[arcane]
#[inline]
fn mage_pre_erosion_row_padded_inner(
_token: X64V3Token,
row: &[f32],
row_above: &[f32],
row_below: &[f32],
output: &mut [f32],
width: usize,
) {
use safe_unaligned_simd::x86_64 as safe_simd;
let chunks = width / 8;
let quarter = _mm256_set1_ps(0.25);
let gamma_offset = _mm256_set1_ps(GAMMA_OFFSET);
let limit = _mm256_set1_ps(LIMIT);
let k_num_mul = _mm256_set1_ps(K_NUM_MUL_RATIO);
let k_num_off = _mm256_set1_ps(K_NUM_OFFSET_RATIO);
let k_den_mul = _mm256_set1_ps(K_DEN_MUL_RATIO);
let k_voff = _mm256_set1_ps(K_VOFFSET_RATIO);
let k_mul_sqrt = _mm256_set1_ps((K_MASKING_MUL * 1e8_f32).sqrt());
let k_offset = _mm256_set1_ps(K_MASKING_LOG_OFFSET);
let zero = _mm256_setzero_ps();
for chunk in 0..chunks {
let x = chunk * 8;
let buf_x = x + 1;
let pixels =
safe_simd::_mm256_loadu_ps(<&[f32; 8]>::try_from(&row[buf_x..buf_x + 8]).unwrap());
let left = safe_simd::_mm256_loadu_ps(
<&[f32; 8]>::try_from(&row[buf_x - 1..buf_x + 7]).unwrap(),
);
let right = safe_simd::_mm256_loadu_ps(
<&[f32; 8]>::try_from(&row[buf_x + 1..buf_x + 9]).unwrap(),
);
let top = safe_simd::_mm256_loadu_ps(
<&[f32; 8]>::try_from(&row_above[buf_x..buf_x + 8]).unwrap(),
);
let bottom = safe_simd::_mm256_loadu_ps(
<&[f32; 8]>::try_from(&row_below[buf_x..buf_x + 8]).unwrap(),
);
let sum_lr = _mm256_add_ps(left, right);
let sum_tb = _mm256_add_ps(top, bottom);
let sum_all = _mm256_add_ps(sum_lr, sum_tb);
let base = _mm256_mul_ps(quarter, sum_all);
let pixel_gamma = _mm256_add_ps(pixels, gamma_offset);
let v = _mm256_max_ps(pixel_gamma, zero);
let v2 = _mm256_mul_ps(v, v);
let num = _mm256_fmadd_ps(v2, k_num_mul, k_num_off);
let v_scaled = _mm256_mul_ps(v, k_den_mul);
let den = _mm256_fmadd_ps(v_scaled, v2, k_voff);
let ratio = _mm256_div_ps(den, num);
let pixel_minus_base = _mm256_sub_ps(pixels, base);
let diff = _mm256_mul_ps(ratio, pixel_minus_base);
let diff_sq = _mm256_mul_ps(diff, diff);
let diff_sq_clamped = _mm256_min_ps(diff_sq, limit);
let inner = _mm256_fmadd_ps(diff_sq_clamped, k_mul_sqrt, k_offset);
let sqrt_inner = _mm256_sqrt_ps(inner);
let result = _mm256_mul_ps(quarter, sqrt_inner);
let existing =
safe_simd::_mm256_loadu_ps(<&[f32; 8]>::try_from(&output[x..x + 8]).unwrap());
let updated = _mm256_add_ps(existing, result);
safe_simd::_mm256_storeu_ps(
<&mut [f32; 8]>::try_from(&mut output[x..x + 8]).unwrap(),
updated,
);
}
}
pub fn mage_pre_erosion_row_padded(
token: X64V3Token,
row: &[f32],
row_above: &[f32],
row_below: &[f32],
width: usize,
output: &mut [f32],
) {
debug_assert_eq!(row.len(), width + 2);
debug_assert_eq!(row_above.len(), width + 2);
debug_assert_eq!(row_below.len(), width + 2);
debug_assert_eq!(output.len(), width);
if width == 0 {
return;
}
mage_pre_erosion_row_padded_inner(token, row, row_above, row_below, output, width);
let chunks = width / 8;
for x in (chunks * 8)..width {
let buf_x = x + 1;
let pixel = row[buf_x];
let left_val = row[buf_x - 1];
let right_val = row[buf_x + 1];
let top_val = row_above[buf_x];
let bottom_val = row_below[buf_x];
let base = 0.25 * (left_val + right_val + top_val + bottom_val);
let ratio = super::ratio_of_derivatives_scalar(pixel + GAMMA_OFFSET, false);
let diff = ratio * (pixel - base);
let diff_sq = (diff * diff).min(LIMIT);
let masked = super::masking_sqrt_scalar(diff_sq);
output[x] += masked;
}
}
#[arcane]
#[inline]
fn mage_pre_erosion_row_padded_inner_v4(
_token: X64V4Token,
row: &[f32],
row_above: &[f32],
row_below: &[f32],
output: &mut [f32],
width: usize,
) {
let chunks16 = width / 16;
let quarter = _mm512_set1_ps(0.25);
let gamma_offset = _mm512_set1_ps(GAMMA_OFFSET);
let limit = _mm512_set1_ps(LIMIT);
let k_num_mul = _mm512_set1_ps(K_NUM_MUL_RATIO);
let k_num_off = _mm512_set1_ps(K_NUM_OFFSET_RATIO);
let k_den_mul = _mm512_set1_ps(K_DEN_MUL_RATIO);
let k_voff = _mm512_set1_ps(K_VOFFSET_RATIO);
let k_mul_sqrt = _mm512_set1_ps((K_MASKING_MUL * 1e8_f32).sqrt());
let k_offset = _mm512_set1_ps(K_MASKING_LOG_OFFSET);
let zero = _mm512_setzero_ps();
for chunk in 0..chunks16 {
let x = chunk * 16;
let buf_x = x + 1;
let pixels = _mm512_loadu_ps(row[buf_x..][..16].try_into().unwrap());
let left = _mm512_loadu_ps(row[buf_x - 1..][..16].try_into().unwrap());
let right = _mm512_loadu_ps(row[buf_x + 1..][..16].try_into().unwrap());
let top = _mm512_loadu_ps(row_above[buf_x..][..16].try_into().unwrap());
let bottom = _mm512_loadu_ps(row_below[buf_x..][..16].try_into().unwrap());
let sum_lr = _mm512_add_ps(left, right);
let sum_tb = _mm512_add_ps(top, bottom);
let sum_all = _mm512_add_ps(sum_lr, sum_tb);
let base = _mm512_mul_ps(quarter, sum_all);
let pixel_gamma = _mm512_add_ps(pixels, gamma_offset);
let v = _mm512_max_ps(pixel_gamma, zero);
let v2 = _mm512_mul_ps(v, v);
let num = _mm512_fmadd_ps(v2, k_num_mul, k_num_off);
let v_scaled = _mm512_mul_ps(v, k_den_mul);
let den = _mm512_fmadd_ps(v_scaled, v2, k_voff);
let ratio = _mm512_div_ps(den, num);
let pixel_minus_base = _mm512_sub_ps(pixels, base);
let diff = _mm512_mul_ps(ratio, pixel_minus_base);
let diff_sq = _mm512_mul_ps(diff, diff);
let diff_sq_clamped = _mm512_min_ps(diff_sq, limit);
let inner = _mm512_fmadd_ps(diff_sq_clamped, k_mul_sqrt, k_offset);
let sqrt_inner = _mm512_sqrt_ps(inner);
let result = _mm512_mul_ps(quarter, sqrt_inner);
let existing = _mm512_loadu_ps(output[x..][..16].try_into().unwrap());
let updated = _mm512_add_ps(existing, result);
_mm512_storeu_ps((&mut output[x..][..16]).try_into().unwrap(), updated);
}
}
#[arcane]
#[inline]
fn mage_pre_erosion_8_inner(
_token: X64V3Token,
row: &[f32],
row_above: &[f32],
row_below: &[f32],
output: &mut [f32],
buf_x: usize,
out_x: usize,
) {
use safe_unaligned_simd::x86_64 as safe_simd;
let quarter = _mm256_set1_ps(0.25);
let gamma_offset = _mm256_set1_ps(GAMMA_OFFSET);
let limit = _mm256_set1_ps(LIMIT);
let k_num_mul = _mm256_set1_ps(K_NUM_MUL_RATIO);
let k_num_off = _mm256_set1_ps(K_NUM_OFFSET_RATIO);
let k_den_mul = _mm256_set1_ps(K_DEN_MUL_RATIO);
let k_voff = _mm256_set1_ps(K_VOFFSET_RATIO);
let k_mul_sqrt = _mm256_set1_ps((K_MASKING_MUL * 1e8_f32).sqrt());
let k_offset = _mm256_set1_ps(K_MASKING_LOG_OFFSET);
let zero = _mm256_setzero_ps();
let pixels =
safe_simd::_mm256_loadu_ps(<&[f32; 8]>::try_from(&row[buf_x..buf_x + 8]).unwrap());
let left =
safe_simd::_mm256_loadu_ps(<&[f32; 8]>::try_from(&row[buf_x - 1..buf_x + 7]).unwrap());
let right =
safe_simd::_mm256_loadu_ps(<&[f32; 8]>::try_from(&row[buf_x + 1..buf_x + 9]).unwrap());
let top = safe_simd::_mm256_loadu_ps(
<&[f32; 8]>::try_from(&row_above[buf_x..buf_x + 8]).unwrap(),
);
let bottom = safe_simd::_mm256_loadu_ps(
<&[f32; 8]>::try_from(&row_below[buf_x..buf_x + 8]).unwrap(),
);
let sum_lr = _mm256_add_ps(left, right);
let sum_tb = _mm256_add_ps(top, bottom);
let sum_all = _mm256_add_ps(sum_lr, sum_tb);
let base = _mm256_mul_ps(quarter, sum_all);
let pixel_gamma = _mm256_add_ps(pixels, gamma_offset);
let v = _mm256_max_ps(pixel_gamma, zero);
let v2 = _mm256_mul_ps(v, v);
let num = _mm256_fmadd_ps(v2, k_num_mul, k_num_off);
let v_scaled = _mm256_mul_ps(v, k_den_mul);
let den = _mm256_fmadd_ps(v_scaled, v2, k_voff);
let ratio = _mm256_div_ps(den, num);
let pixel_minus_base = _mm256_sub_ps(pixels, base);
let diff = _mm256_mul_ps(ratio, pixel_minus_base);
let diff_sq = _mm256_mul_ps(diff, diff);
let diff_sq_clamped = _mm256_min_ps(diff_sq, limit);
let inner = _mm256_fmadd_ps(diff_sq_clamped, k_mul_sqrt, k_offset);
let sqrt_inner = _mm256_sqrt_ps(inner);
let result = _mm256_mul_ps(quarter, sqrt_inner);
let existing =
safe_simd::_mm256_loadu_ps(<&[f32; 8]>::try_from(&output[out_x..out_x + 8]).unwrap());
let updated = _mm256_add_ps(existing, result);
safe_simd::_mm256_storeu_ps(
<&mut [f32; 8]>::try_from(&mut output[out_x..out_x + 8]).unwrap(),
updated,
);
}
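    /// AVX-512 variant with AVX2 and scalar fallbacks for the tail and for
    /// CPUs where the V4 token cannot be summoned.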
pub fn mage_pre_erosion_row_padded_v4(
row: &[f32],
row_above: &[f32],
row_below: &[f32],
width: usize,
output: &mut [f32],
) {
debug_assert_eq!(row.len(), width + 2);
debug_assert_eq!(row_above.len(), width + 2);
debug_assert_eq!(row_below.len(), width + 2);
debug_assert_eq!(output.len(), width);
if width == 0 {
return;
}
if let Some(v4_token) = X64V4Token::summon() {
mage_pre_erosion_row_padded_inner_v4(
v4_token, row, row_above, row_below, output, width,
);
            let mut processed = (width / 16) * 16;
            // V4 implies V3 on real CPUs, but fall back cleanly if the V3
            // token cannot be summoned: the scalar tail below starts wherever
            // the vector paths actually stopped.
            if processed < width
                && let Some(v3_token) = X64V3Token::summon()
            {
                let remaining_chunks8 = (width - processed) / 8;
                for chunk in 0..remaining_chunks8 {
                    let x = processed + chunk * 8;
                    let buf_x = x + 1;
                    mage_pre_erosion_8_inner(v3_token, row, row_above, row_below, output, buf_x, x);
                }
                processed += remaining_chunks8 * 8;
            }
            for x in processed..width {
let buf_x = x + 1;
let pixel = row[buf_x];
let left_val = row[buf_x - 1];
let right_val = row[buf_x + 1];
let top_val = row_above[buf_x];
let bottom_val = row_below[buf_x];
let base = 0.25 * (left_val + right_val + top_val + bottom_val);
let ratio = super::ratio_of_derivatives_scalar(pixel + GAMMA_OFFSET, false);
let diff = ratio * (pixel - base);
let diff_sq = (diff * diff).min(LIMIT);
let masked = super::masking_sqrt_scalar(diff_sq);
output[x] += masked;
}
} else if let Some(token) = X64V3Token::summon() {
mage_pre_erosion_row_padded(token, row, row_above, row_below, width, output);
} else {
for x in 0..width {
let buf_x = x + 1;
let pixel = row[buf_x];
let left_val = row[buf_x - 1];
let right_val = row[buf_x + 1];
let top_val = row_above[buf_x];
let bottom_val = row_below[buf_x];
let base = 0.25 * (left_val + right_val + top_val + bottom_val);
let ratio = super::ratio_of_derivatives_scalar(pixel + GAMMA_OFFSET, false);
let diff = ratio * (pixel - base);
let diff_sq = (diff * diff).min(LIMIT);
let masked = super::masking_sqrt_scalar(diff_sq);
output[x] += masked;
}
}
}
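    // Local copies of the parent module's modulation constants, kept here so
    // the intrinsics path stays self-contained.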
const K_INPUT_SCALING: f32 = 1.0 / 255.0;
const K_INV_LOG2E: f32 = 0.6931471805599453;
const K_BIAS: f32 = 0.16 / K_INPUT_SCALING;
const K_SUM_COEFF: f32 = -2.0052193233688884 * K_INPUT_SCALING / 112.0;
const K_GAMMA: f32 = -0.15526878023684174 * K_INV_LOG2E;
const K_SCALE: f32 = K_INPUT_SCALING / 64.0;
const LOG2_E: f32 = 1.442695041;
const K_MASK_BASE: f32 = -0.74174993;
const K_MASK_MUL4: f32 = 3.2353257320940401;
const K_MASK_MUL2: f32 = 12.906028311180409;
const K_MASK_OFFSET2: f32 = 305.04035728311436;
const K_MASK_MUL3: f32 = 5.0220313103171232;
const K_MASK_OFFSET3: f32 = 2.1925739705298404;
const K_MASK_OFFSET4: f32 = 0.25 * K_MASK_OFFSET3;
const K_MASK_MUL0: f32 = 0.74760422233706747;
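    // Horizontal sum of the eight f32 lanes.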
#[rite]
fn hsum_ps(_token: X64V3Token, v: __m256) -> f32 {
        let sum1 = _mm256_hadd_ps(v, v);
        let sum2 = _mm256_hadd_ps(sum1, sum1);
        let hi = _mm256_extractf128_ps(sum2, 1);
let lo = _mm256_castps256_ps128(sum2);
let sum3 = _mm_add_ps(lo, hi);
_mm_cvtss_f32(sum3)
}
#[rite]
fn mage_ratio_of_derivatives_inv_x8(
_token: X64V3Token,
vals: __m256,
zero: __m256,
k_num_mul: __m256,
k_num_off: __m256,
k_den_mul: __m256,
k_voff: __m256,
) -> __m256 {
let v = _mm256_max_ps(vals, zero);
let v2 = _mm256_mul_ps(v, v);
let num = _mm256_fmadd_ps(v2, k_num_mul, k_num_off);
let v_scaled = _mm256_mul_ps(v, k_den_mul);
let den = _mm256_fmadd_ps(v_scaled, v2, k_voff);
_mm256_div_ps(num, den)
}
#[rite]
fn mage_hf_gamma_sum_8x8(
token: X64V3Token,
block: &[f32],
block_offset: usize,
stride: usize,
zero: __m256,
bias: __m256,
mask_first_7: __m256,
k_num_mul: __m256,
k_num_off: __m256,
k_den_mul: __m256,
k_voff: __m256,
) -> (f32, f32) {
let mut hf_acc = _mm256_setzero_ps();
let mut gamma_acc = _mm256_setzero_ps();
for dy in 0..8usize {
let row_start = block_offset + dy * stride;
let row = _mm256_loadu_ps(block[row_start..][..8].try_into().unwrap());
let row_right = _mm256_loadu_ps(block[row_start + 1..][..8].try_into().unwrap());
let h_diff = _mm256_sub_ps(row, row_right);
            // Clear the sign bit to take |h_diff|.
            let h_abs = _mm256_andnot_ps(_mm256_set1_ps(-0.0), h_diff);
            let h_masked = _mm256_mul_ps(h_abs, mask_first_7);
hf_acc = _mm256_add_ps(hf_acc, h_masked);
if dy < 7 {
let next_start = block_offset + (dy + 1) * stride;
let next_row = _mm256_loadu_ps(block[next_start..][..8].try_into().unwrap());
let v_diff = _mm256_sub_ps(row, next_row);
let v_abs = _mm256_andnot_ps(_mm256_set1_ps(-0.0), v_diff);
hf_acc = _mm256_add_ps(hf_acc, v_abs);
}
let row_biased = _mm256_add_ps(row, bias);
let gamma_val = mage_ratio_of_derivatives_inv_x8(
token, row_biased, zero, k_num_mul, k_num_off, k_den_mul, k_voff,
);
gamma_acc = _mm256_add_ps(gamma_acc, gamma_val);
}
let hf_sum = hsum_ps(token, hf_acc);
let gamma_sum = hsum_ps(token, gamma_acc);
(hf_sum, gamma_sum)
}
#[inline(always)]
fn mage_fast_log2(x: f32) -> f32 {
if x <= 0.0 {
return f32::NEG_INFINITY;
}
let bits = x.to_bits() as i32;
let e = (bits >> 23) - 127;
let f = f32::from_bits((bits & 0x007FFFFF) as u32 | 0x3F800000);
let t = f - 1.0;
        // log2(1 + t) as t * P(t), with P(t) in Horner form.
        let log2_f = t
            * 0.2885390082_f32
                .mul_add(t, -0.3606737602)
                .mul_add(t, 0.4808983470)
                .mul_add(t, -0.7213475204)
                .mul_add(t, 1.442695041);
e as f32 + log2_f
}
#[inline(always)]
fn mage_fast_exp2(x: f32) -> f32 {
let x = x.clamp(-126.0, 127.0);
let xi = x.floor();
let xf = x - xi;
let p = 1.0
+ xf * (0.6931471805599453
+ xf * (0.24022650695910071
+ xf * (0.055504108664821579 + xf * 0.009618129107628477)));
let pow2_xi = f32::from_bits(((xi as i32 + 127) as u32) << 23);
p * pow2_xi
}
#[arcane]
pub fn mage_per_block_modulations_row(
token: X64V3Token,
input: &[f32],
stride: usize,
by: usize,
block_w: usize,
aq_row: &mut [f32],
mul: f32,
add: f32,
) {
if block_w == 0 {
return;
}
let y_start = by * 8;
let zero = _mm256_setzero_ps();
let bias = _mm256_set1_ps(K_BIAS);
let mask_first_7 = _mm256_set_ps(0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
let k_num_mul = _mm256_set1_ps(K_NUM_MUL_RATIO);
let k_num_off = _mm256_set1_ps(K_NUM_OFFSET_RATIO);
let k_den_mul = _mm256_set1_ps(K_DEN_MUL_RATIO);
let k_voff = _mm256_set1_ps(K_VOFFSET_RATIO);
for bx in 0..block_w {
let x_start = bx * 8;
let fuzzy_val = aq_row[bx];
let v1 = (fuzzy_val * K_MASK_MUL0).max(1e-3);
let v1_sq = v1 * v1;
let v2 = 1.0 / (v1 + K_MASK_OFFSET2);
let v3 = 1.0 / (v1_sq + K_MASK_OFFSET3);
let v4 = 1.0 / (v1_sq + K_MASK_OFFSET4);
let mut out_val = K_MASK_BASE + K_MASK_MUL4 * v4 + K_MASK_MUL2 * v2 + K_MASK_MUL3 * v3;
let block_offset = y_start * stride + x_start;
let (hf_sum, gamma_sum) = mage_hf_gamma_sum_8x8(
token,
input,
block_offset,
stride,
zero,
bias,
mask_first_7,
k_num_mul,
k_num_off,
k_den_mul,
k_voff,
);
out_val += hf_sum * K_SUM_COEFF;
let overall_ratio = gamma_sum * K_SCALE;
if overall_ratio > 0.0 {
let log_ratio = mage_fast_log2(overall_ratio);
out_val += K_GAMMA * log_ratio;
}
let quant_field = mage_fast_exp2(out_val * LOG2_E) * mul + add;
aq_row[bx] = quant_field;
}
}
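    /// Scalar fuzzy erosion over a circular row buffer: interior blocks take
    /// an unclamped fast path, border blocks clamp every neighbor coordinate.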
pub fn mage_compute_fuzzy_erosion_row(
pre_erosion_buffer: &[f32],
pe_w: usize,
buffer_rows: usize,
max_filled_row: usize,
pe_y_base: usize,
block_start: usize,
block_count: usize,
output: &mut [f32],
) {
if block_count == 0 || pe_w == 0 {
return;
}
let pe_w_i = pe_w as isize;
let max_y = max_filled_row as isize;
let cy0 = pe_y_base as isize;
        let r0 = ((cy0 - 1).clamp(0, max_y) as usize % buffer_rows) * pe_w;
        let r1 = (cy0.clamp(0, max_y) as usize % buffer_rows) * pe_w;
        let r2 = ((cy0 + 1).clamp(0, max_y) as usize % buffer_rows) * pe_w;
        let r3 = ((cy0 + 2).clamp(0, max_y) as usize % buffer_rows) * pe_w;
        let y_interior = cy0 >= 1 && cy0 + 2 <= max_y;
        // Selects the four smallest of nine values and accumulates them with
        // the fuzzy-erosion weights. The accumulation order matches the
        // previous unrolled code exactly: each weighted minimum is added to
        // `sum` before the next minimum is searched for.
        #[inline(always)]
        fn accumulate_weighted_min4_of_9(sum: &mut f32, mut a: [f32; 9]) {
            const WEIGHTS: [f32; 4] = [0.125, 0.075, 0.06, 0.05]; // FUZZY_MUL0..=FUZZY_MUL3
            for &weight in &WEIGHTS {
                let mut mi = 0;
                for j in 1..9 {
                    if a[j] < a[mi] {
                        mi = j;
                    }
                }
                *sum += weight * a[mi];
                a[mi] = f32::MAX;
            }
        }
        for bx_offset in 0..block_count {
            let bx = block_start + bx_offset;
            let pe_x = (bx_offset * 2) as isize;
            let x_interior = pe_x >= 1 && pe_x + 2 < pe_w_i - 1;
            let block_sum = if y_interior && x_interior {
                // Interior fast path: all four 3x3 windows of the 2x2 block
                // are in bounds, so rows r0..r3 can be indexed directly.
                let cx0 = pe_x as usize;
                let cx1 = cx0 + 1;
                let mut sum = 0.0f32;
                for &(rows, cx) in &[
                    ([r0, r1, r2], cx0),
                    ([r0, r1, r2], cx1),
                    ([r1, r2, r3], cx0),
                    ([r1, r2, r3], cx1),
                ] {
                    accumulate_weighted_min4_of_9(
                        &mut sum,
                        [
                            pre_erosion_buffer[rows[0] + cx - 1],
                            pre_erosion_buffer[rows[0] + cx],
                            pre_erosion_buffer[rows[0] + cx + 1],
                            pre_erosion_buffer[rows[1] + cx - 1],
                            pre_erosion_buffer[rows[1] + cx],
                            pre_erosion_buffer[rows[1] + cx + 1],
                            pre_erosion_buffer[rows[2] + cx - 1],
                            pre_erosion_buffer[rows[2] + cx],
                            pre_erosion_buffer[rows[2] + cx + 1],
                        ],
                    );
                }
                sum
            } else {
                // Border path: clamp every neighbor coordinate into the valid
                // range before indexing the circular row buffer.
                let mut sum = 0.0f32;
                for dy in 0..2isize {
                    for dx in 0..2isize {
                        let cx = pe_x + dx;
                        let cy = cy0 + dy;
                        let c0 = (cx - 1).clamp(0, pe_w_i - 1) as usize;
                        let c1 = cx.clamp(0, pe_w_i - 1) as usize;
                        let c2 = (cx + 1).clamp(0, pe_w_i - 1) as usize;
                        let ry0 = ((cy - 1).clamp(0, max_y) as usize % buffer_rows) * pe_w;
                        let ry1 = (cy.clamp(0, max_y) as usize % buffer_rows) * pe_w;
                        let ry2 = ((cy + 1).clamp(0, max_y) as usize % buffer_rows) * pe_w;
                        accumulate_weighted_min4_of_9(
                            &mut sum,
                            [
                                pre_erosion_buffer[ry0 + c0],
                                pre_erosion_buffer[ry0 + c1],
                                pre_erosion_buffer[ry0 + c2],
                                pre_erosion_buffer[ry1 + c0],
                                pre_erosion_buffer[ry1 + c1],
                                pre_erosion_buffer[ry1 + c2],
                                pre_erosion_buffer[ry2 + c0],
                                pre_erosion_buffer[ry2 + c1],
                                pre_erosion_buffer[ry2 + c2],
                            ],
                        );
                    }
                }
                sum
            };
output[bx] = block_sum;
}
}
}
#[cfg(target_arch = "x86_64")]
#[allow(unused_imports)]
pub use archmage_impl::mage_pre_erosion_row_padded;
#[cfg(target_arch = "x86_64")]
pub use archmage_impl::mage_pre_erosion_row_padded_v4;
#[cfg(target_arch = "x86_64")]
pub use archmage_impl::mage_per_block_modulations_row;
#[rustfmt::skip]
pub const TEST_INPUTS_RATIO: [f32; 16] = [
133.845, 134.845, 135.845, 136.845, 137.845, 138.845, 139.845, 140.845,
45.845, 78.845, 112.845, 156.845, 189.845, 212.845, 234.845, 248.845,
];
#[cfg(test)]
fn test_ratio_of_derivatives_x8(input: [f32; 8]) -> [f32; 8] {
fn inner(input: [f32; 8]) -> [f32; 8] {
incant!(test_ratio_of_derivatives_x8_impl(input))
}
#[magetypes(v3, neon, wasm128, scalar)]
#[inline(always)]
fn test_ratio_of_derivatives_x8_impl(token: Token, input: [f32; 8]) -> [f32; 8] {
#[allow(non_camel_case_types)]
type f32x8 = GenericF32x8<Token>;
let v = f32x8::from_array(token, input);
ratio_of_derivatives_x8_generic(token, v).to_array()
}
inner(input)
}
#[cfg(test)]
fn test_ratio_of_derivatives_inv_x8(input: [f32; 8]) -> [f32; 8] {
fn inner(input: [f32; 8]) -> [f32; 8] {
incant!(test_ratio_of_derivatives_inv_x8_impl(input))
}
#[magetypes(v3, neon, wasm128, scalar)]
#[inline(always)]
fn test_ratio_of_derivatives_inv_x8_impl(token: Token, input: [f32; 8]) -> [f32; 8] {
#[allow(non_camel_case_types)]
type f32x8 = GenericF32x8<Token>;
let v = f32x8::from_array(token, input);
ratio_of_derivatives_inv_x8_generic(token, v).to_array()
}
inner(input)
}
#[cfg(test)]
fn test_masking_sqrt_x8(input: [f32; 8]) -> [f32; 8] {
fn inner(input: [f32; 8]) -> [f32; 8] {
incant!(test_masking_sqrt_x8_impl(input))
}
#[magetypes(v3, neon, wasm128, scalar)]
#[inline(always)]
fn test_masking_sqrt_x8_impl(token: Token, input: [f32; 8]) -> [f32; 8] {
#[allow(non_camel_case_types)]
type f32x8 = GenericF32x8<Token>;
let v = f32x8::from_array(token, input);
masking_sqrt_x8_generic(token, v).to_array()
}
inner(input)
}
#[cfg(test)]
fn test_pre_erosion_pixel_x8(
pixels: [f32; 8],
left: [f32; 8],
right: [f32; 8],
top: [f32; 8],
bottom: [f32; 8],
) -> [f32; 8] {
fn inner(
pixels: [f32; 8],
left: [f32; 8],
right: [f32; 8],
top: [f32; 8],
bottom: [f32; 8],
) -> [f32; 8] {
incant!(test_pre_erosion_pixel_x8_impl(
pixels, left, right, top, bottom
))
}
#[magetypes(v3, neon, wasm128, scalar)]
#[inline(always)]
fn test_pre_erosion_pixel_x8_impl(
token: Token,
pixels: [f32; 8],
left: [f32; 8],
right: [f32; 8],
top: [f32; 8],
bottom: [f32; 8],
) -> [f32; 8] {
#[allow(non_camel_case_types)]
type f32x8 = GenericF32x8<Token>;
pre_erosion_pixel_x8_generic(
token,
f32x8::from_array(token, pixels),
f32x8::from_array(token, left),
f32x8::from_array(token, right),
f32x8::from_array(token, top),
f32x8::from_array(token, bottom),
)
.to_array()
}
inner(pixels, left, right, top, bottom)
}
#[cfg(test)]
mod tests {
use super::*;
const EPSILON_REL: f32 = 1e-6;
fn approx_eq(a: f32, b: f32, rel_tol: f32) -> bool {
let diff = (a - b).abs();
let max_abs = a.abs().max(b.abs());
if max_abs == 0.0 {
diff == 0.0
} else {
diff / max_abs <= rel_tol
}
}
#[test]
fn test_ratio_of_derivatives_x8_matches_scalar() {
        let input_arr: [f32; 8] = TEST_INPUTS_RATIO[..8].try_into().unwrap();
let simd_arr = test_ratio_of_derivatives_x8(input_arr);
for i in 0..8 {
let scalar_result = ratio_of_derivatives_scalar(TEST_INPUTS_RATIO[i], false);
assert!(
approx_eq(simd_arr[i], scalar_result, EPSILON_REL),
"Mismatch at index {}: SIMD={}, scalar={}, rel_diff={}",
i,
simd_arr[i],
scalar_result,
(simd_arr[i] - scalar_result).abs() / scalar_result.abs().max(1e-10)
);
}
}
#[test]
fn test_ratio_of_derivatives_inv_x8_matches_scalar() {
        let input_arr: [f32; 8] = TEST_INPUTS_RATIO[8..16].try_into().unwrap();
let simd_arr = test_ratio_of_derivatives_inv_x8(input_arr);
for i in 0..8 {
let scalar_result = ratio_of_derivatives_scalar(TEST_INPUTS_RATIO[8 + i], true);
assert!(
approx_eq(simd_arr[i], scalar_result, EPSILON_REL),
"Mismatch at index {}: SIMD={}, scalar={}, rel_diff={}",
i,
simd_arr[i],
scalar_result,
(simd_arr[i] - scalar_result).abs() / scalar_result.abs().max(1e-10)
);
}
}
#[test]
fn test_masking_sqrt_x8_matches_scalar() {
let input_arr: [f32; 8] = [0.0, 0.01, 0.05, 0.1, 0.15, 0.2, 0.05, 0.08];
let simd_arr = test_masking_sqrt_x8(input_arr);
for i in 0..8 {
let scalar_result = masking_sqrt_scalar(input_arr[i]);
assert!(
approx_eq(simd_arr[i], scalar_result, EPSILON_REL),
"Mismatch at index {}: SIMD={}, scalar={}, rel_diff={}",
i,
simd_arr[i],
scalar_result,
(simd_arr[i] - scalar_result).abs() / scalar_result.abs().max(1e-10)
);
}
}
#[test]
fn test_ratio_edge_cases() {
let input_arr: [f32; 8] = [0.0, -1.0, 0.001, 1.0, 10.0, 100.0, 255.0, 1000.0];
let simd_arr = test_ratio_of_derivatives_x8(input_arr);
for i in 0..8 {
let scalar_result = ratio_of_derivatives_scalar(input_arr[i], false);
assert!(
approx_eq(simd_arr[i], scalar_result, EPSILON_REL),
"Mismatch at index {} (input={}): SIMD={}, scalar={}, rel_diff={}",
i,
input_arr[i],
simd_arr[i],
scalar_result,
(simd_arr[i] - scalar_result).abs() / scalar_result.abs().max(1e-10)
);
}
}
#[test]
fn test_ratio_locked_values() {
let locked_inputs = [128.0_f32, 64.0, 192.0, 255.0];
let locked_outputs_non_inv = [
ratio_of_derivatives_scalar(128.0, false),
ratio_of_derivatives_scalar(64.0, false),
ratio_of_derivatives_scalar(192.0, false),
ratio_of_derivatives_scalar(255.0, false),
];
println!("Locked ratio_of_derivatives outputs (non-inverted):");
for (inp, out) in locked_inputs.iter().zip(locked_outputs_non_inv.iter()) {
println!(" {} -> {}", inp, out);
}
let input_arr: [f32; 8] = [
locked_inputs[0],
locked_inputs[1],
locked_inputs[2],
locked_inputs[3],
0.0,
0.0,
0.0,
0.0,
];
let simd_arr = test_ratio_of_derivatives_x8(input_arr);
for i in 0..4 {
assert!(
approx_eq(simd_arr[i], locked_outputs_non_inv[i], EPSILON_REL),
"LOCKED VALUE CHANGED! input={}, expected={}, got={}, rel_diff={}",
locked_inputs[i],
locked_outputs_non_inv[i],
simd_arr[i],
(simd_arr[i] - locked_outputs_non_inv[i]).abs()
/ locked_outputs_non_inv[i].abs().max(1e-10)
);
}
}
#[test]
fn test_pre_erosion_pixel_x8_matches_scalar() {
let pixels_arr: [f32; 8] = [100.0, 110.0, 120.0, 130.0, 140.0, 150.0, 160.0, 170.0];
let left_arr: [f32; 8] = [95.0, 100.0, 110.0, 120.0, 130.0, 140.0, 150.0, 160.0];
let right_arr: [f32; 8] = [110.0, 120.0, 130.0, 140.0, 150.0, 160.0, 170.0, 175.0];
let top_arr: [f32; 8] = [98.0, 108.0, 118.0, 128.0, 138.0, 148.0, 158.0, 168.0];
let bottom_arr: [f32; 8] = [102.0, 112.0, 122.0, 132.0, 142.0, 152.0, 162.0, 172.0];
let simd_arr =
test_pre_erosion_pixel_x8(pixels_arr, left_arr, right_arr, top_arr, bottom_arr);
for i in 0..8 {
let base = 0.25 * (left_arr[i] + right_arr[i] + top_arr[i] + bottom_arr[i]);
let ratio = ratio_of_derivatives_scalar(pixels_arr[i] + GAMMA_OFFSET, false);
let diff = ratio * (pixels_arr[i] - base);
let diff_sq = (diff * diff).min(LIMIT);
let scalar_result = masking_sqrt_scalar(diff_sq);
assert!(
approx_eq(simd_arr[i], scalar_result, EPSILON_REL),
"Mismatch at index {}: SIMD={}, scalar={}, rel_diff={}",
i,
simd_arr[i],
scalar_result,
(simd_arr[i] - scalar_result).abs() / scalar_result.abs().max(1e-10)
);
}
}
#[test]
fn test_pre_erosion_row_matches_scalar() {
let width = 32;
let row: Vec<f32> = (0..width).map(|x| 100.0 + (x as f32) * 5.0).collect();
let row_above: Vec<f32> = (0..width).map(|x| 98.0 + (x as f32) * 5.0).collect();
let row_below: Vec<f32> = (0..width).map(|x| 102.0 + (x as f32) * 5.0).collect();
let mut output_simd = vec![0.0f32; width];
pre_erosion_row(&row, &row_above, &row_below, &mut output_simd);
let mut output_scalar = vec![0.0f32; width];
for x in 0..width {
let pixel = row[x];
let left_val = if x == 0 { row[0] } else { row[x - 1] };
let right_val = if x == width - 1 {
row[width - 1]
} else {
row[x + 1]
};
let top_val = row_above[x];
let bottom_val = row_below[x];
let base = 0.25 * (left_val + right_val + top_val + bottom_val);
let ratio = ratio_of_derivatives_scalar(pixel + GAMMA_OFFSET, false);
let diff = ratio * (pixel - base);
let diff_sq = (diff * diff).min(LIMIT);
output_scalar[x] = masking_sqrt_scalar(diff_sq);
}
for x in 0..width {
assert!(
approx_eq(output_simd[x], output_scalar[x], EPSILON_REL),
"Row mismatch at x={}: SIMD={}, scalar={}, rel_diff={}",
x,
output_simd[x],
output_scalar[x],
(output_simd[x] - output_scalar[x]).abs() / output_scalar[x].abs().max(1e-10)
);
}
}
#[test]
fn test_pre_erosion_row_odd_width() {
        let width = 35; // not a multiple of 8: exercises the scalar tail
        let row: Vec<f32> = (0..width).map(|x| 128.0 + (x as f32)).collect();
let row_above: Vec<f32> = (0..width).map(|x| 126.0 + (x as f32)).collect();
let row_below: Vec<f32> = (0..width).map(|x| 130.0 + (x as f32)).collect();
let mut output_simd = vec![0.0f32; width];
pre_erosion_row(&row, &row_above, &row_below, &mut output_simd);
let mut output_scalar = vec![0.0f32; width];
for x in 0..width {
let pixel = row[x];
let left_val = if x == 0 { row[0] } else { row[x - 1] };
let right_val = if x == width - 1 {
row[width - 1]
} else {
row[x + 1]
};
let top_val = row_above[x];
let bottom_val = row_below[x];
let base = 0.25 * (left_val + right_val + top_val + bottom_val);
let ratio = ratio_of_derivatives_scalar(pixel + GAMMA_OFFSET, false);
let diff = ratio * (pixel - base);
let diff_sq = (diff * diff).min(LIMIT);
output_scalar[x] = masking_sqrt_scalar(diff_sq);
}
for x in 0..width {
assert!(
approx_eq(output_simd[x], output_scalar[x], EPSILON_REL),
"Odd width mismatch at x={}: SIMD={}, scalar={}, rel_diff={}",
x,
output_simd[x],
output_scalar[x],
(output_simd[x] - output_scalar[x]).abs() / output_scalar[x].abs().max(1e-10)
);
}
}
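// End-to-end check of compute_pre_erosion_simd on a 64x64 gradient-plus-sine
// image: per-row results are accumulated over each 4-row band, then summed in
// 4-wide blocks and scaled by 0.25, mirroring the scalar reference below.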
#[test]
fn test_compute_pre_erosion_simd_matches_scalar() {
let width = 64;
let height = 64;
let input: Vec<f32> = (0..width * height)
.map(|i| {
let x = i % width;
let y = i / width;
100.0 + (x as f32) * 2.0 + (y as f32) * 1.5 + ((x * y) as f32 * 0.1).sin() * 20.0
})
.collect();
let simd_result = compute_pre_erosion_simd(&input, width, height).unwrap();
let pre_erosion_w = (width + 3) / 4;
let pre_erosion_h = (height + 3) / 4;
let mut scalar_result = vec![0.0f32; pre_erosion_w * pre_erosion_h];
let get = |x: isize, y: isize| -> f32 {
let x = x.clamp(0, width as isize - 1) as usize;
let y = y.clamp(0, height as isize - 1) as usize;
input[y * width + x]
};
let mut diff_buffer = vec![0.0f32; width];
for y_block in 0..pre_erosion_h {
diff_buffer.fill(0.0);
for iy in 0..4 {
let y = y_block * 4 + iy;
if y >= height {
continue;
}
for x in 0..width {
let ix = x as isize;
let iy_s = y as isize;
let pixel = get(ix, iy_s);
let base = 0.25
* (get(ix - 1, iy_s)
+ get(ix + 1, iy_s)
+ get(ix, iy_s - 1)
+ get(ix, iy_s + 1));
let ratio = ratio_of_derivatives_scalar(pixel + GAMMA_OFFSET, false);
let diff = ratio * (pixel - base);
let diff_sq = (diff * diff).min(LIMIT);
diff_buffer[x] += masking_sqrt_scalar(diff_sq);
}
}
for x_block in 0..pre_erosion_w {
let x_start = x_block * 4;
let mut sum = 0.0f32;
for ix in 0..4 {
let x = x_start + ix;
if x < width {
sum += diff_buffer[x];
}
}
scalar_result[y_block * pre_erosion_w + x_block] = sum * 0.25;
}
}
assert_eq!(simd_result.len(), scalar_result.len());
for i in 0..simd_result.len() {
assert!(
approx_eq(simd_result[i], scalar_result[i], EPSILON_REL),
"Pre-erosion mismatch at index {}: SIMD={}, scalar={}, rel_diff={}",
i,
simd_result[i],
scalar_result[i],
(simd_result[i] - scalar_result[i]).abs() / scalar_result[i].abs().max(1e-10)
);
}
}
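// downsample_4x_sum reduces each group of four inputs to 0.25 times their
// sum, e.g. (1 + 2 + 3 + 4) * 0.25 = 2.5 for the first output.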
#[test]
fn test_downsample_4x_sum() {
let input = vec![
1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0,
];
let mut output = vec![0.0f32; 3];
downsample_4x_sum(&input, &mut output);
assert!((output[0] - 2.5).abs() < 1e-6);
assert!((output[1] - 6.5).abs() < 1e-6);
assert!((output[2] - 10.5).abs() < 1e-6);
}
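// fast_exp2 should stay within 5e-4 relative error of std exp2 across
// negative, zero, positive, and fractional inputs.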
#[test]
fn test_fast_exp2_accuracy() {
let test_values = [
-10.0,
-5.0,
-2.0,
-1.0,
-0.5,
0.0,
0.5,
1.0,
2.0,
5.0,
10.0,
-0.74174993,
0.123,
-0.456,
3.14159,
];
for &x in &test_values {
let fast = fast_exp2(x);
let exact = x.exp2();
let rel_err = ((fast - exact) / exact).abs();
assert!(
rel_err < 5e-4,
"fast_exp2({}) = {} vs exp2 = {}, rel_err = {}",
x,
fast,
exact,
rel_err
);
}
}
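// fast_log2 is held to an absolute bound (0.1) rather than a relative one,
// since relative error is not meaningful where log2 crosses zero at x = 1.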
#[test]
fn test_fast_log2_accuracy() {
let test_values = [
0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 4.0, 10.0, 0.25, 0.75, 1.5, 3.14159, 0.00025,
0.005,
];
for &x in &test_values {
let fast = fast_log2(x);
let exact = x.log2();
let abs_err = (fast - exact).abs();
assert!(
abs_err < 0.1,
"fast_log2({}) = {} vs log2 = {}, abs_err = {}",
x,
fast,
exact,
abs_err
);
}
}
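// weighted_min4_of_9 applies the fuzzy-erosion weights (0.125, 0.075, 0.06,
// 0.05) to the four smallest of nine values, so both orderings of 1..9 give
// 0.125*1 + 0.075*2 + 0.06*3 + 0.05*4 = 0.655, and nine 5.0s give
// (0.125 + 0.075 + 0.06 + 0.05) * 5.0 = 1.55.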
#[test]
fn test_weighted_min4_of_9() {
let vals = [9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0_f32];
let result = weighted_min4_of_9(vals);
assert!(
(result - 0.655).abs() < 1e-6,
"Expected 0.655, got {}",
result
);
let vals2 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0_f32];
let result2 = weighted_min4_of_9(vals2);
assert!(
(result2 - 0.655).abs() < 1e-6,
"Expected 0.655, got {}",
result2
);
let vals3 = [5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0_f32];
let result3 = weighted_min4_of_9(vals3);
assert!(
(result3 - 1.55).abs() < 1e-6,
"Expected 1.55, got {}",
result3
);
}
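// fuzzy_erosion_simd over a smooth 16x16 field must match the independent
// scalar reference implementation below.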
#[test]
fn test_fuzzy_erosion_simd_matches_scalar() {
let pre_erosion_w = 16;
let pre_erosion_h = 16;
let pre_erosion: Vec<f32> = (0..pre_erosion_w * pre_erosion_h)
.map(|i| {
let x = i % pre_erosion_w;
let y = i / pre_erosion_w;
0.5 + (x as f32) * 0.02 + (y as f32) * 0.015 + ((x * y) as f32 * 0.1).sin() * 0.1
})
.collect();
let block_w = pre_erosion_w / 2;
let block_h = pre_erosion_h / 2;
let mut aq_map_simd = vec![0.0f32; block_w * block_h];
fuzzy_erosion_simd(
&pre_erosion,
pre_erosion_w,
pre_erosion_h,
block_w,
block_h,
&mut aq_map_simd,
)
.unwrap();
let mut aq_map_scalar = vec![0.0f32; block_w * block_h];
fuzzy_erosion_scalar_ref(
&pre_erosion,
pre_erosion_w,
pre_erosion_h,
block_w,
block_h,
&mut aq_map_scalar,
);
for i in 0..aq_map_simd.len() {
assert!(
approx_eq(aq_map_simd[i], aq_map_scalar[i], EPSILON_REL),
"Fuzzy erosion mismatch at index {}: SIMD={}, scalar={}, rel_diff={}",
i,
aq_map_simd[i],
aq_map_scalar[i],
(aq_map_simd[i] - aq_map_scalar[i]).abs() / aq_map_scalar[i].abs().max(1e-10)
);
}
}
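// Scalar reference for fuzzy erosion: for each cell, gather the 3x3
// neighborhood (clamped at the borders), take the weighted sum of its four
// smallest values, then aggregate each 2x2 block of the result into one
// aq_map entry.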
fn fuzzy_erosion_scalar_ref(
pre_erosion: &[f32],
pre_erosion_w: usize,
pre_erosion_h: usize,
block_w: usize,
block_h: usize,
aq_map: &mut [f32],
) {
let mut tmp = vec![0.0f32; pre_erosion_w * pre_erosion_h];
let get = |x: isize, y: isize| -> f32 {
let x = x.clamp(0, pre_erosion_w as isize - 1) as usize;
let y = y.clamp(0, pre_erosion_h as isize - 1) as usize;
pre_erosion[y * pre_erosion_w + x]
};
for y in 0..pre_erosion_h {
for x in 0..pre_erosion_w {
let ix = x as isize;
let iy = y as isize;
let mut vals = [
get(ix - 1, iy - 1),
get(ix, iy - 1),
get(ix + 1, iy - 1),
get(ix - 1, iy),
get(ix, iy),
get(ix + 1, iy),
get(ix - 1, iy + 1),
get(ix, iy + 1),
get(ix + 1, iy + 1),
];
// Partial selection sort: move the four smallest values into vals[0..4].
for i in 0..4 {
for j in (i + 1)..9 {
if vals[j] < vals[i] {
vals.swap(i, j);
}
}
}
let weighted = FUZZY_MUL0.mul_add(
vals[0],
FUZZY_MUL1.mul_add(vals[1], FUZZY_MUL2.mul_add(vals[2], FUZZY_MUL3 * vals[3])),
);
tmp[y * pre_erosion_w + x] = weighted;
}
}
let get_tmp = |x: usize, y: usize| -> f32 {
let x = x.min(pre_erosion_w.saturating_sub(1));
let y = y.min(pre_erosion_h.saturating_sub(1));
tmp[y * pre_erosion_w + x]
};
for by in 0..block_h {
for bx in 0..block_w {
let px = bx * 2;
let py = by * 2;
let sum = get_tmp(px, py)
+ get_tmp(px + 1, py)
+ get_tmp(px, py + 1)
+ get_tmp(px + 1, py + 1);
aq_map[by * block_w + bx] = sum;
}
}
}
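// pre_erosion_row_padded takes rows with one replicated pixel of padding on
// each side; its output must match the unpadded pre_erosion_row.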
#[test]
fn test_pre_erosion_row_padded_matches_original() {
let width = 35;
let row: Vec<f32> = (0..width).map(|x| 128.0 + (x as f32)).collect();
let row_above: Vec<f32> = (0..width).map(|x| 126.0 + (x as f32)).collect();
let row_below: Vec<f32> = (0..width).map(|x| 130.0 + (x as f32)).collect();
let mut row_padded = vec![0.0f32; width + 2];
let mut row_above_padded = vec![0.0f32; width + 2];
let mut row_below_padded = vec![0.0f32; width + 2];
row_padded[1..1 + width].copy_from_slice(&row);
row_padded[0] = row[0];
row_padded[width + 1] = row[width - 1];
row_above_padded[1..1 + width].copy_from_slice(&row_above);
row_above_padded[0] = row_above[0];
row_above_padded[width + 1] = row_above[width - 1];
row_below_padded[1..1 + width].copy_from_slice(&row_below);
row_below_padded[0] = row_below[0];
row_below_padded[width + 1] = row_below[width - 1];
let mut output_original = vec![0.0f32; width];
pre_erosion_row(&row, &row_above, &row_below, &mut output_original);
let mut output_padded = vec![0.0f32; width];
pre_erosion_row_padded(
&row_padded,
&row_above_padded,
&row_below_padded,
width,
&mut output_padded,
);
for x in 0..width {
assert!(
approx_eq(output_padded[x], output_original[x], EPSILON_REL),
"Padded vs original mismatch at x={}: padded={}, original={}, rel_diff={}",
x,
output_padded[x],
output_original[x],
(output_padded[x] - output_original[x]).abs() / output_original[x].abs().max(1e-10)
);
}
}
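// Sweep widths around the 8-lane vector boundaries (8, 16, 24, 31, 32, 33,
// 64, 100, 128) so both full-vector and tail paths of the padded kernel run.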
#[test]
fn test_pre_erosion_row_padded_various_widths() {
for width in [8, 16, 24, 31, 32, 33, 64, 100, 128] {
let row: Vec<f32> = (0..width).map(|x| 100.0 + (x as f32) * 1.5).collect();
let row_above: Vec<f32> = (0..width).map(|x| 98.0 + (x as f32) * 1.5).collect();
let row_below: Vec<f32> = (0..width).map(|x| 102.0 + (x as f32) * 1.5).collect();
let mut row_padded = vec![0.0f32; width + 2];
let mut row_above_padded = vec![0.0f32; width + 2];
let mut row_below_padded = vec![0.0f32; width + 2];
row_padded[1..1 + width].copy_from_slice(&row);
row_padded[0] = row[0];
row_padded[width + 1] = row[width - 1];
row_above_padded[1..1 + width].copy_from_slice(&row_above);
row_above_padded[0] = row_above[0];
row_above_padded[width + 1] = row_above[width - 1];
row_below_padded[1..1 + width].copy_from_slice(&row_below);
row_below_padded[0] = row_below[0];
row_below_padded[width + 1] = row_below[width - 1];
let mut output_original = vec![0.0f32; width];
let mut output_padded = vec![0.0f32; width];
pre_erosion_row(&row, &row_above, &row_below, &mut output_original);
pre_erosion_row_padded(
&row_padded,
&row_above_padded,
&row_below_padded,
width,
&mut output_padded,
);
for x in 0..width {
assert!(
approx_eq(output_padded[x], output_original[x], EPSILON_REL),
"Width {} mismatch at x={}: padded={}, original={}, rel_diff={}",
width,
x,
output_padded[x],
output_original[x],
(output_padded[x] - output_original[x]).abs()
/ output_original[x].abs().max(1e-10)
);
}
}
}
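// Compare the archmage (AVX2+FMA) padded-row kernel against the wide
// implementation; skipped on CPUs without the required features.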
#[cfg(target_arch = "x86_64")]
#[test]
fn test_archmage_pre_erosion_matches_wide() {
use archmage::SimdToken;
let Some(token) = archmage::Desktop64::summon() else {
eprintln!("Skipping archmage test - CPU doesn't support AVX2+FMA");
return;
};
for width in [8, 16, 24, 32, 64, 100, 128, 256] {
let row: Vec<f32> = (0..width).map(|x| 100.0 + (x as f32) * 1.5).collect();
let row_above: Vec<f32> = (0..width).map(|x| 98.0 + (x as f32) * 1.5).collect();
let row_below: Vec<f32> = (0..width).map(|x| 102.0 + (x as f32) * 1.5).collect();
let mut row_padded = vec![0.0f32; width + 2];
let mut row_above_padded = vec![0.0f32; width + 2];
let mut row_below_padded = vec![0.0f32; width + 2];
row_padded[1..1 + width].copy_from_slice(&row);
row_padded[0] = row[0];
row_padded[width + 1] = row[width - 1];
row_above_padded[1..1 + width].copy_from_slice(&row_above);
row_above_padded[0] = row_above[0];
row_above_padded[width + 1] = row_above[width - 1];
row_below_padded[1..1 + width].copy_from_slice(&row_below);
row_below_padded[0] = row_below[0];
row_below_padded[width + 1] = row_below[width - 1];
let mut output_wide = vec![0.0f32; width];
pre_erosion_row_padded(
&row_padded,
&row_above_padded,
&row_below_padded,
width,
&mut output_wide,
);
let mut output_archmage = vec![0.0f32; width];
archmage_impl::mage_pre_erosion_row_padded(
token,
&row_padded,
&row_above_padded,
&row_below_padded,
width,
&mut output_archmage,
);
for x in 0..width {
let diff = (output_archmage[x] - output_wide[x]).abs();
let rel_diff = diff / output_wide[x].abs().max(1e-10);
assert!(
rel_diff < 1e-5, "Width {} archmage vs wide mismatch at x={}: archmage={}, wide={}, rel_diff={}",
width,
x,
output_archmage[x],
output_wide[x],
rel_diff
);
}
}
}
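// Compare the archmage fuzzy-erosion row kernel against an inline scalar
// model of the same weighted min-4-of-9 over a ring buffer of pre-erosion
// rows, across several widths and starting row offsets.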
#[cfg(target_arch = "x86_64")]
#[test]
fn test_archmage_fuzzy_erosion_matches_scalar() {
use archmage::{SimdToken, X64V3Token};
let Some(token) = X64V3Token::summon() else {
println!("AVX2+FMA not available, skipping archmage test");
return;
};
for (pe_w, buffer_rows, blocks_w) in [
(32, 12, 16),
(120, 12, 60),
(480, 12, 240),
] {
let mut pre_erosion_buffer = vec![0.0f32; pe_w * buffer_rows];
for row in 0..buffer_rows {
for col in 0..pe_w {
pre_erosion_buffer[row * pe_w + col] = ((row + col) % 256) as f32 / 256.0 + 0.1;
}
}
for pe_y_base in [0, 1, 5, 10] {
// All buffer_rows (12) ring-buffer rows are filled, so reads clamp at row 11.
let max_filled_row = 11;
let mut output_scalar = vec![0.0f32; blocks_w];
{
// Same weights as FUZZY_MUL0..FUZZY_MUL3 used by the implementation.
const MUL0: f32 = 0.125;
const MUL1: f32 = 0.075;
const MUL2: f32 = 0.06;
const MUL3: f32 = 0.05;
for bx in 0..blocks_w {
let pe_x_base = bx * 2;
let pe_y = pe_y_base as isize;
let mut sum = 0.0f32;
for dy in 0..2 {
for dx in 0..2 {
let cx = (pe_x_base + dx) as isize;
let cy = pe_y + dy as isize;
let mut vals = [0.0f32; 9];
for (i, (ny, nx)) in [
(-1, -1),
(-1, 0),
(-1, 1),
(0, -1),
(0, 0),
(0, 1),
(1, -1),
(1, 0),
(1, 1),
]
.iter()
.enumerate()
{
let px = (cx + nx).clamp(0, pe_w as isize - 1) as usize;
let py = (cy + ny).clamp(0, max_filled_row as isize) as usize;
let buffer_row = py % buffer_rows;
let buf_idx = buffer_row * pe_w + px;
vals[i] = pre_erosion_buffer[buf_idx];
}
// Partial selection sort: move the four smallest values into vals[0..4].
for i in 0..4 {
for j in (i + 1)..9 {
if vals[j] < vals[i] {
vals.swap(i, j);
}
}
}
sum += MUL0 * vals[0]
+ MUL1 * vals[1]
+ MUL2 * vals[2]
+ MUL3 * vals[3];
}
}
output_scalar[bx] = sum;
}
}
let mut output_archmage = vec![0.0f32; blocks_w];
super::archmage_impl::mage_compute_fuzzy_erosion_row(
&pre_erosion_buffer,
pe_w,
buffer_rows,
max_filled_row,
pe_y_base,
0,
blocks_w,
&mut output_archmage,
);
// The token only gates the test on CPU support; consume it here to
// silence the unused-variable warning.
let _ = token;
for bx in 0..blocks_w {
let diff = (output_archmage[bx] - output_scalar[bx]).abs();
let rel_diff = diff / output_scalar[bx].abs().max(1e-10);
assert!(
rel_diff < 1e-5,
"pe_w={} pe_y_base={} mismatch at bx={}: archmage={}, scalar={}, rel_diff={}",
pe_w,
pe_y_base,
bx,
output_archmage[bx],
output_scalar[bx],
rel_diff
);
}
}
}
}
}