use archmage::autoversion;
// Scaling and curve constants shared by the pre-erosion / masking kernels.
// NOTE(review): the K_SG_* and masking magic numbers look like they come from
// a jpegli/butteraugli-style adaptive-quantization model — verify against the
// upstream source before changing any of them.
const K_INPUT_SCALING: f32 = 1.0 / 255.0; // maps an 8-bit pixel value into [0, 1]
const K_EPSILON_RATIO: f32 = 1e-2;
const K_NUM_OFFSET_RATIO: f32 = K_EPSILON_RATIO / K_INPUT_SCALING / K_INPUT_SCALING;
const K_SG_MUL: f32 = 226.0480446705883;
const K_SG_MUL2: f32 = 1.0 / 73.377132366608819;
const K_INV_LOG2E: f32 = 0.6931471805599453; // ln(2), i.e. 1 / log2(e)
const K_SG_RET_MUL: f32 = K_SG_MUL2 * 18.6580932135 * K_INV_LOG2E;
// Folded numerator / denominator coefficients for ratio_of_derivatives.
const K_NUM_MUL_RATIO: f32 = K_SG_RET_MUL * 3.0 * K_SG_MUL;
const K_SG_VOFFSET: f32 = 7.14672470003;
const K_VOFFSET_RATIO: f32 = (K_SG_VOFFSET * K_INV_LOG2E + K_EPSILON_RATIO) / K_INPUT_SCALING;
const K_DEN_MUL_RATIO: f32 = K_INV_LOG2E * K_SG_MUL * K_INPUT_SCALING * K_INPUT_SCALING;
const K_MASKING_LOG_OFFSET: f32 = 28.0;
const K_MASKING_MUL: f32 = 211.50759899638012; // not referenced in this file chunk
const K_MASKING_MUL_SQRT: f32 = 145433.00828779556;
// Upper clamp for the squared, scaled pixel difference in pre_erosion_pixel.
const LIMIT: f32 = 0.2;
const MATCH_GAMMA_OFFSET: f32 = 0.019; // offset expressed in the [0, 1] domain
const GAMMA_OFFSET: f32 = MATCH_GAMMA_OFFSET / K_INPUT_SCALING; // same offset in the 0..255 domain
#[inline(always)]
fn ratio_of_derivatives(val: f32) -> f32 {
    // Rational function of the (non-negative) input: the ratio of two
    // cubic-ish polynomials evaluated with fused multiply-adds.
    // `ratio_of_derivatives_inv` returns the reciprocal of this value.
    let x = val.max(0.0); // negatives (and NaN) collapse to zero
    let x_sq = x * x;
    let numerator = x_sq.mul_add(K_NUM_MUL_RATIO, K_NUM_OFFSET_RATIO);
    let denominator = (x * K_DEN_MUL_RATIO).mul_add(x_sq, K_VOFFSET_RATIO);
    denominator / numerator
}
#[inline(always)]
fn masking_sqrt(v: f32) -> f32 {
    // Square root of an affine transform of `v`, scaled down by 1/4.
    let shifted = v.mul_add(K_MASKING_MUL_SQRT, K_MASKING_LOG_OFFSET);
    shifted.sqrt() * 0.25
}
#[inline(always)]
fn pre_erosion_pixel(pixel: f32, left: f32, right: f32, top: f32, bottom: f32) -> f32 {
    // Measures how much a pixel deviates from the mean of its 4-neighborhood,
    // scaled by a derivative-ratio curve, clamped, and passed through the
    // masking square root.
    let neighbor_mean = 0.25 * (left + right + top + bottom);
    let scale = ratio_of_derivatives(pixel + GAMMA_OFFSET);
    let scaled_diff = scale * (pixel - neighbor_mean);
    let clamped_sq = (scaled_diff * scaled_diff).min(LIMIT);
    masking_sqrt(clamped_sq)
}
#[autoversion]
pub fn pre_erosion_row_autovec(
    row: &[f32],
    row_above: &[f32],
    row_below: &[f32],
    width: usize,
    output: &mut [f32],
) {
    // Accumulates (`+=`) the pre-erosion response of one image row into
    // `output`. Input rows carry one pixel of padding on each side, hence
    // the `width + 2` contract below.
    debug_assert_eq!(row.len(), width + 2);
    debug_assert_eq!(row_above.len(), width + 2);
    debug_assert_eq!(row_below.len(), width + 2);
    debug_assert_eq!(output.len(), width);
    // Main body in fixed groups of 8 pixels so the compiler (via
    // #[autoversion]) can vectorize the inner loop.
    let vec_end = width - width % 8;
    for x in (0..vec_end).step_by(8) {
        for lane in 0..8 {
            let c = x + lane + 1; // +1 skips the left padding pixel
            output[x + lane] += pre_erosion_pixel(
                row[c],
                row[c - 1],
                row[c + 1],
                row_above[c],
                row_below[c],
            );
        }
    }
    // Scalar tail for the remaining width % 8 pixels.
    for x in vec_end..width {
        let c = x + 1;
        output[x] +=
            pre_erosion_pixel(row[c], row[c - 1], row[c + 1], row_above[c], row_below[c]);
    }
}
#[autoversion]
pub fn pre_erosion_row_autovec_iter(
    row: &[f32],
    row_above: &[f32],
    row_below: &[f32],
    width: usize,
    output: &mut [f32],
) {
    // Plain scalar form of pre_erosion_row_autovec: one straight loop over
    // the row, relying on #[autoversion]/LLVM for any vectorization.
    // Inputs carry one pixel of padding on each side.
    debug_assert_eq!(row.len(), width + 2);
    debug_assert_eq!(row_above.len(), width + 2);
    debug_assert_eq!(row_below.len(), width + 2);
    debug_assert_eq!(output.len(), width);
    for x in 0..width {
        let c = x + 1; // skip the left padding pixel
        let center = row[c];
        let (west, east) = (row[c - 1], row[c + 1]);
        let (north, south) = (row_above[c], row_below[c]);
        output[x] += pre_erosion_pixel(center, west, east, north, south);
    }
}
// Bias added to each pixel before the gamma-modulation ratio: 0.16 in the
// [0, 1] domain, rescaled into the 0..255 pixel domain (0.16 * 255 = 40.8).
const K_BIAS: f32 = 0.16 / K_INPUT_SCALING;
#[inline(always)]
fn ratio_of_derivatives_inv(val: f32) -> f32 {
    // Reciprocal of `ratio_of_derivatives`: identical polynomial evaluation,
    // but the division is flipped (numerator over denominator).
    let x = val.max(0.0); // negatives (and NaN) collapse to zero
    let x_sq = x * x;
    let numerator = x_sq.mul_add(K_NUM_MUL_RATIO, K_NUM_OFFSET_RATIO);
    let denominator = (x * K_DEN_MUL_RATIO).mul_add(x_sq, K_VOFFSET_RATIO);
    numerator / denominator
}
#[autoversion]
pub fn gamma_modulation_sum_8x8_autovec(
    block: &[f32],
    stride: usize,
    block_y: usize,
    img_height: usize,
) -> f32 {
    // Sums the inverse derivative ratio over an (up to) 8x8 block.
    // `block` points at the block's top-left pixel inside a plane with row
    // pitch `stride`; rows past the image bottom or past the end of the
    // backing slice are skipped.
    let mut total = 0.0f32;
    for dy in 0..8 {
        if block_y + dy >= img_height {
            continue; // row lies below the image
        }
        let start = dy * stride;
        // `get` returns None exactly when start + 8 > block.len().
        if let Some(row8) = block.get(start..start + 8) {
            for &pixel in row8 {
                total += ratio_of_derivatives_inv(pixel + K_BIAS);
            }
        }
    }
    total
}
#[autoversion]
pub fn hf_modulation_sum_8x8_autovec(
    block: &[f32],
    stride: usize,
    block_y: usize,
    img_height: usize,
) -> f32 {
    // Sums absolute differences between horizontally and vertically adjacent
    // pixels over an 8x8 block (up to 7*8 + 8*7 = 112 pairs).
    // `block` points at the block's top-left pixel inside a plane with row
    // pitch `stride`; `block_y`/`img_height` bound the rows that exist in
    // the image, and slice-length guards skip rows the buffer doesn't hold.
    let mut h_sum = 0.0f32;
    let mut v_sum = 0.0f32;
    for dy in 0..8 {
        let y = block_y + dy;
        if y >= img_height {
            continue;
        }
        let row_start = dy * stride;
        // Horizontal pairs touch indices row_start..=row_start + 7, so
        // `row_start + 8 <= len` is the exact in-bounds condition.
        // (Was `row_start + 9` — an off-by-one that wrongly skipped the last
        // row of a tightly-sized buffer, e.g. when stride == 8.)
        if row_start + 8 <= block.len() {
            for i in 0..7 {
                let p = block[row_start + i];
                let p_right = block[row_start + i + 1];
                h_sum += (p - p_right).abs();
            }
        }
        // Vertical pairs need the next row inside both the image and slice.
        if dy < 7 && y + 1 < img_height {
            let next_row_start = (dy + 1) * stride;
            if row_start + 8 <= block.len() && next_row_start + 8 <= block.len() {
                for i in 0..8 {
                    let p = block[row_start + i];
                    let p_below = block[next_row_start + i];
                    v_sum += (p - p_below).abs();
                }
            }
        }
    }
    h_sum + v_sum
}
#[inline(always)]
fn fast_log2(x: f32) -> f32 {
    // Fast base-2 logarithm: split the float into exponent and mantissa via
    // its bit pattern, then approximate log2(mantissa) with a degree-5
    // Taylor-style polynomial on t = mantissa - 1 (exact at powers of two,
    // approximate in between). Non-positive input yields -inf.
    if x <= 0.0 {
        return f32::NEG_INFINITY;
    }
    let raw = x.to_bits() as i32;
    let exponent = (raw >> 23) - 127; // unbiased binary exponent
    // Rebuild the mantissa as a float in [1, 2).
    let mantissa = f32::from_bits((raw & 0x007F_FFFF) as u32 | 0x3F80_0000);
    let t = mantissa - 1.0;
    // Horner evaluation via fused multiply-adds; coefficients are
    // log2(e)/5, -log2(e)/4, log2(e)/3, -log2(e)/2, log2(e).
    let poly = 0.2885390082_f32
        .mul_add(t, -0.3606737602)
        .mul_add(t, 0.4808983470)
        .mul_add(t, -0.7213475204)
        .mul_add(t, 1.442695041);
    exponent as f32 + t * poly
}
#[inline(always)]
fn fast_exp2(x: f32) -> f32 {
    // Fast 2^x: the integer part of x is assembled directly into the float
    // exponent field; the fractional part goes through a degree-4 polynomial
    // approximation of 2^f on [0, 1). Input is clamped to the finite f32
    // exponent range, so the result never overflows or denormalizes the
    // exponent-bit construction.
    let clamped = x.clamp(-126.0, 127.0);
    let int_part = clamped.floor();
    let frac = clamped - int_part;
    // Horner evaluation of the fractional polynomial (same associativity
    // as a nested 1 + f*(c1 + f*(c2 + f*(c3 + f*c4))) form).
    let mut poly = 0.009618129107628477f32;
    poly = 0.055504108664821579 + frac * poly;
    poly = 0.24022650695910071 + frac * poly;
    poly = 0.6931471805599453 + frac * poly;
    let p = 1.0 + frac * poly;
    // 2^int_part built by writing the biased exponent bits directly.
    let scale_bits = ((int_part as i32 + 127) as u32) << 23;
    f32::from_bits(scale_bits) * p
}
#[autoversion]
pub fn per_block_modulations_row_autovec(
    input: &[f32],
    stride: usize,
    _img_width: usize,
    img_height: usize,
    by: usize,
    block_w: usize,
    aq_row: &mut [f32],
    mul: f32,
    add: f32,
) {
    // Converts the fuzzy-erosion value stored in `aq_row[bx]` for each 8x8
    // block of block-row `by` into a final quant-field value, in place:
    //   aq_row[bx] = exp(masking + hf_term + gamma_term) * mul + add
    // where the three terms are computed below from `input` (pixel plane
    // with row pitch `stride`).
    //
    // NOTE(review): the constants appear to come from a jpegli/butteraugli-
    // style adaptive-quantization model — verify against upstream before
    // changing them.
    // 112 = 7*8 horizontal + 8*7 vertical pairs summed by
    // hf_modulation_sum_8x8_autovec.
    const K_SUM_COEFF: f32 = -2.0052193233688884 * K_INPUT_SCALING / 112.0;
    const K_GAMMA: f32 = -0.15526878023684174 * K_INV_LOG2E;
    // 64 = pixel count of a full 8x8 block (turns the gamma sum into a mean).
    const K_SCALE: f32 = K_INPUT_SCALING / 64.0;
    // log2(e): converts the natural-log-domain exponent for fast_exp2.
    const LOG2_E: f32 = 1.442695041;
    const K_MASK_BASE: f32 = -0.74174993;
    const K_MASK_MUL4: f32 = 3.2353257320940401;
    const K_MASK_MUL2: f32 = 12.906028311180409;
    const K_MASK_OFFSET2: f32 = 305.04035728311436;
    const K_MASK_MUL3: f32 = 5.0220313103171232;
    const K_MASK_OFFSET3: f32 = 2.1925739705298404;
    const K_MASK_OFFSET4: f32 = 0.25 * K_MASK_OFFSET3;
    const K_MASK_MUL0: f32 = 0.74760422233706747;
    let y_start = by * 8; // first pixel row of this block row
    for bx in 0..block_w {
        let x_start = bx * 8;
        // (1) Masking term: sum of three reciprocal basis functions of the
        // clamped-below fuzzy-erosion input.
        let fuzzy_val = aq_row[bx];
        let v1 = (fuzzy_val * K_MASK_MUL0).max(1e-3);
        let v2 = 1.0 / (v1 + K_MASK_OFFSET2);
        let v3 = 1.0 / (v1 * v1 + K_MASK_OFFSET3);
        let v4 = 1.0 / (v1 * v1 + K_MASK_OFFSET4);
        let mut out_val = K_MASK_MUL4.mul_add(
            v4,
            K_MASK_MUL2.mul_add(v2, K_MASK_MUL3.mul_add(v3, K_MASK_BASE)),
        );
        // (2) High-frequency activity: weighted sum of absolute neighbor
        // differences inside the block. `block` is the plane from the block's
        // top-left pixel onward (panics if the offset is past the slice).
        let block_offset = y_start * stride + x_start;
        let block = &input[block_offset..];
        let hf_sum = hf_modulation_sum_8x8_autovec(block, stride, y_start, img_height);
        out_val += hf_sum * K_SUM_COEFF;
        // (3) Gamma term: log2 of the block's scaled derivative-ratio sum.
        let gamma_sum = gamma_modulation_sum_8x8_autovec(block, stride, y_start, img_height);
        let overall_ratio = gamma_sum * K_SCALE;
        // Guard: fast_log2 returns -inf for non-positive input, which would
        // blow up the exponential below; substitute 0 instead.
        let log_ratio = if overall_ratio > 0.0 {
            fast_log2(overall_ratio)
        } else {
            0.0
        };
        out_val += K_GAMMA * log_ratio;
        // exp(out_val) == 2^(out_val * log2(e)), then the affine (mul, add).
        let quant_field = fast_exp2(out_val * LOG2_E).mul_add(mul, add);
        aq_row[bx] = quant_field;
    }
}
// Blend weights applied to the four smallest values of each 3x3 neighborhood
// in weighted_min4_of_9_autovec (the "fuzzy" part of the fuzzy erosion).
const FUZZY_MUL0: f32 = 0.125;
const FUZZY_MUL1: f32 = 0.075;
const FUZZY_MUL2: f32 = 0.06;
const FUZZY_MUL3: f32 = 0.05;
#[inline(always)]
fn cas_idx(v: &mut [[f32; 8]; 9], a: usize, b: usize) {
    // Compare-exchange step: per lane, leave the smaller of the two values
    // in v[a] and the larger in v[b] (branch-free min/max, SIMD-friendly).
    for lane in 0..8 {
        let (x, y) = (v[a][lane], v[b][lane]);
        v[a][lane] = x.min(y);
        v[b][lane] = x.max(y);
    }
}
#[inline(always)]
fn weighted_min4_of_9_autovec(mut v: [[f32; 8]; 9]) -> [f32; 8] {
    // Partial sorting network over 9 inputs per lane; after these
    // compare-exchange stages, v[0..=3] appear to hold the four smallest
    // values per lane, which are then blended with fixed weights.
    // Stage order matters and is preserved exactly.
    const STAGES: [(usize, usize); 19] = [
        (0, 1), (2, 3), (4, 5), (6, 7),
        (0, 2), (1, 3), (4, 6), (5, 7),
        (0, 4), (1, 5), (2, 6), (3, 7),
        (0, 8), (4, 8),
        (1, 2), (1, 4), (2, 4), (3, 4),
        (5, 8),
    ];
    for &(lo, hi) in STAGES.iter() {
        cas_idx(&mut v, lo, hi);
    }
    // Fixed weighted blend of the four selected values, lane by lane.
    std::array::from_fn(|lane| {
        FUZZY_MUL0 * v[0][lane]
            + FUZZY_MUL1 * v[1][lane]
            + FUZZY_MUL2 * v[2][lane]
            + FUZZY_MUL3 * v[3][lane]
    })
}
#[autoversion]
pub fn compute_fuzzy_erosion_blocks_autovec(
    pre_erosion_buffer: &[f32],
    pe_w: usize,
    buffer_rows: usize,
    pe_y_base: isize,
    max_filled_row: isize,
    start: usize,
    end: usize,
    out: &mut [f32],
) -> usize {
    // Fuzzy erosion over the pre-erosion plane, 8 output blocks at a time.
    //
    // `pre_erosion_buffer` is a ring buffer of `buffer_rows` rows of width
    // `pe_w`: plane row y is stored at offset (y % buffer_rows) * pe_w.
    // For each block bx in [start, end), the output is the sum over a 2x2
    // patch (rows pe_y_base..pe_y_base+2, cols 2*(bx-start)..+2) of a
    // weighted mix of the 4 smallest values in each cell's 3x3 neighborhood,
    // with coordinates clamped to the plane edges.
    //
    // Only full groups of 8 blocks are processed; the returned count tells
    // the caller how many outputs were written (a trailing partial group is
    // presumably handled by a scalar fallback — TODO confirm at call site).
    let max_x = pe_w as isize - 1;
    // Highest plane row filled so far; neighbor rows are clamped to it.
    let max_y = max_filled_row.max(0);
    let mut processed = 0;
    let mut bx = start;
    while bx + 8 <= end {
        // Column of the top-left patch cell for each of the 8 lanes
        // (2x horizontal downscale relative to the block index).
        let base_cx: [isize; 8] = std::array::from_fn(|i| ((bx + i - start) * 2) as isize);
        let mut sum = [0.0f32; 8];
        for dy in 0..2isize {
            for dx in 0..2isize {
                let cy = pe_y_base + dy;
                let cx: [isize; 8] = std::array::from_fn(|i| base_cx[i] + dx);
                // Gather each lane's 3x3 neighborhood into v: one neighbor
                // offset per row of v, 8 lanes per row.
                let mut v: [[f32; 8]; 9] = [[0.0; 8]; 9];
                for (neighbor_idx, (ny, nx)) in [
                    (-1isize, -1isize),
                    (-1, 0),
                    (-1, 1),
                    (0, -1),
                    (0, 0),
                    (0, 1),
                    (1, -1),
                    (1, 0),
                    (1, 1),
                ]
                .iter()
                .enumerate()
                {
                    // Clamp the row to the filled range, then map it into
                    // the ring buffer.
                    let py = (cy + ny).clamp(0, max_y) as usize;
                    let buffer_row = py % buffer_rows;
                    let row_offset = buffer_row * pe_w;
                    for lane in 0..8 {
                        let px = (cx[lane] + nx).clamp(0, max_x) as usize;
                        let idx = row_offset + px;
                        // Defensive: reads past the slice contribute 0.
                        v[neighbor_idx][lane] = if idx < pre_erosion_buffer.len() {
                            pre_erosion_buffer[idx]
                        } else {
                            0.0
                        };
                    }
                }
                let result = weighted_min4_of_9_autovec(v);
                for i in 0..8 {
                    sum[i] += result[i];
                }
            }
        }
        out[bx..bx + 8].copy_from_slice(&sum);
        bx += 8;
        processed += 8;
    }
    processed
}
#[cfg(test)]
mod tests {
    use super::*;

    // The chunked and plain implementations must agree on varied input.
    #[test]
    fn test_pre_erosion_row_autovec_matches_reference() {
        let width = 64;
        let mut row = vec![0.0f32; width + 2];
        let mut row_above = vec![0.0f32; width + 2];
        let mut row_below = vec![0.0f32; width + 2];
        // Deterministic pseudo-varied pixel data in [0, 255).
        for i in 0..width + 2 {
            row[i] = (i as f32 * 3.7) % 255.0;
            row_above[i] = (i as f32 * 2.3 + 10.0) % 255.0;
            row_below[i] = (i as f32 * 4.1 + 20.0) % 255.0;
        }
        let mut output1 = vec![0.0f32; width];
        let mut output2 = vec![0.0f32; width];
        pre_erosion_row_autovec(&row, &row_above, &row_below, width, &mut output1);
        pre_erosion_row_autovec_iter(&row, &row_above, &row_below, width, &mut output2);
        for i in 0..width {
            let diff = (output1[i] - output2[i]).abs();
            assert!(
                diff < 1e-5,
                "Mismatch at {}: {} vs {} (diff {})",
                i,
                output1[i],
                output2[i],
                diff
            );
        }
    }

    // Sanity check: constant rows with differing neighbors above/below must
    // still produce a nonzero pre-erosion response.
    #[test]
    fn test_pre_erosion_row_autovec_nonzero() {
        let width = 32;
        let row = vec![128.0f32; width + 2];
        let row_above = vec![100.0f32; width + 2];
        let row_below = vec![150.0f32; width + 2];
        let mut output = vec![0.0f32; width];
        pre_erosion_row_autovec(&row, &row_above, &row_below, width, &mut output);
        let sum: f32 = output.iter().sum();
        assert!(sum > 0.0, "Output should be non-zero, got sum={}", sum);
    }

    // Gamma sum over a full 8x8 block should be positive and finite.
    #[test]
    fn test_gamma_modulation_sum_8x8_autovec() {
        let stride = 9; let mut block = vec![0.0f32; stride * 8];
        for dy in 0..8 {
            for dx in 0..8 {
                block[dy * stride + dx] = ((dy * 8 + dx) as f32 * 3.7) % 255.0;
            }
        }
        let sum = gamma_modulation_sum_8x8_autovec(&block, stride, 0, 8);
        assert!(sum > 0.0, "Gamma sum should be positive, got {}", sum);
        assert!(sum.is_finite(), "Gamma sum should be finite");
    }

    // Linear ramp: horizontal pairs differ by 5 (7*8 = 56 of them -> 280),
    // vertical pairs by 10 (8*7 = 56 of them -> 560); total 840.
    #[test]
    fn test_hf_modulation_sum_8x8_autovec() {
        let stride = 9;
        let mut block = vec![0.0f32; stride * 8];
        for dy in 0..8 {
            for dx in 0..9 {
                block[dy * stride + dx] = (dy * 10 + dx * 5) as f32;
            }
        }
        let sum = hf_modulation_sum_8x8_autovec(&block, stride, 0, 8);
        let expected = 840.0;
        let diff = (sum - expected).abs();
        assert!(
            diff < 0.1,
            "HF sum mismatch: got {}, expected {}",
            sum,
            expected
        );
    }

    // End-to-end smoke test: every quant-field value must be finite and
    // positive for plausible pixel data.
    #[test]
    fn test_per_block_modulations_row_autovec() {
        let block_w = 8;
        let width = block_w * 8;
        let height = 8;
        let stride = width + 1;
        let mut input = vec![128.0f32; stride * height];
        for y in 0..height {
            for x in 0..width {
                input[y * stride + x] = 100.0 + ((x + y) % 50) as f32;
            }
        }
        let mut aq_row = vec![0.5f32; block_w];
        let mul = 0.841;
        let add = 0.1;
        per_block_modulations_row_autovec(
            &input,
            stride,
            width,
            height,
            0,
            block_w,
            &mut aq_row,
            mul,
            add,
        );
        for (bx, &val) in aq_row.iter().enumerate() {
            assert!(
                val.is_finite(),
                "Block {} has non-finite value: {}",
                bx,
                val
            );
            assert!(val > 0.0, "Block {} has non-positive value: {}", bx, val);
        }
    }
}