use crate::hlg;
/// Multiplies every RGB pixel of `row` by the 3×3 matrix `m`, in place.
///
/// `m[i][j]` is the weight of input channel `j` in output channel `i`, i.e.
/// `out[i] = m[i][0] * r + m[i][1] * g + m[i][2] * b`.
///
/// Pixels are handled eight at a time through `f32x8`; the final
/// `row.len() % 8` pixels are computed with scalar `f32` arithmetic using the
/// same formula.
///
/// * `token` – passed to every `f32x8` constructor (`splat`, `load`).
/// * `m` – the 3×3 coefficient matrix.
/// * `row` – pixels to transform; each one is overwritten with its result.
#[archmage::magetypes(define(f32x8), v4(cfg(avx512)), v3, neon, wasm128, scalar)]
pub(crate) fn apply_matrix_rgb_tier(token: Token, m: &[[f32; 3]; 3], row: &mut [[f32; 3]]) {
    let [[a00, a01, a02], [a10, a11, a12], [a20, a21, a22]] = *m;

    // Broadcast every coefficient once, outside the pixel loop.
    let (k00, k01, k02) = (
        f32x8::splat(token, a00),
        f32x8::splat(token, a01),
        f32x8::splat(token, a02),
    );
    let (k10, k11, k12) = (
        f32x8::splat(token, a10),
        f32x8::splat(token, a11),
        f32x8::splat(token, a12),
    );
    let (k20, k21, k22) = (
        f32x8::splat(token, a20),
        f32x8::splat(token, a21),
        f32x8::splat(token, a22),
    );

    // `by_ref` leaves `blocks` usable afterwards so the remainder can be taken.
    let mut blocks = row.chunks_exact_mut(8);
    for block in blocks.by_ref() {
        // De-interleave the eight pixels into one array per channel.
        let mut reds = [0.0_f32; 8];
        let mut greens = [0.0_f32; 8];
        let mut blues = [0.0_f32; 8];
        for (lane, &[r, g, b]) in block.iter().enumerate() {
            reds[lane] = r;
            greens[lane] = g;
            blues[lane] = b;
        }

        let vr = f32x8::load(token, &reds);
        let vg = f32x8::load(token, &greens);
        let vb = f32x8::load(token, &blues);

        let out_r = (k00 * vr + k01 * vg + k02 * vb).to_array();
        let out_g = (k10 * vr + k11 * vg + k12 * vb).to_array();
        let out_b = (k20 * vr + k21 * vg + k22 * vb).to_array();

        // Re-interleave the results back into the pixel slice.
        for (lane, px) in block.iter_mut().enumerate() {
            *px = [out_r[lane], out_g[lane], out_b[lane]];
        }
    }

    // Scalar path for the pixels that did not fill a block of eight.
    for px in blocks.into_remainder() {
        let [r, g, b] = *px;
        *px = [
            a00 * r + a01 * g + a02 * b,
            a10 * r + a11 * g + a12 * b,
            a20 * r + a21 * g + a22 * b,
        ];
    }
}
/// Multiplies the colour channels of every RGBA pixel in `row` by the 3×3
/// matrix `m`, in place.
///
/// Only elements 0..3 of each pixel are read and rewritten
/// (`out[i] = m[i][0] * r + m[i][1] * g + m[i][2] * b`); element 3 is never
/// touched.
///
/// Pixels are handled eight at a time through `f32x8`; the final
/// `row.len() % 8` pixels are computed with scalar `f32` arithmetic using the
/// same formula.
///
/// * `token` – passed to every `f32x8` constructor (`splat`, `load`).
/// * `m` – the 3×3 coefficient matrix.
/// * `row` – pixels to transform; the first three elements of each one are
///   overwritten.
#[archmage::magetypes(define(f32x8), v4(cfg(avx512)), v3, neon, wasm128, scalar)]
pub(crate) fn apply_matrix_rgba_tier(token: Token, m: &[[f32; 3]; 3], row: &mut [[f32; 4]]) {
    let [to_r, to_g, to_b] = *m;

    // One broadcast vector per matrix coefficient, grouped by output channel.
    let r_from_r = f32x8::splat(token, to_r[0]);
    let r_from_g = f32x8::splat(token, to_r[1]);
    let r_from_b = f32x8::splat(token, to_r[2]);
    let g_from_r = f32x8::splat(token, to_g[0]);
    let g_from_g = f32x8::splat(token, to_g[1]);
    let g_from_b = f32x8::splat(token, to_g[2]);
    let b_from_r = f32x8::splat(token, to_b[0]);
    let b_from_g = f32x8::splat(token, to_b[1]);
    let b_from_b = f32x8::splat(token, to_b[2]);

    // `by_ref` leaves `groups` usable afterwards so the remainder can be taken.
    let mut groups = row.chunks_exact_mut(8);
    for group in groups.by_ref() {
        // Pull the three colour channels out of the interleaved pixels.
        let mut plane_r = [0.0_f32; 8];
        let mut plane_g = [0.0_f32; 8];
        let mut plane_b = [0.0_f32; 8];
        for (lane, &[r, g, b, _]) in group.iter().enumerate() {
            plane_r[lane] = r;
            plane_g[lane] = g;
            plane_b[lane] = b;
        }

        let vr = f32x8::load(token, &plane_r);
        let vg = f32x8::load(token, &plane_g);
        let vb = f32x8::load(token, &plane_b);

        let mixed_r = (r_from_r * vr + r_from_g * vg + r_from_b * vb).to_array();
        let mixed_g = (g_from_r * vr + g_from_g * vg + g_from_b * vb).to_array();
        let mixed_b = (b_from_r * vr + b_from_g * vg + b_from_b * vb).to_array();

        // Write colour back; the fourth element is deliberately not bound.
        for (lane, px) in group.iter_mut().enumerate() {
            let [dst_r, dst_g, dst_b, _] = px;
            *dst_r = mixed_r[lane];
            *dst_g = mixed_g[lane];
            *dst_b = mixed_b[lane];
        }
    }

    // Scalar path for the pixels that did not fill a group of eight.
    for px in groups.into_remainder() {
        let [r, g, b, _] = *px;
        px[0] = to_r[0] * r + to_r[1] * g + to_r[2] * b;
        px[1] = to_g[0] * r + to_g[1] * g + to_g[2] * b;
        px[2] = to_b[0] * r + to_b[1] * g + to_b[2] * b;
    }
}
/// Compresses RGB pixels whose largest channel exceeds 1.0, in place.
///
/// For each block of eight pixels the vector path does the following per lane:
///
/// 1. Clamps every channel to be at least 0.
/// 2. Takes `peak` / `floor` as the largest / smallest channel and limits both
///    to at most 1.0.
/// 3. Remaps each channel as
///    `floor_clipped + (c - floor) * (peak_clipped - floor_clipped) / max(peak - floor, EPSILON)`.
/// 4. Where `peak - floor <= EPSILON`, uses `peak_clipped` for all three
///    channels instead of the remapped value.
/// 5. Keeps the remapped value only where `peak > 1.0`; all other lanes receive
///    the channel from step 1 (so negatives still become 0 there).
///
/// The final `row.len() % 8` pixels are passed one by one to
/// `crate::gamut::soft_clip`.
/// NOTE(review): that helper is not visible here — presumably it implements the
/// same formula; confirm.
///
/// * `token` – passed to every `f32x8` constructor (`zero`, `splat`, `load`).
/// * `row` – pixels to clip; overwritten with the result.
#[archmage::magetypes(define(f32x8), v4(cfg(avx512)), v3, neon, wasm128, scalar)]
pub(crate) fn soft_clip_tier(token: Token, row: &mut [[f32; 3]]) {
    let v_zero = f32x8::zero(token);
    let v_one = f32x8::splat(token, 1.0);
    let v_eps = f32x8::splat(token, f32::EPSILON);

    // `by_ref` leaves `blocks` usable afterwards so the remainder can be taken.
    let mut blocks = row.chunks_exact_mut(8);
    for block in blocks.by_ref() {
        // De-interleave the eight pixels into one array per channel.
        let mut reds = [0.0_f32; 8];
        let mut greens = [0.0_f32; 8];
        let mut blues = [0.0_f32; 8];
        for (lane, &[r, g, b]) in block.iter().enumerate() {
            reds[lane] = r;
            greens[lane] = g;
            blues[lane] = b;
        }

        // Negative channel values are raised to zero before anything else.
        let r = f32x8::load(token, &reds).max(v_zero);
        let g = f32x8::load(token, &greens).max(v_zero);
        let b = f32x8::load(token, &blues).max(v_zero);

        let peak = r.max(g).max(b);
        let floor = r.min(g).min(b);
        let peak_clipped = peak.min(v_one);
        let floor_clipped = floor.min(v_one);

        // The divisor is kept at or above EPSILON so the division is well defined.
        let span = peak - floor;
        let gain = (peak_clipped - floor_clipped) / span.max(v_eps);

        // `flat`: all channels (nearly) equal; `needs_clip`: peak is above 1.0.
        let flat = span.simd_le(v_eps);
        let needs_clip = peak.simd_gt(v_one);

        let scaled_r = floor_clipped + (r - floor) * gain;
        let scaled_g = floor_clipped + (g - floor) * gain;
        let scaled_b = floor_clipped + (b - floor) * gain;

        let clipped_r = f32x8::blend(flat, peak_clipped, scaled_r);
        let clipped_g = f32x8::blend(flat, peak_clipped, scaled_g);
        let clipped_b = f32x8::blend(flat, peak_clipped, scaled_b);

        let out_r = f32x8::blend(needs_clip, clipped_r, r).to_array();
        let out_g = f32x8::blend(needs_clip, clipped_g, g).to_array();
        let out_b = f32x8::blend(needs_clip, clipped_b, b).to_array();

        // Re-interleave the results back into the pixel slice.
        for (lane, px) in block.iter_mut().enumerate() {
            *px = [out_r[lane], out_g[lane], out_b[lane]];
        }
    }

    // Leftover pixels go through the scalar helper.
    for px in blocks.into_remainder() {
        *px = crate::gamut::soft_clip(*px);
    }
}
/// Writes a per-pixel out-of-gamut flag for `row` into `out`.
///
/// `out[i]` becomes `1.0` when any channel of `row[i]` is below 0 or above 1,
/// and `0.0` otherwise. Full blocks of eight pixels are tested with `f32x8`
/// comparisons; the final `row.len() % 8` pixels are tested with
/// `crate::gamut::is_out_of_gamut`.
/// NOTE(review): that helper is not visible here — presumably it applies the
/// same `< 0 || > 1` test; confirm.
///
/// * `token` – passed to every `f32x8` constructor (`zero`, `splat`, `load`).
/// * `row` – pixels to test.
/// * `out` – destination flags. Elements past `row.len()` are left unchanged.
///
/// # Panics
///
/// Panics if `out` is shorter than the number of pixels covered by full blocks
/// of eight (`row.len() - row.len() % 8`). If `out` covers the full blocks but
/// not the whole tail, the extra tail pixels are silently skipped.
#[archmage::magetypes(define(f32x8), v4(cfg(avx512)), v3, neon, wasm128, scalar)]
pub(crate) fn is_out_of_gamut_mask_tier(token: Token, row: &[[f32; 3]], out: &mut [f32]) {
    let lower = f32x8::zero(token);
    let upper = f32x8::splat(token, 1.0);

    // Split both slices at the last multiple of eight pixels.
    let full_len = row.len() - row.len() % 8;
    let (row_head, row_rest) = row.split_at(full_len);
    let (out_head, out_rest) = out.split_at_mut(full_len);

    for (block, flags_dst) in row_head.chunks_exact(8).zip(out_head.chunks_exact_mut(8)) {
        // De-interleave the eight pixels into one array per channel.
        let mut reds = [0.0_f32; 8];
        let mut greens = [0.0_f32; 8];
        let mut blues = [0.0_f32; 8];
        for (lane, &[r, g, b]) in block.iter().enumerate() {
            reds[lane] = r;
            greens[lane] = g;
            blues[lane] = b;
        }

        let r = f32x8::load(token, &reds);
        let g = f32x8::load(token, &greens);
        let b = f32x8::load(token, &blues);

        let below = r.simd_lt(lower) | g.simd_lt(lower) | b.simd_lt(lower);
        let above = r.simd_gt(upper) | g.simd_gt(upper) | b.simd_gt(upper);
        let outside = below | above;

        // `upper`/`lower` double as the 1.0 / 0.0 flag values.
        let flags = f32x8::blend(outside, upper, lower).to_array();
        // `chunks_exact_mut(8)` guarantees `flags_dst` holds exactly 8 elements.
        flags_dst.copy_from_slice(&flags);
    }

    // Scalar path; `zip` stops at the shorter of the two tails.
    for (px, flag) in row_rest.iter().zip(out_rest.iter_mut()) {
        // bool -> 0/1 -> 0.0/1.0
        *flag = f32::from(u8::from(crate::gamut::is_out_of_gamut(*px)));
    }
}
// Per-channel weights used by `hlg_ootf_exact_tier` to form the weighted sum
// `y = LR * r + LG * g + LB * b` that drives its per-pixel scale factor.
// NOTE(review): the values look like the BT.2020 / BT.2100 luminance
// coefficients — confirm against the spec before relying on that.
/// Weight of the red channel.
const LR: f32 = 0.2627;
/// Weight of the green channel.
const LG: f32 = 0.6780;
/// Weight of the blue channel.
const LB: f32 = 0.0593;
/// Scales every RGB pixel of `row` by `y^k`, in place, where
/// `y = LR * r + LG * g + LB * b` is computed from that pixel.
///
/// Pixels whose `y` is not greater than zero are set to `[0.0, 0.0, 0.0]`.
///
/// Full blocks of eight pixels use `f32x8` and its `pow_midp`; the final
/// `row.len() % 8` pixels use scalar arithmetic and `crate::math::powf`.
/// NOTE(review): the two power routines may differ in precision, so a pixel's
/// result could depend on whether it lands in a full block or in the tail —
/// confirm this is acceptable.
///
/// * `token` – passed to every `f32x8` constructor (`zero`, `splat`, `load`).
/// * `row` – pixels to transform; overwritten with the result.
/// * `k` – exponent applied to `y` to obtain the scale factor.
#[archmage::magetypes(define(f32x8), v3, neon, wasm128, scalar)]
pub(crate) fn hlg_ootf_exact_tier(token: Token, row: &mut [[f32; 3]], k: f32) {
    let lr = f32x8::splat(token, LR);
    let lg = f32x8::splat(token, LG);
    let lb = f32x8::splat(token, LB);
    let zero = f32x8::zero(token);
    let pos_eps = f32x8::splat(token, f32::MIN_POSITIVE);

    let mut iter = row.chunks_exact_mut(8);
    for chunk in &mut iter {
        // De-interleave the eight pixels into one array per channel.
        let mut ra = [0.0_f32; 8];
        let mut ga = [0.0_f32; 8];
        let mut ba = [0.0_f32; 8];
        for (i, px) in chunk.iter().enumerate() {
            ra[i] = px[0];
            ga[i] = px[1];
            ba[i] = px[2];
        }
        let r = f32x8::load(token, &ra);
        let g = f32x8::load(token, &ga);
        let b = f32x8::load(token, &ba);

        let y = lr * r + lg * g + lb * b;
        // Raise the base to a tiny positive value before `pow_midp`; lanes
        // where that substitution happened are zeroed by the blend below.
        let scale = y.max(pos_eps).pow_midp(k);
        let nz = y.simd_gt(zero);

        // Blend the scaled vectors directly — no need to spill them to arrays
        // and reload them first.
        let or_arr = f32x8::blend(nz, r * scale, zero).to_array();
        let og_arr = f32x8::blend(nz, g * scale, zero).to_array();
        let ob_arr = f32x8::blend(nz, b * scale, zero).to_array();

        for (i, px) in chunk.iter_mut().enumerate() {
            px[0] = or_arr[i];
            px[1] = og_arr[i];
            px[2] = ob_arr[i];
        }
    }

    // Scalar path for the pixels that did not fill a block of eight.
    // NOTE(review): for a NaN `y` this branch is not taken (`NaN <= 0.0` is
    // false) and the pixel becomes NaN, whereas the vector path presumably
    // zeroes such lanes (`y > 0` is false) — confirm which is intended.
    let tail = iter.into_remainder();
    for px in tail.iter_mut() {
        let y = LR * px[0] + LG * px[1] + LB * px[2];
        if y <= 0.0 {
            *px = [0.0, 0.0, 0.0];
            continue;
        }
        let scale = crate::math::powf(y, k);
        px[0] *= scale;
        px[1] *= scale;
        px[2] *= scale;
    }
}
/// Raises every channel of every RGB pixel in `row` to the power `exponent`,
/// in place.
///
/// Each channel is treated independently. Full blocks of eight pixels use
/// `f32x8::pow_midp`; the final `row.len() % 8` pixels use
/// `crate::math::powf`.
///
/// * `token` – passed to `f32x8::load`.
/// * `row` – pixels to transform; overwritten with the result.
/// * `exponent` – power applied to each channel.
#[archmage::magetypes(define(f32x8), v3, neon, wasm128, scalar)]
pub(crate) fn hlg_ootf_approx_tier(token: Token, row: &mut [[f32; 3]], exponent: f32) {
    // `by_ref` leaves `blocks` usable afterwards so the remainder can be taken.
    let mut blocks = row.chunks_exact_mut(8);
    for block in blocks.by_ref() {
        // De-interleave the eight pixels into one array per channel.
        let mut reds = [0.0_f32; 8];
        let mut greens = [0.0_f32; 8];
        let mut blues = [0.0_f32; 8];
        for (lane, &[r, g, b]) in block.iter().enumerate() {
            reds[lane] = r;
            greens[lane] = g;
            blues[lane] = b;
        }

        let pow_r = f32x8::load(token, &reds).pow_midp(exponent).to_array();
        let pow_g = f32x8::load(token, &greens).pow_midp(exponent).to_array();
        let pow_b = f32x8::load(token, &blues).pow_midp(exponent).to_array();

        // Re-interleave the results back into the pixel slice.
        for (lane, px) in block.iter_mut().enumerate() {
            *px = [pow_r[lane], pow_g[lane], pow_b[lane]];
        }
    }

    // Scalar path for the pixels that did not fill a block of eight.
    for px in blocks.into_remainder() {
        for channel in px.iter_mut() {
            *channel = crate::math::powf(*channel, exponent);
        }
    }

    // Names `hlg::hlg_ootf_approx` without calling it.
    // NOTE(review): presumably this only keeps the `hlg` import referenced — confirm.
    let _ = hlg::hlg_ootf_approx;
}