use crate::WorkloadStrategy;
use crate::alpha_handle_u8::{premultiply_alpha_rgba_row_impl, unpremultiplication_table};
use crate::wasm32::transpose::{wasm_load_deinterleave_u8x4, wasm_store_interleave_u8x4};
use crate::wasm32::utils::*;
use std::arch::wasm32::*;
/// Un-premultiplies the colour channels of an interleaved 4-byte-per-pixel buffer in place.
///
/// This is a thin entry point that forwards to the `simd128` implementation.
/// The `WorkloadStrategy` argument is accepted but not used.
pub(crate) fn wasm_unpremultiply_alpha_rgba(in_place: &mut [u8], _: WorkloadStrategy) {
    wasm_unpremultiply_alpha_rgba_impl(in_place)
}
/// Computes `pixel * 255 / alpha` for sixteen `u8` lanes.
///
/// The product is widened to 16 and then 32 bits, converted to `f32`, divided by the
/// matching alpha lane, and converted back with `u32x4_trunc_sat_f32x4` (truncation
/// toward zero, saturating). The four quotient vectors are then narrowed back down to
/// sixteen `u8` lanes with the crate's pack helpers.
///
/// Lanes whose alpha is zero are not special-cased: the float division yields
/// infinity or NaN there, and the final byte depends on how the pack helpers narrow
/// the saturated value. NOTE(review): confirm this matches the scalar table used for
/// the tail pixels.
#[inline]
#[target_feature(enable = "simd128")]
fn unpremultiply_vec(pixel: v128, alpha: v128) -> v128 {
    // `pixel * 255` as two vectors of eight u16 lanes.
    let max_value = u8x16_splat(255);
    let scaled_lo = u16x8_extmul_low_u8x16(pixel, max_value);
    let scaled_hi = u16x8_extmul_high_u8x16(pixel, max_value);

    // Alpha widened to the same 16-bit layout.
    let alpha_lo = u16x8_extend_low_u8x16(alpha);
    let alpha_hi = u16x8_extend_high_u8x16(alpha);

    // Lanes 0..4
    let quotient_0 = u32x4_trunc_sat_f32x4(f32x4_div(
        f32x4_convert_u32x4(u32x4_extend_low_u16x8(scaled_lo)),
        f32x4_convert_u32x4(u32x4_extend_low_u16x8(alpha_lo)),
    ));
    // Lanes 4..8
    let quotient_1 = u32x4_trunc_sat_f32x4(f32x4_div(
        f32x4_convert_u32x4(u32x4_extend_high_u16x8(scaled_lo)),
        f32x4_convert_u32x4(u32x4_extend_high_u16x8(alpha_lo)),
    ));
    // Lanes 8..12
    let quotient_2 = u32x4_trunc_sat_f32x4(f32x4_div(
        f32x4_convert_u32x4(u32x4_extend_low_u16x8(scaled_hi)),
        f32x4_convert_u32x4(u32x4_extend_low_u16x8(alpha_hi)),
    ));
    // Lanes 12..16
    let quotient_3 = u32x4_trunc_sat_f32x4(f32x4_div(
        f32x4_convert_u32x4(u32x4_extend_high_u16x8(scaled_hi)),
        f32x4_convert_u32x4(u32x4_extend_high_u16x8(alpha_hi)),
    ));

    // Narrow 4 x u32x4 -> 2 x u16x8 -> 1 x u8x16.
    u16x8_pack_sat_u8x16(
        u32x4_pack_trunc_u16x8(quotient_0, quotient_1),
        u32x4_pack_trunc_u16x8(quotient_2, quotient_3),
    )
}
/// Approximates a division by 255 on each of the eight `u16` lanes of `v`.
///
/// Every lane is evaluated as `(v + 127 + (v >> 8)) >> 8`, i.e. with shifts and adds
/// instead of an integer division. The additions are plain lane-wise `u16` adds; for
/// inputs up to `255 * 255 = 65025` the intermediate sum is at most `65406`, so it
/// stays within 16 bits.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) fn wasm_u16x8_div_by_255(v: v128) -> v128 {
    let bias = u16x8_splat(127);
    let high_byte = u16x8_shr(v, 8);
    let biased = u16x8_add(v, bias);
    u16x8_shr(u16x8_add(biased, high_byte), 8)
}
/// Multiplies sixteen `u8` colour lanes by their alpha lanes and scales back by 255.
///
/// The low and high eight lanes are multiplied into `u16` products, each product
/// vector goes through `wasm_u16x8_div_by_255`, and the two halves are packed back
/// into a single vector of sixteen `u8` lanes.
#[inline]
#[target_feature(enable = "simd128")]
fn premultiply_vec(pixel: v128, alpha: v128) -> v128 {
    let scaled_lo = wasm_u16x8_div_by_255(u16x8_extmul_low_u8x16(pixel, alpha));
    let scaled_hi = wasm_u16x8_div_by_255(u16x8_extmul_high_u8x16(pixel, alpha));
    u16x8_pack_sat_u8x16(scaled_lo, scaled_hi)
}
/// Un-premultiplies an interleaved 4-byte-per-pixel buffer in place using `simd128`.
///
/// Full blocks of 16 pixels (64 bytes) are de-interleaved, the first three channels
/// are divided by the fourth (alpha) channel with `unpremultiply_vec`, and the block
/// is stored back interleaved. Pixels left after the last full block are handled one
/// at a time through the lookup table returned by `unpremultiplication_table`, indexed
/// by `alpha * 255 + channel`. Trailing bytes that do not form a whole 4-byte pixel
/// are left untouched. The alpha byte itself is never modified by this function.
#[target_feature(enable = "simd128")]
fn wasm_unpremultiply_alpha_rgba_impl(in_place: &mut [u8]) {
    const PIXELS_PER_BLOCK: usize = 16;
    const CHANNELS: usize = 4;

    // Vector path: one iterator is kept so its remainder can be reused for the tail.
    let mut blocks = in_place.chunks_exact_mut(PIXELS_PER_BLOCK * CHANNELS);
    for block in blocks.by_ref() {
        let mut pixel = wasm_load_deinterleave_u8x4(block.as_ptr());
        let alpha = pixel.3;
        pixel.0 = unpremultiply_vec(pixel.0, alpha);
        pixel.1 = unpremultiply_vec(pixel.1, alpha);
        pixel.2 = unpremultiply_vec(pixel.2, alpha);
        wasm_store_interleave_u8x4(block.as_mut_ptr(), pixel);
    }

    // Scalar path for the pixels that did not fill a whole block.
    let tail = blocks.into_remainder();
    let table = unpremultiplication_table();
    for px in tail.chunks_exact_mut(CHANNELS) {
        let row = u16::from(px[3]) * 255;
        for channel in &mut px[..3] {
            *channel = table[usize::from(row + u16::from(*channel))];
        }
    }
}
/// Premultiplies the colour channels of `src` by alpha and writes the result to `dst`.
///
/// This is a thin entry point that forwards both slices unchanged to the `simd128`
/// implementation.
pub(crate) fn wasm_premultiply_alpha_rgba(dst: &mut [u8], src: &[u8]) {
    wasm_premultiply_alpha_rgba_impl(dst, src)
}
/// Premultiplies interleaved 4-byte-per-pixel data from `src` into `dst` using `simd128`.
///
/// The two slices are walked pairwise in blocks of 16 pixels (64 bytes): each source
/// block is de-interleaved, its first three channels are multiplied by the fourth
/// (alpha) channel with `premultiply_vec`, and the result is stored interleaved into
/// the matching destination block. The alpha channel is written through unchanged.
///
/// If the slices differ in length, only as many full blocks as both slices contain
/// are processed here. Whatever follows the last full 64-byte block of each slice
/// (taken from each slice independently) is handed to `premultiply_alpha_rgba_row_impl`.
#[inline]
#[target_feature(enable = "simd128")]
fn wasm_premultiply_alpha_rgba_impl(dst: &mut [u8], src: &[u8]) {
    const BLOCK_BYTES: usize = 16 * 4;

    let mut dst_blocks = dst.chunks_exact_mut(BLOCK_BYTES);
    let mut src_blocks = src.chunks_exact(BLOCK_BYTES);

    for (out_block, in_block) in dst_blocks.by_ref().zip(src_blocks.by_ref()) {
        let mut pixel = wasm_load_deinterleave_u8x4(in_block.as_ptr());
        let alpha = pixel.3;
        pixel.0 = premultiply_vec(pixel.0, alpha);
        pixel.1 = premultiply_vec(pixel.1, alpha);
        pixel.2 = premultiply_vec(pixel.2, alpha);
        wasm_store_interleave_u8x4(out_block.as_mut_ptr(), pixel);
    }

    // The remainders are fixed when the iterators are created, so they do not depend
    // on how far the loop above advanced.
    let dst_tail = dst_blocks.into_remainder();
    let src_tail = src_blocks.remainder();
    premultiply_alpha_rgba_row_impl(dst_tail, src_tail);
}