use crate::sse::alpha_u8::_mm_select_si128;
use crate::sse::{sse_deinterleave_rgba_epi16, sse_interleave_rgba_epi16};
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
/// Scales eight unsigned 16-bit channel values by per-lane float factors.
///
/// * `x` - eight `u16` channel values.
/// * `is_zero_mask` - 16-bit lane mask; lanes selected by it keep the original
///   value from `x` (NOTE(review): this relies on `_mm_select_si128(mask, a, b)`
///   returning `a` where the mask is set — confirm against its definition).
/// * `a_lo_f` / `a_hi_f` - multipliers for lanes 0..4 and 4..8 respectively.
///
/// Returns the rounded products packed back to `u16` with unsigned saturation.
#[inline(always)]
fn sse_unpremultiply_row_u16(
    x: __m128i,
    is_zero_mask: __m128i,
    a_lo_f: __m128,
    a_hi_f: __m128,
) -> __m128i {
    unsafe {
        let zeros = _mm_setzero_si128();
        // Clamp in the float domain: a product above `i32::MAX` would otherwise
        // convert to the "integer indefinite" value (0x8000_0000) and pack to 0
        // instead of saturating. `_mm_min_ps` also maps NaN to the limit.
        let u16_limit = _mm_set1_ps(u16::MAX as f32);
        // Zero-extend the u16 lanes to 32 bits so they convert to f32 as unsigned.
        let lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x, zeros));
        let hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x, zeros));
        // `_mm_cvtps_epi32` already rounds to nearest under the default MXCSR
        // mode, so no extra 0.5 bias is added (that would round exact values up).
        let new_lo = _mm_cvtps_epi32(_mm_min_ps(_mm_mul_ps(lo, a_lo_f), u16_limit));
        let new_hi = _mm_cvtps_epi32(_mm_min_ps(_mm_mul_ps(hi, a_hi_f), u16_limit));
        // Unsigned saturation: the signed pack would clamp everything above
        // 32767, corrupting 16-bit content.
        let pixel = _mm_packus_epi32(new_lo, new_hi);
        _mm_select_si128(is_zero_mask, x, pixel)
    }
}
/// Shift-based approximation of a rounded division by 1023 (`2^10 - 1`),
/// applied to every 32-bit lane of `v`.
///
/// Computes `(t + (t >> 10)) >> 10` with `t = v + 512`, using logical
/// (unsigned) shifts.
#[inline(always)]
pub(crate) fn _mm_div_by_1023_epi32(v: __m128i) -> __m128i {
    const SHIFT: i32 = 10;
    const HALF: i32 = 1 << (SHIFT - 1);
    unsafe {
        let biased = _mm_add_epi32(v, _mm_set1_epi32(HALF));
        let folded = _mm_add_epi32(biased, _mm_srli_epi32::<SHIFT>(biased));
        _mm_srli_epi32::<SHIFT>(folded)
    }
}
/// Shift-based approximation of a rounded division by 4095 (`2^12 - 1`),
/// applied to every 32-bit lane of `v`.
///
/// Computes `(t + (t >> 12)) >> 12` with `t = v + 2048`, using logical
/// (unsigned) shifts.
#[inline(always)]
pub(crate) fn _mm_div_by_4095_epi32(v: __m128i) -> __m128i {
    const SHIFT: i32 = 12;
    const HALF: i32 = 1 << (SHIFT - 1);
    unsafe {
        let biased = _mm_add_epi32(v, _mm_set1_epi32(HALF));
        let folded = _mm_add_epi32(biased, _mm_srli_epi32::<SHIFT>(biased));
        _mm_srli_epi32::<SHIFT>(folded)
    }
}
/// Shift-based approximation of a rounded division by 65535 (`2^16 - 1`),
/// applied to every 32-bit lane of `v`.
///
/// Computes `(t + (t >> 16)) >> 16` with `t = v + 32768`, using logical
/// (unsigned) shifts, so lanes are effectively treated as unsigned.
#[inline(always)]
pub(crate) fn _mm_div_by_65535_epi32(v: __m128i) -> __m128i {
    const SHIFT: i32 = 16;
    const HALF: i32 = 1 << (SHIFT - 1);
    unsafe {
        let biased = _mm_add_epi32(v, _mm_set1_epi32(HALF));
        let folded = _mm_add_epi32(biased, _mm_srli_epi32::<SHIFT>(biased));
        _mm_srli_epi32::<SHIFT>(folded)
    }
}
/// Picks the fixed-point divider for `BIT_DEPTH` at compile time.
///
/// Depth 10 divides by 1023, depth 12 by 4095; every other depth falls back
/// to the 65535 divider.
#[inline(always)]
fn _mm_div_by<const BIT_DEPTH: usize>(v: __m128i) -> __m128i {
    match BIT_DEPTH {
        10 => _mm_div_by_1023_epi32(v),
        12 => _mm_div_by_4095_epi32(v),
        _ => _mm_div_by_65535_epi32(v),
    }
}
/// Un-premultiplies the alpha of an interleaved 16-bit RGBA row in place.
///
/// * `in_place` - row data that is read and overwritten.
/// * `bit_depth` - number of significant bits per channel.
///
/// Forwards to the SSE4.1 row implementation with the default executor.
pub(crate) fn unpremultiply_alpha_sse_rgba_u16(in_place: &mut [u16], bit_depth: usize) {
    let executor = DisassociateAlphaDefault::default();
    // NOTE(review): the callee is compiled with the `sse4.1` target feature and
    // no runtime check happens here; callers are presumably expected to have
    // verified CPU support beforehand — confirm.
    unsafe {
        unpremultiply_alpha_sse_rgba_u16_row_impl(in_place, bit_depth, executor);
    }
}
/// Strategy interface for the in-place alpha un-premultiplication that
/// `unpremultiply_alpha_sse_rgba_u16_row_impl` delegates to.
trait DisassociateAlpha {
/// Processes `in_place` for the given `bit_depth`, overwriting its contents.
///
/// # Safety
///
/// The implementation in this file is compiled with the `sse4.1` target
/// feature, so the caller must ensure the running CPU supports it.
unsafe fn disassociate(&self, in_place: &mut [u16], bit_depth: usize);
}
/// Stateless `DisassociateAlpha` implementation that works on chunks of
/// 32 `u16` values (8 RGBA pixels) at a time.
#[derive(Default)]
struct DisassociateAlphaDefault {}
impl DisassociateAlphaDefault {
    /// Un-premultiplies one chunk of 8 interleaved RGBA pixels (32 `u16`s) in place.
    ///
    /// * `in_place` - at least 32 elements; exactly the first 32 are read and rewritten.
    /// * `v_max_colors` - `(1 << bit_depth) - 1` broadcast as `f32`.
    /// * `bit_depth` - number of significant bits per channel.
    ///
    /// Each colour lane becomes `colour * max / alpha`, rounded and clamped to
    /// `max`; lanes whose alpha is zero are routed through the zero mask so that
    /// the division-by-zero result is not used. Alpha itself is written back
    /// unchanged.
    #[inline(always)]
    fn disassociate_chunk(&self, in_place: &mut [u16], v_max_colors: __m128, bit_depth: usize) {
        // The raw loads/stores below touch 32 elements unconditionally.
        debug_assert!(in_place.len() >= 8 * 4);
        unsafe {
            let zeros = _mm_setzero_si128();
            let src_ptr = in_place.as_ptr();
            let max_colors = (1u32 << bit_depth) - 1;
            // For 16-bit depth the cast yields 0xFFFF, which is still the right
            // bit pattern for the unsigned `_mm_min_epu16` clamp below.
            let v_max_test = _mm_set1_epi16(max_colors as i16);
            let row0 = _mm_loadu_si128(src_ptr as *const __m128i);
            let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i);
            let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i);
            let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i);
            // NOTE(review): presumably splits the four interleaved vectors into
            // per-channel vectors of 8 lanes each — confirm against the helper.
            let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3);
            let is_zero_mask = _mm_cmpeq_epi16(aaaa, zeros);
            // Exact division instead of `_mm_rcp_ps`: the reciprocal estimate is
            // only ~12 bits accurate (more than 1 LSB of error for 12/16-bit
            // content) and its result differs between CPU implementations.
            let a_lo_f = _mm_div_ps(
                v_max_colors,
                _mm_cvtepi32_ps(_mm_unpacklo_epi16(aaaa, zeros)),
            );
            let a_hi_f = _mm_div_ps(
                v_max_colors,
                _mm_cvtepi32_ps(_mm_unpackhi_epi16(aaaa, zeros)),
            );
            let mut new_rrrr = sse_unpremultiply_row_u16(rrrr, is_zero_mask, a_lo_f, a_hi_f);
            let mut new_gggg = sse_unpremultiply_row_u16(gggg, is_zero_mask, a_lo_f, a_hi_f);
            let mut new_bbbb = sse_unpremultiply_row_u16(bbbb, is_zero_mask, a_lo_f, a_hi_f);
            // Colour may exceed alpha in malformed input; clamp to the valid range.
            new_rrrr = _mm_min_epu16(new_rrrr, v_max_test);
            new_gggg = _mm_min_epu16(new_gggg, v_max_test);
            new_bbbb = _mm_min_epu16(new_bbbb, v_max_test);
            let (rgba0, rgba1, rgba2, rgba3) =
                sse_interleave_rgba_epi16(new_rrrr, new_gggg, new_bbbb, aaaa);
            let dst_ptr = in_place.as_mut_ptr();
            _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0);
            _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, rgba1);
            _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba2);
            _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, rgba3);
        }
    }
}
impl DisassociateAlpha for DisassociateAlphaDefault {
    /// Un-premultiplies the whole row in place.
    ///
    /// Full 32-element chunks are processed directly. A shorter tail is copied
    /// into a zero-padded stack buffer, processed there, and only the original
    /// number of elements is copied back.
    ///
    /// # Safety
    ///
    /// Compiled with `sse4.1`; the caller must ensure the CPU supports it.
    #[target_feature(enable = "sse4.1")]
    unsafe fn disassociate(&self, in_place: &mut [u16], bit_depth: usize) {
        const CHUNK: usize = 8 * 4;
        unsafe {
            let max_colors = (1 << bit_depth) - 1;
            let v_max_colors = _mm_set1_ps(max_colors as f32);

            let mut chunks = in_place.chunks_exact_mut(CHUNK);
            for chunk in chunks.by_ref() {
                self.disassociate_chunk(chunk, v_max_colors, bit_depth);
            }

            let tail = chunks.into_remainder();
            if tail.is_empty() {
                return;
            }
            assert!(tail.len() < CHUNK);
            // Zero padding gives the unused lanes alpha == 0.
            let mut scratch = [0u16; CHUNK];
            std::ptr::copy_nonoverlapping(tail.as_ptr(), scratch.as_mut_ptr(), tail.len());
            self.disassociate_chunk(&mut scratch, v_max_colors, bit_depth);
            std::ptr::copy_nonoverlapping(scratch.as_ptr(), tail.as_mut_ptr(), tail.len());
        }
    }
}
/// SSE4.1-enabled entry point that hands the row to `executor`.
///
/// * `in_place` - row data, passed through to the executor unchanged.
/// * `bit_depth` - passed through to the executor unchanged.
/// * `executor` - the `DisassociateAlpha` strategy to run.
#[target_feature(enable = "sse4.1")]
fn unpremultiply_alpha_sse_rgba_u16_row_impl(
in_place: &mut [u16],
bit_depth: usize,
executor: impl DisassociateAlpha,
) {
// The trait method is declared `unsafe`, hence the block; this function
// itself is already compiled with `sse4.1`.
unsafe {
executor.disassociate(in_place, bit_depth);
}
}
/// Multiplies eight unsigned 16-bit channel values by their alpha and a scale.
///
/// * `x` - eight `u16` channel values.
/// * `a_lo_f` / `a_hi_f` - alpha as `f32` for lanes 0..4 and 4..8 respectively.
/// * `v_max_colors_scale` - factor applied to the channel value before the
///   alpha multiplication (the caller in this file passes `1 / max_value`).
///
/// Returns `round(x * scale * alpha)` per lane, packed to `u16` with unsigned
/// saturation.
#[inline(always)]
fn sse_premultiply_row_u16(
    x: __m128i,
    a_lo_f: __m128,
    a_hi_f: __m128,
    v_max_colors_scale: __m128,
) -> __m128i {
    unsafe {
        let zeros = _mm_setzero_si128();
        // Zero-extend so the u16 lanes convert to f32 as unsigned values.
        let lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x, zeros));
        let hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x, zeros));
        // `_mm_cvtps_epi32` rounds to nearest under the default MXCSR mode.
        let new_lo = _mm_cvtps_epi32(_mm_mul_ps(_mm_mul_ps(lo, v_max_colors_scale), a_lo_f));
        let new_hi = _mm_cvtps_epi32(_mm_mul_ps(_mm_mul_ps(hi, v_max_colors_scale), a_hi_f));
        // Unsigned saturation: the signed pack (`_mm_packs_epi32`) clamps every
        // result above 32767, which breaks 16-bit content.
        _mm_packus_epi32(new_lo, new_hi)
    }
}
/// Premultiplies the colour channels of an interleaved 16-bit RGBA row by alpha.
///
/// * `dst` - output row.
/// * `src` - input row.
/// * `bit_depth` - number of significant bits per channel; selects the
///   implementation used by the row routine.
pub(crate) fn premultiply_alpha_sse_rgba_u16(dst: &mut [u16], src: &[u16], bit_depth: usize) {
// NOTE(review): the callee is compiled with the `sse4.1` target feature and
// no runtime check happens here; callers are presumably expected to have
// verified CPU support beforehand — confirm.
unsafe {
premultiply_alpha_sse_rgba_u16_row_impl(dst, src, bit_depth);
}
}
/// Strategy interface for the row premultiplication that
/// `pma_sse41_rgba16_dispatch` delegates to.
trait Sse41PremultiplyExecutor {
/// Reads `src` and writes the premultiplied result to `dst`.
///
/// # Safety
///
/// The implementations in this file are compiled with the `sse4.1` target
/// feature, so the caller must ensure the running CPU supports it.
unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize);
}
/// Integer premultiply executor specialised on `BIT_DEPTH` at compile time.
/// The dispatcher in this file instantiates it for depths 10 and 12 only.
#[derive(Default)]
struct Sse41PremultiplyExecutorDefault<const BIT_DEPTH: usize> {}
impl<const BIT_DEPTH: usize> Sse41PremultiplyExecutorDefault<BIT_DEPTH> {
    /// Premultiplies one chunk of 8 interleaved RGBA pixels using integer math.
    ///
    /// Reads 32 `u16`s from `src` and writes 32 `u16`s to `dst`; both slices
    /// must hold at least that many elements. Each colour lane becomes
    /// `colour * alpha` passed through `_mm_div_by::<BIT_DEPTH>`; alpha is
    /// copied through unchanged.
    #[inline]
    #[target_feature(enable = "sse4.1")]
    fn premultiply_chunk(&self, dst: &mut [u16], src: &[u16]) {
        unsafe {
            let zero = _mm_setzero_si128();

            let input = src.as_ptr() as *const __m128i;
            // NOTE(review): presumably splits the four interleaved vectors into
            // per-channel vectors of 8 lanes each — confirm against the helper.
            let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(
                _mm_loadu_si128(input),
                _mm_loadu_si128(input.add(1)),
                _mm_loadu_si128(input.add(2)),
                _mm_loadu_si128(input.add(3)),
            );

            // Zero-extended alpha: every 32-bit lane holds (alpha, 0) as an i16
            // pair, so `_mm_madd_epi16` yields colour * alpha + 0 * 0.
            let alpha_lo = _mm_unpacklo_epi16(aaaa, zero);
            let alpha_hi = _mm_unpackhi_epi16(aaaa, zero);

            let r_lo = _mm_madd_epi16(_mm_unpacklo_epi16(rrrr, zero), alpha_lo);
            let r_hi = _mm_madd_epi16(_mm_unpackhi_epi16(rrrr, zero), alpha_hi);
            let out_r =
                _mm_packus_epi32(_mm_div_by::<BIT_DEPTH>(r_lo), _mm_div_by::<BIT_DEPTH>(r_hi));

            let g_lo = _mm_madd_epi16(_mm_unpacklo_epi16(gggg, zero), alpha_lo);
            let g_hi = _mm_madd_epi16(_mm_unpackhi_epi16(gggg, zero), alpha_hi);
            let out_g =
                _mm_packus_epi32(_mm_div_by::<BIT_DEPTH>(g_lo), _mm_div_by::<BIT_DEPTH>(g_hi));

            let b_lo = _mm_madd_epi16(_mm_unpacklo_epi16(bbbb, zero), alpha_lo);
            let b_hi = _mm_madd_epi16(_mm_unpackhi_epi16(bbbb, zero), alpha_hi);
            let out_b =
                _mm_packus_epi32(_mm_div_by::<BIT_DEPTH>(b_lo), _mm_div_by::<BIT_DEPTH>(b_hi));

            let (px0, px1, px2, px3) = sse_interleave_rgba_epi16(out_r, out_g, out_b, aaaa);

            let output = dst.as_mut_ptr() as *mut __m128i;
            _mm_storeu_si128(output, px0);
            _mm_storeu_si128(output.add(1), px1);
            _mm_storeu_si128(output.add(2), px2);
            _mm_storeu_si128(output.add(3), px3);
        }
    }
}
impl<const BIT_DEPTH: usize> Sse41PremultiplyExecutor
    for Sse41PremultiplyExecutorDefault<BIT_DEPTH>
{
    /// Premultiplies `src` into `dst` with the integer kernel.
    ///
    /// Full 32-element chunks are paired up and processed directly. When `dst`
    /// has a shorter tail, the `src` tail is staged through zero-padded stack
    /// buffers and only the tail length is written back. The runtime
    /// `bit_depth` argument is ignored; `BIT_DEPTH` is used instead.
    ///
    /// # Panics
    ///
    /// Panics if `dst` has a non-empty tail whose length differs from the
    /// `src` tail.
    ///
    /// # Safety
    ///
    /// Compiled with `sse4.1`; the caller must ensure the CPU supports it.
    #[target_feature(enable = "sse4.1")]
    unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], _: usize) {
        const CHUNK: usize = 8 * 4;
        unsafe {
            let tail_start = dst.len() - dst.len() % CHUNK;
            let (dst_body, dst_tail) = dst.split_at_mut(tail_start);
            let src_chunks = src.chunks_exact(CHUNK);
            let src_tail = src_chunks.remainder();

            for (out_chunk, in_chunk) in dst_body.chunks_exact_mut(CHUNK).zip(src_chunks) {
                self.premultiply_chunk(out_chunk, in_chunk);
            }

            if dst_tail.is_empty() {
                return;
            }
            assert!(src_tail.len() < CHUNK);
            assert!(dst_tail.len() < CHUNK);
            assert_eq!(src_tail.len(), dst_tail.len());

            let mut staged_src = [0u16; CHUNK];
            let mut staged_dst = [0u16; CHUNK];
            std::ptr::copy_nonoverlapping(
                src_tail.as_ptr(),
                staged_src.as_mut_ptr(),
                src_tail.len(),
            );
            self.premultiply_chunk(&mut staged_dst, &staged_src);
            std::ptr::copy_nonoverlapping(
                staged_dst.as_ptr(),
                dst_tail.as_mut_ptr(),
                dst_tail.len(),
            );
        }
    }
}
/// Floating-point premultiply executor that takes the bit depth at run time.
/// The dispatcher in this file uses it for every depth other than 10 and 12.
#[derive(Default)]
struct Sse41PremultiplyExecutorAny {}
impl Sse41PremultiplyExecutorAny {
    /// Premultiplies one chunk of 8 interleaved RGBA pixels using float math.
    ///
    /// Reads 32 `u16`s from `src` and writes 32 `u16`s to `dst`; both slices
    /// must hold at least that many elements. `scale` is forwarded to
    /// `sse_premultiply_row_u16` for each colour channel; alpha is copied
    /// through unchanged.
    #[inline(always)]
    fn premultiply_chunk(&self, dst: &mut [u16], src: &[u16], scale: __m128) {
        unsafe {
            let zero = _mm_setzero_si128();

            let input = src.as_ptr() as *const __m128i;
            // NOTE(review): presumably splits the four interleaved vectors into
            // per-channel vectors of 8 lanes each — confirm against the helper.
            let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(
                _mm_loadu_si128(input),
                _mm_loadu_si128(input.add(1)),
                _mm_loadu_si128(input.add(2)),
                _mm_loadu_si128(input.add(3)),
            );

            // Alpha widened to 32 bits (zero-extended) and converted to f32.
            let alpha_lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(aaaa, zero));
            let alpha_hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(aaaa, zero));

            let out_r = sse_premultiply_row_u16(rrrr, alpha_lo, alpha_hi, scale);
            let out_g = sse_premultiply_row_u16(gggg, alpha_lo, alpha_hi, scale);
            let out_b = sse_premultiply_row_u16(bbbb, alpha_lo, alpha_hi, scale);

            let (px0, px1, px2, px3) = sse_interleave_rgba_epi16(out_r, out_g, out_b, aaaa);

            let output = dst.as_mut_ptr() as *mut __m128i;
            _mm_storeu_si128(output, px0);
            _mm_storeu_si128(output.add(1), px1);
            _mm_storeu_si128(output.add(2), px2);
            _mm_storeu_si128(output.add(3), px3);
        }
    }
}
impl Sse41PremultiplyExecutor for Sse41PremultiplyExecutorAny {
    /// Premultiplies `src` into `dst` with the float kernel.
    ///
    /// The scale factor is `1 / ((1 << bit_depth) - 1)`. Full 32-element chunks
    /// are paired up and processed directly. When `dst` has a shorter tail, the
    /// `src` tail is staged through zero-padded stack buffers and only the tail
    /// length is written back.
    ///
    /// # Panics
    ///
    /// Panics if `dst` has a non-empty tail whose length differs from the
    /// `src` tail.
    ///
    /// # Safety
    ///
    /// Compiled with `sse4.1`; the caller must ensure the CPU supports it.
    #[target_feature(enable = "sse4.1")]
    unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) {
        const CHUNK: usize = 8 * 4;
        unsafe {
            let max_colors = (1 << bit_depth) - 1;
            let v_max = _mm_cvtepi32_ps(_mm_set1_epi32(max_colors));
            let inv_max = _mm_div_ps(_mm_set1_ps(1.), v_max);

            let tail_start = dst.len() - dst.len() % CHUNK;
            let (dst_body, dst_tail) = dst.split_at_mut(tail_start);
            let src_chunks = src.chunks_exact(CHUNK);
            let src_tail = src_chunks.remainder();

            for (out_chunk, in_chunk) in dst_body.chunks_exact_mut(CHUNK).zip(src_chunks) {
                self.premultiply_chunk(out_chunk, in_chunk, inv_max);
            }

            if dst_tail.is_empty() {
                return;
            }
            assert!(src_tail.len() < CHUNK);
            assert!(dst_tail.len() < CHUNK);
            assert_eq!(src_tail.len(), dst_tail.len());

            let mut staged_src = [0u16; CHUNK];
            let mut staged_dst = [0u16; CHUNK];
            std::ptr::copy_nonoverlapping(
                src_tail.as_ptr(),
                staged_src.as_mut_ptr(),
                src_tail.len(),
            );
            self.premultiply_chunk(&mut staged_dst, &staged_src, inv_max);
            std::ptr::copy_nonoverlapping(
                staged_dst.as_ptr(),
                dst_tail.as_mut_ptr(),
                dst_tail.len(),
            );
        }
    }
}
/// SSE4.1-enabled trampoline that runs `executor` on the given row.
///
/// * `dst` / `src` - output and input rows, passed through unchanged.
/// * `bit_depth` - passed through unchanged.
/// * `executor` - the `Sse41PremultiplyExecutor` strategy to run.
#[target_feature(enable = "sse4.1")]
fn pma_sse41_rgba16_dispatch(
dst: &mut [u16],
src: &[u16],
bit_depth: usize,
executor: impl Sse41PremultiplyExecutor,
) {
// The trait method is declared `unsafe`, hence the block; this function
// itself is already compiled with `sse4.1`.
unsafe {
executor.premultiply(dst, src, bit_depth);
}
}
/// Chooses the premultiply executor for `bit_depth` and runs it on the row.
///
/// Depths 10 and 12 use the integer executor specialised for that depth;
/// every other depth goes through the float-based executor.
#[target_feature(enable = "sse4.1")]
fn premultiply_alpha_sse_rgba_u16_row_impl(dst: &mut [u16], src: &[u16], bit_depth: usize) {
    match bit_depth {
        10 => {
            let executor = Sse41PremultiplyExecutorDefault::<10>::default();
            pma_sse41_rgba16_dispatch(dst, src, bit_depth, executor)
        }
        12 => {
            let executor = Sse41PremultiplyExecutorDefault::<12>::default();
            pma_sse41_rgba16_dispatch(dst, src, bit_depth, executor)
        }
        _ => {
            let executor = Sse41PremultiplyExecutorAny::default();
            pma_sse41_rgba16_dispatch(dst, src, bit_depth, executor)
        }
    }
}