use crate::WorkloadStrategy;
use crate::sse::{sse_deinterleave_rgba, sse_interleave_rgba};
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
#[inline(always)]
pub(crate) fn _mm_select_si128(mask: __m128i, true_vals: __m128i, false_vals: __m128i) -> __m128i {
unsafe { _mm_blendv_epi8(false_vals, true_vals, mask) }
}
#[inline(always)]
pub(crate) fn _mm_div_by_255_epi16(v: __m128i) -> __m128i {
unsafe {
let addition = _mm_set1_epi16(127);
let j0 = _mm_add_epi16(v, addition);
let j1 = _mm_srli_epi16::<8>(v);
_mm_srli_epi16::<8>(_mm_add_epi16(j0, j1))
}
}
#[inline(always)]
pub(crate) fn sse_unpremultiply_row(x: __m128i, a: __m128i) -> __m128i {
unsafe {
let zeros = _mm_setzero_si128();
let lo = _mm_unpacklo_epi8(x, zeros);
let hi = _mm_unpackhi_epi8(x, zeros);
let is_zero_mask = _mm_cmpeq_epi8(a, zeros);
let scale_ps = _mm_set1_ps(255f32);
let llw = _mm_unpacklo_epi16(lo, zeros);
let lhw = _mm_unpackhi_epi16(lo, zeros);
let hlw = _mm_unpacklo_epi16(hi, zeros);
let hhw = _mm_unpackhi_epi16(hi, zeros);
let llwc = _mm_cvtepi32_ps(llw);
let lhwc = _mm_cvtepi32_ps(lhw);
let hlwc = _mm_cvtepi32_ps(hlw);
let hhwc = _mm_cvtepi32_ps(hhw);
let lo_lo = _mm_mul_ps(llwc, scale_ps);
let lo_hi = _mm_mul_ps(lhwc, scale_ps);
let hi_lo = _mm_mul_ps(hlwc, scale_ps);
let hi_hi = _mm_mul_ps(hhwc, scale_ps);
let a_lo = _mm_unpacklo_epi8(a, zeros);
let a_hi = _mm_unpackhi_epi8(a, zeros);
let allw = _mm_unpacklo_epi16(a_lo, zeros);
let alhw = _mm_unpackhi_epi16(a_lo, zeros);
let ahlw = _mm_unpacklo_epi16(a_hi, zeros);
let ahhw = _mm_unpackhi_epi16(a_hi, zeros);
let allf = _mm_cvtepi32_ps(allw);
let alhf = _mm_cvtepi32_ps(alhw);
let ahlf = _mm_cvtepi32_ps(ahlw);
let ahhf = _mm_cvtepi32_ps(ahhw);
let a_lo_lo = _mm_rcp_ps(allf);
let a_lo_hi = _mm_rcp_ps(alhf);
let a_hi_lo = _mm_rcp_ps(ahlf);
let a_hi_hi = _mm_rcp_ps(ahhf);
let fllw = _mm_mul_ps(lo_lo, a_lo_lo);
let flhw = _mm_mul_ps(lo_hi, a_lo_hi);
let fhlw = _mm_mul_ps(hi_lo, a_hi_lo);
let fhhw = _mm_mul_ps(hi_hi, a_hi_hi);
let lo_lo = _mm_cvtps_epi32(fllw);
let lo_hi = _mm_cvtps_epi32(flhw);
let hi_lo = _mm_cvtps_epi32(fhlw);
let hi_hi = _mm_cvtps_epi32(fhhw);
let lo = _mm_packs_epi32(lo_lo, lo_hi);
let hi = _mm_packs_epi32(hi_lo, hi_hi);
_mm_select_si128(is_zero_mask, _mm_setzero_si128(), _mm_packus_epi16(lo, hi))
}
}
pub(crate) fn sse_premultiply_alpha_rgba(dst: &mut [u8], src: &[u8]) {
unsafe {
sse_premultiply_alpha_rgba_impl(dst, src);
}
}
trait Sse41PremultiplyExecutorRgba8 {
unsafe fn premultiply(&self, dst: &mut [u8], src: &[u8]);
}
#[derive(Default)]
struct Sse41PremultiplyExecutor8Default {}
impl Sse41PremultiplyExecutor8Default {
#[inline(always)]
fn premultiply_chunk(&self, dst: &mut [u8], src: &[u8]) {
unsafe {
let zeros = _mm_setzero_si128();
let src_ptr = src.as_ptr();
let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i);
let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i);
let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i);
let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i);
let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3);
let mut rrr_low = _mm_unpacklo_epi8(rrr, zeros);
let mut rrr_high = _mm_unpackhi_epi8(rrr, zeros);
let mut ggg_low = _mm_unpacklo_epi8(ggg, zeros);
let mut ggg_high = _mm_unpackhi_epi8(ggg, zeros);
let mut bbb_low = _mm_unpacklo_epi8(bbb, zeros);
let mut bbb_high = _mm_unpackhi_epi8(bbb, zeros);
let aaa_low = _mm_unpacklo_epi8(aaa, zeros);
let aaa_high = _mm_unpackhi_epi8(aaa, zeros);
rrr_low = _mm_mullo_epi16(rrr_low, aaa_low);
rrr_high = _mm_mullo_epi16(rrr_high, aaa_high);
ggg_low = _mm_mullo_epi16(ggg_low, aaa_low);
ggg_high = _mm_mullo_epi16(ggg_high, aaa_high);
bbb_low = _mm_mullo_epi16(bbb_low, aaa_low);
bbb_high = _mm_mullo_epi16(bbb_high, aaa_high);
rrr_low = _mm_div_by_255_epi16(rrr_low);
rrr_high = _mm_div_by_255_epi16(rrr_high);
ggg_low = _mm_div_by_255_epi16(ggg_low);
ggg_high = _mm_div_by_255_epi16(ggg_high);
bbb_low = _mm_div_by_255_epi16(bbb_low);
bbb_high = _mm_div_by_255_epi16(bbb_high);
let rrr = _mm_packus_epi16(rrr_low, rrr_high);
let ggg = _mm_packus_epi16(ggg_low, ggg_high);
let bbb = _mm_packus_epi16(bbb_low, bbb_high);
let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa);
let dst_ptr = dst.as_mut_ptr();
_mm_storeu_si128(dst_ptr as *mut __m128i, rgba0);
_mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1);
_mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2);
_mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3);
}
}
}
impl Sse41PremultiplyExecutorRgba8 for Sse41PremultiplyExecutor8Default {
#[target_feature(enable = "sse4.1")]
unsafe fn premultiply(&self, dst: &mut [u8], src: &[u8]) {
unsafe {
let mut rem = dst;
let mut src_rem = src;
for (dst, src) in rem
.chunks_exact_mut(16 * 4)
.zip(src_rem.chunks_exact(16 * 4))
{
self.premultiply_chunk(dst, src);
}
rem = rem.chunks_exact_mut(16 * 4).into_remainder();
src_rem = src_rem.chunks_exact(16 * 4).remainder();
if !rem.is_empty() {
const PART_SIZE: usize = 16 * 4;
assert!(src_rem.len() < PART_SIZE);
assert!(rem.len() < PART_SIZE);
assert_eq!(src_rem.len(), rem.len());
let mut buffer: [u8; PART_SIZE] = [0u8; PART_SIZE];
let mut dst_buffer: [u8; PART_SIZE] = [0u8; PART_SIZE];
std::ptr::copy_nonoverlapping(src_rem.as_ptr(), buffer.as_mut_ptr(), src_rem.len());
self.premultiply_chunk(&mut dst_buffer, &buffer);
std::ptr::copy_nonoverlapping(dst_buffer.as_ptr(), rem.as_mut_ptr(), rem.len());
}
}
}
}
#[target_feature(enable = "sse4.1")]
fn sse_premultiply_alpha_rgba_impl_row(
dst: &mut [u8],
src: &[u8],
executor: impl Sse41PremultiplyExecutorRgba8,
) {
unsafe {
executor.premultiply(dst, src);
}
}
#[inline]
#[target_feature(enable = "sse4.1")]
fn sse_premultiply_alpha_rgba_impl(dst: &mut [u8], src: &[u8]) {
sse_premultiply_alpha_rgba_impl_row(dst, src, Sse41PremultiplyExecutor8Default::default());
}
pub(crate) fn sse_unpremultiply_alpha_rgba(in_place: &mut [u8], _: WorkloadStrategy) {
unsafe {
sse_unpremultiply_alpha_rgba_impl(in_place);
}
}
trait DisassociateAlpha {
unsafe fn disassociate(&self, in_place: &mut [u8]);
}
#[derive(Default)]
struct DisassociateAlphaDefault {}
impl DisassociateAlphaDefault {
#[inline(always)]
fn disassociate_chunk(&self, in_place: &mut [u8]) {
unsafe {
let src_ptr = in_place.as_ptr();
let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i);
let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i);
let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i);
let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i);
let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3);
let rrr = sse_unpremultiply_row(rrr, aaa);
let ggg = sse_unpremultiply_row(ggg, aaa);
let bbb = sse_unpremultiply_row(bbb, aaa);
let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa);
let dst_ptr = in_place.as_mut_ptr();
_mm_storeu_si128(dst_ptr as *mut __m128i, rgba0);
_mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1);
_mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2);
_mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3);
}
}
}
impl DisassociateAlpha for DisassociateAlphaDefault {
#[target_feature(enable = "sse4.1")]
unsafe fn disassociate(&self, in_place: &mut [u8]) {
unsafe {
let mut rem = in_place;
for dst in rem.chunks_exact_mut(16 * 4) {
self.disassociate_chunk(dst);
}
rem = rem.chunks_exact_mut(16 * 4).into_remainder();
if !rem.is_empty() {
const PART_SIZE: usize = 16 * 4;
assert!(rem.len() < PART_SIZE);
let mut buffer: [u8; PART_SIZE] = [0u8; PART_SIZE];
std::ptr::copy_nonoverlapping(rem.as_ptr(), buffer.as_mut_ptr(), rem.len());
self.disassociate_chunk(&mut buffer);
std::ptr::copy_nonoverlapping(buffer.as_ptr(), rem.as_mut_ptr(), rem.len());
}
}
}
}
#[target_feature(enable = "sse4.1")]
fn sse_unpremultiply_alpha_rgba_impl_row(in_place: &mut [u8], executor: impl DisassociateAlpha) {
unsafe {
executor.disassociate(in_place);
}
}
#[target_feature(enable = "sse4.1")]
fn sse_unpremultiply_alpha_rgba_impl(in_place: &mut [u8]) {
sse_unpremultiply_alpha_rgba_impl_row(in_place, DisassociateAlphaDefault::default());
}