use crate::alpha_handle_f32::{premultiply_rgba_f32_row, unpremultiply_rgba_f32_row};
use crate::sse::{sse_deinterleave_rgba_ps, sse_interleave_rgba_ps};
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
/// Divides each lane of `x` by the matching lane of `a`, forcing the result to `+0.0`
/// in every lane whose alpha compares equal to zero.
///
/// Without the masking step a zero alpha would produce an infinity or NaN from the
/// division. A NaN alpha does not compare equal to zero, so its lane keeps the
/// (NaN) quotient.
///
/// Only SSE instructions are used here, so this helper does not itself depend on
/// SSE4.1 being available.
#[inline(always)]
fn sse_unpremultiply_row_f32(x: __m128, a: __m128) -> __m128 {
    // SAFETY: these intrinsics operate purely on register values (no memory access)
    // and require only SSE, which is part of the x86_64 baseline; 32-bit x86 builds
    // must target a CPU with SSE.
    unsafe {
        let alpha_is_zero = _mm_cmpeq_ps(a, _mm_setzero_ps());
        let quotient = _mm_div_ps(x, a);
        // The compare yields all-ones or all-zeros per lane, so `!mask & quotient`
        // keeps the quotient where alpha is non-zero and clears the lane to +0.0
        // elsewhere — the same selection a variable blend against zero performs.
        _mm_andnot_ps(alpha_is_zero, quotient)
    }
}
/// Un-premultiplies the RGBA `f32` samples in `in_place`, overwriting them.
///
/// This is a thin entry point that forwards to the SSE4.1-enabled row
/// implementation.
pub(crate) fn sse_unpremultiply_alpha_rgba_f32(in_place: &mut [f32]) {
    // SAFETY: the callee is compiled with `#[target_feature(enable = "sse4.1")]`.
    // NOTE(review): nothing here checks for SSE4.1 at runtime; presumably the
    // caller dispatches on CPU features before reaching this function — confirm.
    unsafe { sse_unpremultiply_alpha_rgba_f32_row_impl(in_place) }
}
/// SSE4.1 row kernel: un-premultiplies interleaved RGBA `f32` data in place.
///
/// The slice is walked in blocks of 16 floats. Each block is split into the four
/// channel vectors returned by `sse_deinterleave_rgba_ps`, the first three are
/// divided by the fourth via `sse_unpremultiply_row_f32`, and the result is
/// re-interleaved and stored back over the same block. Whatever is left after
/// the last whole block is handed to `unpremultiply_rgba_f32_row`.
#[target_feature(enable = "sse4.1")]
fn sse_unpremultiply_alpha_rgba_f32_row_impl(in_place: &mut [f32]) {
    // Floats consumed per SIMD iteration: four vectors of four lanes.
    const FLOATS_PER_BLOCK: usize = 4 * 4;

    // SAFETY: every block produced by `chunks_exact_mut` is exactly
    // `FLOATS_PER_BLOCK` floats long, so the unaligned loads/stores at offsets
    // 0, 4, 8 and 12 stay inside the block.
    unsafe {
        let mut blocks = in_place.chunks_exact_mut(FLOATS_PER_BLOCK);

        for block in blocks.by_ref() {
            let px = block.as_mut_ptr();

            let (red, green, blue, alpha) = sse_deinterleave_rgba_ps(
                _mm_loadu_ps(px),
                _mm_loadu_ps(px.add(4)),
                _mm_loadu_ps(px.add(8)),
                _mm_loadu_ps(px.add(12)),
            );

            // Alpha itself is passed through untouched.
            let (out0, out1, out2, out3) = sse_interleave_rgba_ps(
                sse_unpremultiply_row_f32(red, alpha),
                sse_unpremultiply_row_f32(green, alpha),
                sse_unpremultiply_row_f32(blue, alpha),
                alpha,
            );

            _mm_storeu_ps(px, out0);
            _mm_storeu_ps(px.add(4), out1);
            _mm_storeu_ps(px.add(8), out2);
            _mm_storeu_ps(px.add(12), out3);
        }

        // Fewer than `FLOATS_PER_BLOCK` floats remain; finish them with the row helper.
        unpremultiply_rgba_f32_row(blocks.into_remainder());
    }
}
/// Premultiplies the RGBA `f32` samples read from `src` and writes them to `dst`.
///
/// This is a thin entry point that forwards to the SSE4.1-enabled row
/// implementation.
pub(crate) fn sse_premultiply_alpha_rgba_f32(dst: &mut [f32], src: &[f32]) {
    // SAFETY: the callee is compiled with `#[target_feature(enable = "sse4.1")]`.
    // NOTE(review): nothing here checks for SSE4.1 at runtime; presumably the
    // caller dispatches on CPU features before reaching this function — confirm.
    unsafe { sse_premultiply_alpha_rgba_f32_row_impl(dst, src) }
}
/// SSE4.1 row kernel: premultiplies interleaved RGBA `f32` data from `src` into `dst`.
///
/// Both slices are walked in lock-step in blocks of 16 floats. Each source block
/// is split into the four channel vectors returned by `sse_deinterleave_rgba_ps`,
/// the first three are multiplied by the fourth, and the re-interleaved result is
/// stored into the matching destination block. The leftovers of each slice are
/// handed to `premultiply_rgba_f32_row`.
///
/// NOTE(review): the block loop stops at the shorter input, while each tail is
/// taken from the end of its own slice; callers are presumably expected to pass
/// slices of equal length — confirm.
#[target_feature(enable = "sse4.1")]
fn sse_premultiply_alpha_rgba_f32_row_impl(dst: &mut [f32], src: &[f32]) {
    // Floats consumed per SIMD iteration: four vectors of four lanes.
    const FLOATS_PER_BLOCK: usize = 4 * 4;

    // SAFETY: every block produced by `chunks_exact`/`chunks_exact_mut` is exactly
    // `FLOATS_PER_BLOCK` floats long, so the unaligned loads/stores at offsets
    // 0, 4, 8 and 12 stay inside their block.
    unsafe {
        let src_blocks = src.chunks_exact(FLOATS_PER_BLOCK);
        // The remainder is fixed when the iterator is created, so grab it before
        // the iterator is moved into `zip`.
        let src_tail = src_blocks.remainder();
        let mut dst_blocks = dst.chunks_exact_mut(FLOATS_PER_BLOCK);

        for (out_block, in_block) in dst_blocks.by_ref().zip(src_blocks) {
            let in_px = in_block.as_ptr();
            let (red, green, blue, alpha) = sse_deinterleave_rgba_ps(
                _mm_loadu_ps(in_px),
                _mm_loadu_ps(in_px.add(4)),
                _mm_loadu_ps(in_px.add(8)),
                _mm_loadu_ps(in_px.add(12)),
            );

            // Alpha itself is passed through untouched.
            let (out0, out1, out2, out3) = sse_interleave_rgba_ps(
                _mm_mul_ps(red, alpha),
                _mm_mul_ps(green, alpha),
                _mm_mul_ps(blue, alpha),
                alpha,
            );

            let out_px = out_block.as_mut_ptr();
            _mm_storeu_ps(out_px, out0);
            _mm_storeu_ps(out_px.add(4), out1);
            _mm_storeu_ps(out_px.add(8), out2);
            _mm_storeu_ps(out_px.add(12), out3);
        }

        // Fewer than `FLOATS_PER_BLOCK` floats remain per slice; finish with the row helper.
        premultiply_rgba_f32_row(dst_blocks.into_remainder(), src_tail);
    }
}