use crate::alpha_handle_f16::{premultiply_pixel_f16_row, unpremultiply_pixel_f16_row};
use crate::sse::f16_utils::{_mm_cvtph_psx, _mm_cvtps_phx};
use crate::sse::{sse_deinterleave_rgba_epi16, sse_interleave_rgba_epi16};
use core::f16;
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
pub(crate) fn sse_premultiply_alpha_rgba_f16(dst: &mut [f16], src: &[f16]) {
unsafe {
sse_premultiply_alpha_rgba_f16_regular(dst, src);
}
}
#[target_feature(enable = "sse4.1")]
fn sse_premultiply_alpha_rgba_f16_regular(dst: &mut [f16], src: &[f16]) {
sse_premultiply_alpha_rgba_row_f16_impl::<false>(dst, src);
}
#[inline(always)]
fn sse_premultiply_alpha_rgba_row_f16_impl<const F16C: bool>(dst: &mut [f16], src: &[f16]) {
unsafe {
let mut rem = dst;
let mut src_rem = src;
for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) {
let src_ptr = src.as_ptr();
let lane0 = _mm_loadu_si128(src_ptr as *const __m128i);
let lane1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i);
let lane2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i);
let lane3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i);
let pixel = sse_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3);
let low_alpha = _mm_cvtph_psx::<F16C>(pixel.3);
let low_r = _mm_mul_ps(_mm_cvtph_psx::<F16C>(pixel.0), low_alpha);
let low_g = _mm_mul_ps(_mm_cvtph_psx::<F16C>(pixel.1), low_alpha);
let low_b = _mm_mul_ps(_mm_cvtph_psx::<F16C>(pixel.2), low_alpha);
let high_alpha = _mm_cvtph_psx::<F16C>(_mm_srli_si128::<8>(pixel.3));
let high_r = _mm_mul_ps(
_mm_cvtph_psx::<F16C>(_mm_srli_si128::<8>(pixel.0)),
high_alpha,
);
let high_g = _mm_mul_ps(
_mm_cvtph_psx::<F16C>(_mm_srli_si128::<8>(pixel.1)),
high_alpha,
);
let high_b = _mm_mul_ps(
_mm_cvtph_psx::<F16C>(_mm_srli_si128::<8>(pixel.2)),
high_alpha,
);
let r_values =
_mm_unpacklo_epi64(_mm_cvtps_phx::<F16C>(low_r), _mm_cvtps_phx::<F16C>(high_r));
let g_values =
_mm_unpacklo_epi64(_mm_cvtps_phx::<F16C>(low_g), _mm_cvtps_phx::<F16C>(high_g));
let b_values =
_mm_unpacklo_epi64(_mm_cvtps_phx::<F16C>(low_b), _mm_cvtps_phx::<F16C>(high_b));
let dst_ptr = dst.as_mut_ptr();
let (d_lane0, d_lane1, d_lane2, d_lane3) =
sse_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3);
_mm_storeu_si128(dst_ptr as *mut __m128i, d_lane0);
_mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, d_lane1);
_mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, d_lane2);
_mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, d_lane3);
}
rem = rem.chunks_exact_mut(8 * 4).into_remainder();
src_rem = src_rem.chunks_exact(8 * 4).remainder();
premultiply_pixel_f16_row(rem, src_rem);
}
}
pub(crate) fn sse_unpremultiply_alpha_rgba_f16(in_place: &mut [f16]) {
unsafe {
if is_x86_feature_detected!("f16c") {
sse_unpremultiply_alpha_rgba_f16_regular(in_place);
} else {
sse_unpremultiply_alpha_rgba_f16c(in_place);
}
}
}
#[target_feature(enable = "sse4.1")]
fn sse_unpremultiply_alpha_rgba_f16_regular(in_place: &mut [f16]) {
sse_unpremultiply_alpha_rgba_f16_row_impl::<false>(in_place);
}
#[target_feature(enable = "sse4.1", enable = "f16c")]
fn sse_unpremultiply_alpha_rgba_f16c(in_place: &mut [f16]) {
sse_unpremultiply_alpha_rgba_f16_row_impl::<true>(in_place);
}
#[inline(always)]
fn sse_unpremultiply_alpha_rgba_f16_row_impl<const F16C: bool>(in_place: &mut [f16]) {
unsafe {
let mut rem = in_place;
for dst in rem.chunks_exact_mut(8 * 4) {
let src_ptr = dst.as_ptr();
let lane0 = _mm_loadu_si128(src_ptr as *const __m128i);
let lane1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i);
let lane2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i);
let lane3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i);
let pixel = sse_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3);
let low_alpha = _mm_cvtph_psx::<F16C>(pixel.3);
let zeros = _mm_setzero_ps();
let low_alpha_zero_mask = _mm_cmpeq_ps(low_alpha, zeros);
let low_r = _mm_blendv_ps(
_mm_mul_ps(_mm_cvtph_psx::<F16C>(pixel.0), low_alpha),
zeros,
low_alpha_zero_mask,
);
let low_g = _mm_blendv_ps(
_mm_mul_ps(_mm_cvtph_psx::<F16C>(pixel.1), low_alpha),
zeros,
low_alpha_zero_mask,
);
let low_b = _mm_blendv_ps(
_mm_mul_ps(_mm_cvtph_psx::<F16C>(pixel.2), low_alpha),
zeros,
low_alpha_zero_mask,
);
let high_alpha = _mm_cvtph_psx::<F16C>(_mm_srli_si128::<8>(pixel.3));
let high_alpha_zero_mask = _mm_cmpeq_ps(high_alpha, zeros);
let high_r = _mm_blendv_ps(
_mm_mul_ps(
_mm_cvtph_psx::<F16C>(_mm_srli_si128::<8>(pixel.0)),
high_alpha,
),
zeros,
high_alpha_zero_mask,
);
let high_g = _mm_blendv_ps(
_mm_mul_ps(
_mm_cvtph_psx::<F16C>(_mm_srli_si128::<8>(pixel.1)),
high_alpha,
),
zeros,
high_alpha_zero_mask,
);
let high_b = _mm_blendv_ps(
_mm_mul_ps(
_mm_cvtph_psx::<F16C>(_mm_srli_si128::<8>(pixel.2)),
high_alpha,
),
zeros,
high_alpha_zero_mask,
);
let r_values =
_mm_unpacklo_epi64(_mm_cvtps_phx::<F16C>(low_r), _mm_cvtps_phx::<F16C>(high_r));
let g_values =
_mm_unpacklo_epi64(_mm_cvtps_phx::<F16C>(low_g), _mm_cvtps_phx::<F16C>(high_g));
let b_values =
_mm_unpacklo_epi64(_mm_cvtps_phx::<F16C>(low_b), _mm_cvtps_phx::<F16C>(high_b));
let dst_ptr = dst.as_mut_ptr();
let (d_lane0, d_lane1, d_lane2, d_lane3) =
sse_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3);
_mm_storeu_si128(dst_ptr as *mut __m128i, d_lane0);
_mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, d_lane1);
_mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, d_lane2);
_mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, d_lane3);
}
rem = rem.chunks_exact_mut(8 * 4).into_remainder();
unpremultiply_pixel_f16_row(rem);
}
}