#![forbid(unsafe_code)]
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
use crate::avx2::{avx_premultiply_alpha_rgba_u16, avx_unpremultiply_alpha_rgba_u16};
use crate::mixed_storage::CpuRound;
#[cfg(all(target_arch = "aarch64", feature = "neon",))]
use crate::neon::{neon_premultiply_alpha_rgba_u16, neon_unpremultiply_alpha_rgba_u16};
#[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
use crate::sse::{premultiply_alpha_sse_rgba_u16, unpremultiply_alpha_sse_rgba_u16};
use novtb::{ParallelZonedIterator, TbSliceMut};
/// Rounded division of `v` by 1023 (the 10-bit maximum) using only shifts and adds.
///
/// Computes `(b + (b >> 10)) >> 10` with `b = v + 512`, then truncates the result
/// to `u16`. All additions wrap on overflow instead of panicking.
#[inline]
pub(crate) fn div_by_1023(v: u32) -> u16 {
    const SHIFT: u32 = 10;
    // Half of the divisor's power-of-two neighbour, added so the result rounds
    // to nearest rather than truncating.
    const HALF: u32 = 1 << (SHIFT - 1);
    let biased = v.wrapping_add(HALF);
    (biased.wrapping_add(biased >> SHIFT) >> SHIFT) as u16
}
/// Rounded division of `v` by 4095 (the 12-bit maximum) using only shifts and adds.
///
/// Computes `(b + (b >> 12)) >> 12` with `b = v + 2048`, then truncates the result
/// to `u16`. All additions wrap on overflow instead of panicking.
#[inline]
pub(crate) fn div_by_4095(v: u32) -> u16 {
    const SHIFT: u32 = 12;
    // Rounding bias: half of 2^12.
    const HALF: u32 = 1 << (SHIFT - 1);
    let biased = v.wrapping_add(HALF);
    let corrected = biased.wrapping_add(biased >> SHIFT);
    (corrected >> SHIFT) as u16
}
/// Rounded division of `v` by 65535 (the 16-bit maximum) using only shifts and adds.
///
/// Computes `(b + (b >> 16)) >> 16` with `b = v + 32768`, then truncates the result
/// to `u16`. Unlike the 10- and 12-bit variants this uses plain `+`, so an
/// overflowing `v` panics in builds with overflow checks enabled.
#[inline]
pub(crate) fn div_by_65535(v: u32) -> u16 {
    const SHIFT: u32 = 16;
    // Rounding bias: half of 2^16.
    const HALF: u32 = 1 << (SHIFT - 1);
    let biased = v + HALF;
    ((biased + (biased >> SHIFT)) >> SHIFT) as u16
}
pub(crate) fn premultiply_alpha_rgba_row(dst: &mut [u16], src: &[u16], bit_depth: usize) {
let max_colors = (1u32 << bit_depth) - 1;
if max_colors == 1023 {
for (dst, src) in dst
.as_chunks_mut::<4>()
.0
.iter_mut()
.zip(src.as_chunks::<4>().0.iter())
{
let a = src[3] as u32;
dst[0] = div_by_1023((src[0] as u32).wrapping_mul(a));
dst[1] = div_by_1023((src[1] as u32).wrapping_mul(a));
dst[2] = div_by_1023((src[2] as u32).wrapping_mul(a));
dst[3] = div_by_1023((src[3] as u32).wrapping_mul(1023));
}
} else if max_colors == 4096 {
for (dst, src) in dst
.as_chunks_mut::<4>()
.0
.iter_mut()
.zip(src.as_chunks::<4>().0.iter())
{
let a = src[3] as u32;
dst[0] = div_by_4095((src[0] as u32).wrapping_mul(a));
dst[1] = div_by_4095((src[1] as u32).wrapping_mul(a));
dst[2] = div_by_4095((src[2] as u32).wrapping_mul(a));
dst[3] = div_by_4095((src[3] as u32).wrapping_mul(4095));
}
} else if max_colors == 65535 {
for (dst, src) in dst
.as_chunks_mut::<4>()
.0
.iter_mut()
.zip(src.as_chunks::<4>().0.iter())
{
let a = src[3] as u32;
dst[0] = div_by_65535((src[0] as u32).wrapping_mul(a));
dst[1] = div_by_65535((src[1] as u32).wrapping_mul(a));
dst[2] = div_by_65535((src[2] as u32).wrapping_mul(a));
dst[3] = div_by_65535((src[3] as u32).wrapping_mul(65535));
}
} else {
let recip_max_colors = 1. / max_colors as f32;
for (dst, src) in dst
.as_chunks_mut::<4>()
.0
.iter_mut()
.zip(src.as_chunks::<4>().0.iter())
{
let a = src[3] as u32;
dst[0] = (((src[0] as u32).wrapping_mul(a) as f32 * recip_max_colors).cpu_round()
as u32)
.min(max_colors) as u16;
dst[1] = (((src[1] as u32).wrapping_mul(a) as f32 * recip_max_colors).cpu_round()
as u32)
.min(max_colors) as u16;
dst[2] = (((src[2] as u32).wrapping_mul(a) as f32 * recip_max_colors).cpu_round()
as u32)
.min(max_colors) as u16;
dst[3] = ((a.wrapping_mul(max_colors) as f32 * recip_max_colors).cpu_round() as u32)
.min(max_colors) as u16;
}
}
}
pub(crate) fn premultiply_alpha_gray_alpha_row(dst: &mut [u16], src: &[u16], max_colors: u32) {
if max_colors == 1023 {
for (dst, src) in dst
.as_chunks_mut::<2>()
.0
.iter_mut()
.zip(src.as_chunks::<2>().0.iter())
{
let a = src[1] as u32;
dst[0] = div_by_1023((src[0] as u32).wrapping_mul(a));
dst[1] = div_by_1023(a.wrapping_mul(1023));
}
} else if max_colors == 4096 {
for (dst, src) in dst
.as_chunks_mut::<2>()
.0
.iter_mut()
.zip(src.as_chunks::<2>().0.iter())
{
let a = src[1] as u32;
dst[0] = div_by_4095((src[0] as u32).wrapping_mul(a));
dst[1] = div_by_4095(a.wrapping_mul(4095));
}
} else if max_colors == 65535 {
for (dst, src) in dst
.as_chunks_mut::<2>()
.0
.iter_mut()
.zip(src.as_chunks::<2>().0.iter())
{
let a = src[1] as u32;
dst[0] = div_by_65535((src[0] as u32).wrapping_mul(a));
dst[1] = div_by_65535(a.wrapping_mul(65535));
}
} else {
let recip_max_colors = 1. / max_colors as f32;
for (dst, src) in dst
.as_chunks_mut::<2>()
.0
.iter_mut()
.zip(src.as_chunks::<2>().0.iter())
{
let a = src[1] as u32;
dst[0] = (((src[0] as u32).wrapping_mul(a) as f32 * recip_max_colors).cpu_round()
as u32)
.min(max_colors) as u16;
dst[1] = ((a.wrapping_mul(max_colors) as f32 * recip_max_colors).cpu_round() as u32)
.min(max_colors) as u16;
}
}
}
/// Reverses alpha premultiplication in place for one row of interleaved RGBA
/// `u16` samples.
///
/// For every whole 4-element pixel with a non-zero alpha, each colour sample is
/// replaced by `sample * max / alpha`, passed through `cpu_round` and clamped to
/// `max = (1 << bit_depth) - 1`. The alpha sample is never written, and pixels
/// with zero alpha are left as they are.
pub(crate) fn unpremultiply_alpha_rgba_row(in_place: &mut [u16], bit_depth: usize) {
    let max_value = ((1 << bit_depth) - 1) as f32;
    let (pixels, _) = in_place.as_chunks_mut::<4>();
    for [r, g, b, alpha] in pixels {
        if *alpha == 0 {
            // Dividing by zero alpha is undefined; keep the pixel untouched.
            continue;
        }
        let scale = max_value / *alpha as f32;
        for channel in [r, g, b] {
            *channel = (*channel as f32 * scale).cpu_round().min(max_value) as u16;
        }
    }
}
/// Reverses alpha premultiplication in place for one row of interleaved
/// gray+alpha `u16` samples.
///
/// For every whole 2-element pixel with a non-zero alpha, the gray sample is
/// replaced by `gray * max_colors / alpha`, passed through `cpu_round` and
/// clamped to `max_colors`. Pixels with zero alpha are left as they are.
///
/// The alpha sample itself is never modified, matching the RGBA row routine.
/// The earlier implementation wrote `alpha * (max_colors / alpha)` back into the
/// alpha slot, which turned every non-zero alpha into `max_colors` and so
/// discarded the transparency information.
pub(crate) fn unpremultiply_alpha_gray_alpha_row(in_place: &mut [u16], max_colors: u32) {
    let max_value = max_colors as f32;
    for [gray, alpha] in in_place.as_chunks_mut::<2>().0 {
        if *alpha == 0 {
            // Dividing by zero alpha is undefined; keep the pixel untouched.
            continue;
        }
        let a_recip = max_value / *alpha as f32;
        *gray = (*gray as f32 * a_recip).cpu_round().min(max_value) as u16;
    }
}
/// Premultiplies a gray+alpha `u16` image row by row.
///
/// `dst` is split into chunks of `dst_stride` elements and `src` into chunks of
/// `src_stride` elements. For each paired chunk the first `width * 2` elements
/// are handed to `premultiply_alpha_gray_alpha_row` with
/// `max_colors = (1 << bit_depth) - 1`. The fifth (unnamed) argument is ignored.
///
/// Rows are driven through `tb_par_chunks_mut(..).for_each(pool, ..)`.
///
/// # Panics
///
/// Panics if a chunk holds fewer than `width * 2` elements.
fn premultiply_alpha_gray_alpha_impl(
    dst: &mut [u16],
    dst_stride: usize,
    src: &[u16],
    width: usize,
    _: usize,
    src_stride: usize,
    bit_depth: usize,
    pool: &novtb::ThreadPool,
) {
    let max_colors: u32 = (1 << bit_depth) - 1;
    let dst_rows = dst.tb_par_chunks_mut(dst_stride);
    let src_rows = src.chunks(src_stride);
    dst_rows.zip(src_rows).for_each(pool, |(dst_row, src_row)| {
        // Two samples (gray, alpha) per pixel; anything past that is row padding.
        let samples = width * 2;
        premultiply_alpha_gray_alpha_row(&mut dst_row[..samples], &src_row[..samples], max_colors);
    });
}
/// Reverses alpha premultiplication in place for a gray+alpha `u16` image, row by row.
///
/// `in_place` is split into chunks of `src_stride` elements. The first
/// `width * 2` elements of each chunk are handed to
/// `unpremultiply_alpha_gray_alpha_row` with `max_colors = (1 << bit_depth) - 1`.
/// The fourth (unnamed) argument is ignored.
///
/// Rows are driven through `tb_par_chunks_mut(..).for_each(pool, ..)`.
///
/// # Panics
///
/// Panics if a chunk holds fewer than `width * 2` elements.
fn unpremultiply_alpha_gray_alpha_impl(
    in_place: &mut [u16],
    src_stride: usize,
    width: usize,
    _: usize,
    bit_depth: usize,
    pool: &novtb::ThreadPool,
) {
    let max_colors: u32 = (1 << bit_depth) - 1;
    let rows = in_place.tb_par_chunks_mut(src_stride);
    rows.for_each(pool, |line| {
        // Two samples (gray, alpha) per pixel; anything past that is row padding.
        let samples = width * 2;
        unpremultiply_alpha_gray_alpha_row(&mut line[..samples], max_colors);
    });
}
/// Premultiplies an RGBA `u16` image by its alpha channel, row by row.
///
/// `dst` is split into chunks of `dst_stride` elements and `src` into chunks of
/// `src_stride` elements. For each paired chunk the first `width * 4` elements
/// are passed, together with `bit_depth`, to the selected row routine. The fifth
/// (unnamed) argument is ignored.
///
/// The row routine starts as `premultiply_alpha_rgba_row` and is overridden by a
/// SIMD variant when the matching `cfg` block is compiled in. On x86 the override
/// additionally requires the CPU feature to be detected at run time.
///
/// # Panics
///
/// Panics if a chunk holds fewer than `width * 4` elements.
pub(crate) fn premultiply_alpha_rgba_u16(
dst: &mut [u16],
dst_stride: usize,
src: &[u16],
width: usize,
_: usize,
src_stride: usize,
bit_depth: usize,
pool: &novtb::ThreadPool,
) {
// Scalar fallback. NOTE(review): the leading underscore presumably silences
// lint warnings about `mut` on targets where none of the overrides below are
// compiled in — confirm before renaming.
#[allow(clippy::type_complexity)]
let mut _dispatcher: fn(&mut [u16], &[u16], usize) = premultiply_alpha_rgba_row;
#[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
{
if std::arch::is_x86_feature_detected!("sse4.1") {
_dispatcher = premultiply_alpha_sse_rgba_u16;
}
}
// Checked after SSE4.1 so that AVX2 wins when both are available.
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
{
if std::arch::is_x86_feature_detected!("avx2") {
_dispatcher = avx_premultiply_alpha_rgba_u16;
}
}
// No run-time detection here: the NEON routine is selected unconditionally
// whenever this block is compiled in.
#[cfg(all(target_arch = "aarch64", feature = "neon"))]
{
_dispatcher = neon_premultiply_alpha_rgba_u16;
}
dst.tb_par_chunks_mut(dst_stride)
.zip(src.chunks(src_stride))
.for_each(pool, |(dst, src)| {
// Four samples per pixel; anything past `width * 4` is row padding.
_dispatcher(&mut dst[..width * 4], &src[..width * 4], bit_depth);
});
}
pub(crate) fn premultiply_alpha_gray_alpha_u16(
dst: &mut [u16],
dst_stride: usize,
src: &[u16],
width: usize,
height: usize,
src_stride: usize,
bit_depth: usize,
pool: &novtb::ThreadPool,
) {
#[allow(clippy::type_complexity)]
let mut _dispatcher: fn(
&mut [u16],
usize,
&[u16],
usize,
usize,
usize,
usize,
&novtb::ThreadPool,
) = premultiply_alpha_gray_alpha_impl;
_dispatcher(
dst, dst_stride, src, width, height, src_stride, bit_depth, pool,
);
}
/// Reverses alpha premultiplication in place for an RGBA `u16` image, row by row.
///
/// `in_place` is split into chunks of `src_stride` elements. The first
/// `width * 4` elements of each chunk are passed, together with `bit_depth`, to
/// the selected row routine. The fourth (unnamed) argument is ignored.
///
/// The row routine starts as `unpremultiply_alpha_rgba_row` and is overridden by
/// a SIMD variant when the matching `cfg` block is compiled in. On x86 the
/// override additionally requires the CPU feature to be detected at run time.
///
/// # Panics
///
/// Panics if a chunk holds fewer than `width * 4` elements.
pub(crate) fn unpremultiply_alpha_rgba_u16(
in_place: &mut [u16],
src_stride: usize,
width: usize,
_: usize,
bit_depth: usize,
pool: &novtb::ThreadPool,
) {
// Scalar fallback. NOTE(review): the leading underscore presumably silences
// lint warnings about `mut` on targets where none of the overrides below are
// compiled in — confirm before renaming.
#[allow(clippy::type_complexity)]
let mut _dispatcher: fn(&mut [u16], usize) = unpremultiply_alpha_rgba_row;
#[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
{
if std::arch::is_x86_feature_detected!("sse4.1") {
_dispatcher = unpremultiply_alpha_sse_rgba_u16;
}
}
// Checked after SSE4.1 so that AVX2 wins when both are available.
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
{
if std::arch::is_x86_feature_detected!("avx2") {
_dispatcher = avx_unpremultiply_alpha_rgba_u16;
}
}
// No run-time detection here: the NEON routine is selected unconditionally
// whenever this block is compiled in.
#[cfg(all(target_arch = "aarch64", feature = "neon"))]
{
_dispatcher = neon_unpremultiply_alpha_rgba_u16;
}
in_place
.tb_par_chunks_mut(src_stride)
.for_each(pool, |row| {
// Four samples per pixel; anything past `width * 4` is row padding.
_dispatcher(&mut row[..width * 4], bit_depth);
});
}
pub(crate) fn unpremultiply_alpha_gray_alpha_u16(
in_place: &mut [u16],
src_stride: usize,
width: usize,
height: usize,
bit_depth: usize,
pool: &novtb::ThreadPool,
) {
#[allow(clippy::type_complexity)]
let mut _dispatcher: fn(&mut [u16], usize, usize, usize, usize, &novtb::ThreadPool) =
unpremultiply_alpha_gray_alpha_impl;
_dispatcher(in_place, src_stride, width, height, bit_depth, pool);
}