#![forbid(unsafe_code)]
use crate::WorkloadStrategy;
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
use crate::avx2::{avx_premultiply_alpha_rgba, avx_unpremultiply_alpha_rgba};
#[cfg(all(target_arch = "aarch64", feature = "neon"))]
use crate::neon::{neon_premultiply_alpha_rgba, neon_unpremultiply_alpha_rgba};
#[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
use crate::sse::*;
#[cfg(all(target_arch = "wasm32", target_feature = "simd128",))]
use crate::wasm32::{wasm_premultiply_alpha_rgba, wasm_unpremultiply_alpha_rgba};
use novtb::{ParallelZonedIterator, TbSliceMut};
use std::sync::OnceLock;
/// Rounded division of `v` by 255 using only shifts and adds.
///
/// Evaluates `(v + 128 + ((v + 128) >> 8)) >> 8`. The callers in this file pass the
/// product of two 8-bit values (at most `255 * 255`), for which no intermediate
/// exceeds `u16::MAX`. Inputs above `65407` overflow the first addition.
#[inline]
pub(crate) fn div_by_255(v: u16) -> u8 {
    // Add half of the divisor up front so the final shift rounds instead of truncating.
    let biased = v + 0x80;
    // Folding the high byte back in turns the division by 256 into a division by 255.
    let folded = (biased >> 8) + biased;
    (folded >> 8) as u8
}
/// Premultiplies one row of 8-bit RGBA pixels from `src` into `dst`.
///
/// Each colour channel is scaled by the pixel's alpha via [`div_by_255`]; the alpha
/// byte is written back as `div_by_255(255 * alpha)`. Only complete 4-byte pixels are
/// processed: trailing bytes that do not form a full pixel are left untouched, and the
/// shorter of the two slices bounds how many pixels are converted.
pub(crate) fn premultiply_alpha_rgba_row_impl(dst: &mut [u8], src: &[u8]) {
    let (dst_pixels, _) = dst.as_chunks_mut::<4>();
    let (src_pixels, _) = src.as_chunks::<4>();
    for (out, &[r, g, b, a]) in dst_pixels.iter_mut().zip(src_pixels) {
        let alpha = u16::from(a);
        *out = [
            div_by_255(u16::from(r) * alpha),
            div_by_255(u16::from(g) * alpha),
            div_by_255(u16::from(b) * alpha),
            div_by_255(255 * alpha),
        ];
    }
}
/// Premultiplies one row of 8-bit gray+alpha pixels from `src` into `dst`.
///
/// The gray value is scaled by the pixel's alpha via [`div_by_255`]; the alpha byte is
/// written back as `div_by_255(255 * alpha)`. Only complete 2-byte pixels are processed:
/// a trailing odd byte is left untouched, and the shorter slice bounds the work.
pub(crate) fn premultiply_alpha_gray_alpha_row_impl(dst: &mut [u8], src: &[u8]) {
    let (dst_pixels, _) = dst.as_chunks_mut::<2>();
    let (src_pixels, _) = src.as_chunks::<2>();
    for (out, &[gray, a]) in dst_pixels.iter_mut().zip(src_pixels) {
        let alpha = u16::from(a);
        *out = [
            div_by_255(u16::from(gray) * alpha),
            div_by_255(255 * alpha),
        ];
    }
}
/// Scalar RGBA premultiplication kernel for a single row.
///
/// Forwards to [`premultiply_alpha_rgba_row_impl`]. It exists so the scalar path has
/// the same `fn(&mut [u8], &[u8])` shape as the SIMD kernels and can serve as the
/// default value of the dispatcher in [`premultiply_alpha_rgba`].
fn premultiply_alpha_rgba_impl(dst: &mut [u8], src: &[u8]) {
premultiply_alpha_rgba_row_impl(dst, src);
}
/// Scalar gray+alpha premultiplication over a whole image, one row per work item.
///
/// `dst` is split into rows of `dst_stride` bytes and `src` into rows of `stride`
/// bytes; each pair of rows is handed to [`premultiply_alpha_gray_alpha_row_impl`]
/// through `pool`.
///
/// Only the first `width * 2` bytes of every row are converted, so bytes between the
/// end of the visible pixels and the next row (stride padding, or neighbouring pixels
/// when the buffers are views into a larger image) are neither read nor overwritten.
/// The length is clamped to what each row chunk actually holds, so a short final
/// chunk is processed as far as it goes rather than panicking.
///
/// The fifth parameter (image height) is unused: the row count follows from the
/// slice lengths and strides.
fn premultiply_alpha_gray_alpha_impl(
    dst: &mut [u8],
    dst_stride: usize,
    src: &[u8],
    width: usize,
    _: usize,
    stride: usize,
    pool: &novtb::ThreadPool,
) {
    // Two bytes per pixel: gray followed by alpha.
    let row_bytes = width.saturating_mul(2);
    dst.tb_par_chunks_mut(dst_stride)
        .zip(src.chunks(stride))
        .for_each(pool, |(dst, src)| {
            let len = row_bytes.min(dst.len()).min(src.len());
            premultiply_alpha_gray_alpha_row_impl(&mut dst[..len], &src[..len]);
        });
}
/// Lazily built lookup table used to undo alpha premultiplication for 8-bit data.
static UNPREMULTIPLICATION_TABLE: OnceLock<Box<[u8; 65536]>> = OnceLock::new();

/// Returns the shared un-premultiplication lookup table, building it on first use.
///
/// The entry for an `(alpha, pixel)` pair lives at index `alpha * 255 + pixel` and
/// holds `round(pixel * 255 / alpha)` clamped to 255; every `alpha == 0` entry is 0.
///
/// NOTE(review): because the row stride is 255 rather than 256, the slot for
/// `(alpha, 255)` is the same as the slot for `(alpha + 1, 0)`. Rows are filled in
/// ascending alpha order, so the later `(alpha + 1, 0)` value (0) is what remains.
/// For `alpha < 255` a `pixel` of 255 is greater than alpha — confirm such inputs are
/// not expected before relying on the clamp for them.
pub(crate) fn unpremultiplication_table() -> &'static [u8; 65536] {
    UNPREMULTIPLICATION_TABLE.get_or_init(|| {
        // Zero-initialised, which already covers every `alpha == 0` entry.
        let mut table = Box::new([0u8; 65536]);
        for alpha in 1..256usize {
            let row_start = alpha * 255;
            let half = alpha / 2;
            for pixel in 0..256usize {
                let restored = (pixel * 255 + half) / alpha;
                table[row_start + pixel] = restored.min(255) as u8;
            }
        }
        table
    })
}
/// Un-premultiplies one row of 8-bit RGBA pixels in place.
///
/// Each colour channel is replaced by the table entry at `alpha * 255 + channel` from
/// [`unpremultiplication_table`]; the alpha byte itself is not modified. Trailing
/// bytes that do not form a complete 4-byte pixel are left untouched.
#[inline]
pub(crate) fn unpremultiply_alpha_rgba_row_impl(in_place: &mut [u8]) {
    let lut = unpremultiplication_table();
    let (pixels, _) = in_place.as_chunks_mut::<4>();
    for pixel in pixels {
        // Start of this alpha's row inside the lookup table.
        let row_start = usize::from(pixel[3]) * 255;
        for channel in &mut pixel[..3] {
            *channel = lut[row_start + usize::from(*channel)];
        }
    }
}
/// Un-premultiplies one row of 8-bit gray+alpha pixels in place.
///
/// The gray byte is replaced by the table entry at `alpha * 255 + gray` from
/// [`unpremultiplication_table`]; the alpha byte is not modified. A trailing odd byte
/// is left untouched.
#[inline]
pub(crate) fn unpremultiply_alpha_gray_alpha_row_impl(in_place: &mut [u8]) {
    let lut = unpremultiplication_table();
    let (pixels, _) = in_place.as_chunks_mut::<2>();
    for [gray, alpha] in pixels {
        // Start of this alpha's row inside the lookup table.
        let row_start = usize::from(*alpha) * 255;
        *gray = lut[row_start + usize::from(*gray)];
    }
}
/// Scalar RGBA un-premultiplication kernel for a single row.
///
/// Forwards to [`unpremultiply_alpha_rgba_row_impl`]; the `WorkloadStrategy` argument
/// is accepted only so the signature matches the dispatcher type used in
/// [`unpremultiply_alpha_rgba`], and is ignored here.
fn unpremultiply_alpha_rgba_impl(in_place: &mut [u8], _: WorkloadStrategy) {
unpremultiply_alpha_rgba_row_impl(in_place);
}
/// Premultiplies an 8-bit RGBA image from `src` into `dst`, row by row.
///
/// `dst` is split into rows of `dst_stride` bytes and `src` into rows of `src_stride`
/// bytes; the first `width * 4` bytes of each row pair are passed to the selected
/// kernel through `pool`. The fifth parameter (image height) is unused.
///
/// Kernel selection starts from the scalar implementation and is overridden by
/// whichever SIMD kernels are compiled in. On x86 the AVX2 check runs after the
/// SSE4.1 check, so AVX2 wins when both are detected at run time.
///
/// # Panics
///
/// Panics if any row chunk is shorter than `width * 4` bytes.
pub(crate) fn premultiply_alpha_rgba(
    dst: &mut [u8],
    dst_stride: usize,
    src: &[u8],
    width: usize,
    _: usize,
    src_stride: usize,
    pool: &novtb::ThreadPool,
) {
    // Scalar fallback; the cfg-gated sections below may replace it.
    let mut _dispatcher: fn(&mut [u8], &[u8]) = premultiply_alpha_rgba_impl;

    #[cfg(all(target_arch = "aarch64", feature = "neon"))]
    {
        _dispatcher = neon_premultiply_alpha_rgba;
    }

    #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
    {
        if std::arch::is_x86_feature_detected!("sse4.1") {
            _dispatcher = sse_premultiply_alpha_rgba;
        }
    }

    // Must stay after the SSE section so that AVX2 takes precedence.
    #[cfg(all(target_arch = "x86_64", feature = "avx"))]
    {
        if std::arch::is_x86_feature_detected!("avx2") {
            _dispatcher = avx_premultiply_alpha_rgba;
        }
    }

    #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
    {
        _dispatcher = wasm_premultiply_alpha_rgba;
    }

    dst.tb_par_chunks_mut(dst_stride)
        .zip(src.chunks(src_stride))
        .for_each(pool, |(dst_row, src_row)| {
            // Skip whatever lies between the visible pixels and the next row.
            let row_bytes = width * 4;
            _dispatcher(&mut dst_row[..row_bytes], &src_row[..row_bytes]);
        });
}
/// Premultiplies an 8-bit gray+alpha image from `src` into `dst`.
///
/// All arguments are forwarded unchanged to the selected whole-image kernel. Only the
/// scalar kernel, [`premultiply_alpha_gray_alpha_impl`], is wired in here.
pub(crate) fn premultiply_alpha_gray_alpha(
    dst: &mut [u8],
    dst_stride: usize,
    src: &[u8],
    width: usize,
    height: usize,
    src_stride: usize,
    pool: &novtb::ThreadPool,
) {
    /// Shape shared by every whole-image gray+alpha premultiplication kernel.
    type Kernel = fn(&mut [u8], usize, &[u8], usize, usize, usize, &novtb::ThreadPool);

    let kernel: Kernel = premultiply_alpha_gray_alpha_impl;
    kernel(dst, dst_stride, src, width, height, src_stride, pool);
}
/// Un-premultiplies an 8-bit RGBA image in place, row by row.
///
/// `in_place` is split into rows of `stride` bytes; the first `width * 4` bytes of
/// each row are passed, together with `workload_strategy`, to the selected kernel
/// through `pool`. The third parameter (image height) is unused.
///
/// Kernel selection starts from the scalar implementation and is overridden by
/// whichever SIMD kernels are compiled in. On x86 the AVX2 check runs after the
/// SSE4.1 check, so AVX2 wins when both are detected at run time.
///
/// # Panics
///
/// Panics if any row chunk is shorter than `width * 4` bytes.
pub(crate) fn unpremultiply_alpha_rgba(
    in_place: &mut [u8],
    width: usize,
    _: usize,
    stride: usize,
    pool: &novtb::ThreadPool,
    workload_strategy: WorkloadStrategy,
) {
    // Scalar fallback; the cfg-gated sections below may replace it.
    let mut _dispatcher: fn(&mut [u8], WorkloadStrategy) = unpremultiply_alpha_rgba_impl;

    #[cfg(all(target_arch = "aarch64", feature = "neon"))]
    {
        _dispatcher = neon_unpremultiply_alpha_rgba;
    }

    #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
    {
        if std::arch::is_x86_feature_detected!("sse4.1") {
            _dispatcher = sse_unpremultiply_alpha_rgba;
        }
    }

    // Must stay after the SSE section so that AVX2 takes precedence.
    #[cfg(all(target_arch = "x86_64", feature = "avx"))]
    {
        if std::arch::is_x86_feature_detected!("avx2") {
            _dispatcher = avx_unpremultiply_alpha_rgba;
        }
    }

    #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
    {
        _dispatcher = wasm_unpremultiply_alpha_rgba;
    }

    in_place.tb_par_chunks_mut(stride).for_each(pool, |scanline| {
        // Skip whatever lies between the visible pixels and the next row.
        let pixels = &mut scanline[..width * 4];
        _dispatcher(pixels, workload_strategy);
    });
}
/// Scalar gray+alpha un-premultiplication over a whole image, in place.
///
/// `in_place` is split into rows of `stride` bytes and the first `width * 2` bytes of
/// each row are handed to [`unpremultiply_alpha_gray_alpha_row_impl`] through `pool`.
/// The image height and the `WorkloadStrategy` argument are accepted for signature
/// compatibility and ignored.
///
/// # Panics
///
/// Panics if any row chunk is shorter than `width * 2` bytes.
fn unpremultiply_alpha_gray_alpha_impl(
    in_place: &mut [u8],
    width: usize,
    _: usize,
    stride: usize,
    pool: &novtb::ThreadPool,
    _: WorkloadStrategy,
) {
    in_place.tb_par_chunks_mut(stride).for_each(pool, |scanline| {
        // Two bytes per pixel: gray followed by alpha.
        let pixels = &mut scanline[..width * 2];
        unpremultiply_alpha_gray_alpha_row_impl(pixels);
    });
}
/// Un-premultiplies an 8-bit gray+alpha image in place.
///
/// All arguments are forwarded unchanged to the selected whole-image kernel. Only the
/// scalar kernel, [`unpremultiply_alpha_gray_alpha_impl`], is wired in here.
pub(crate) fn unpremultiply_alpha_gray_alpha(
    in_place: &mut [u8],
    width: usize,
    height: usize,
    stride: usize,
    pool: &novtb::ThreadPool,
    workload_strategy: WorkloadStrategy,
) {
    /// Shape shared by every whole-image gray+alpha un-premultiplication kernel.
    type Kernel = fn(&mut [u8], usize, usize, usize, &novtb::ThreadPool, WorkloadStrategy);

    let kernel: Kernel = unpremultiply_alpha_gray_alpha_impl;
    kernel(in_place, width, height, stride, pool, workload_strategy);
}