use png::ColorType;
use pixel::{PixFmt,lerp_u8};
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
/// `true` when this file is compiled for an x86 or x86_64 target, `false` otherwise.
const X86: bool = cfg!(any(target_arch="x86", target_arch="x86_64"));
/// Color with four 8-bit channels: red, green, blue and alpha.
///
/// `#[repr(C)]` keeps the fields in declaration order, so one value occupies
/// four consecutive bytes laid out as red, green, blue, alpha.
#[derive(Clone,Copy,Debug,Default)]
#[repr(C)]
pub struct Rgba8 {
/// Red channel.
red: u8,
/// Green channel.
green: u8,
/// Blue channel.
blue: u8,
/// Alpha channel.
alpha: u8,
}
impl From<Rgba8> for i32 {
    /// Pack the four channels into one 32-bit value.
    ///
    /// Red lands in the lowest byte, followed by green and blue, with alpha
    /// in the highest byte.
    fn from(c: Rgba8) -> i32 {
        let packed = u32::from(c.red())
            | (u32::from(c.green()) << 8)
            | (u32::from(c.blue()) << 16)
            | (u32::from(c.alpha()) << 24);
        // Reinterpret the bits; an alpha above 0x7F makes the result negative.
        packed as i32
    }
}
impl Rgba8 {
    /// Build a color from red, green, blue and alpha components.
    pub fn new(red: u8, green: u8, blue: u8, alpha: u8) -> Self {
        Self { red, green, blue, alpha }
    }

    /// Build a color from red, green and blue, with alpha set to `0xFF`.
    pub fn rgb(red: u8, green: u8, blue: u8) -> Self {
        Self { red, green, blue, alpha: 0xFF }
    }

    /// Divide each color channel by the alpha channel using `unscale_u8`.
    ///
    /// The alpha channel itself is carried over unchanged.
    fn divide_alpha(self) -> Self {
        let Rgba8 { red, green, blue, alpha } = self;
        Rgba8::new(
            unscale_u8(red, alpha),
            unscale_u8(green, alpha),
            unscale_u8(blue, alpha),
            alpha,
        )
    }

    /// Red component.
    pub fn red(self) -> u8 {
        self.red
    }

    /// Green component.
    pub fn green(self) -> u8 {
        self.green
    }

    /// Blue component.
    pub fn blue(self) -> u8 {
        self.blue
    }

    /// Alpha component.
    pub fn alpha(self) -> u8 {
        self.alpha
    }

    /// Blend `self` (top) with `bot` (bottom), weighted by `alpha`.
    ///
    /// Every channel, alpha included, is `lerp_u8(top, bottom, alpha)`; the
    /// exact weighting is whatever `pixel::lerp_u8` defines.
    fn over_alpha(self, bot: Rgba8, alpha: u8) -> Self {
        Rgba8 {
            red: lerp_u8(self.red, bot.red, alpha),
            green: lerp_u8(self.green, bot.green, alpha),
            blue: lerp_u8(self.blue, bot.blue, alpha),
            alpha: lerp_u8(self.alpha, bot.alpha, alpha),
        }
    }
}
/// Divide channel value `a` by coverage `b`, where `b == 255` means 1.0.
///
/// Computes `a * 255 / b` rounded to nearest and clamped to 255, so dividing
/// by a fully opaque alpha (`b == 255`) returns `a` unchanged. Returns 0 when
/// `b` is 0, since there is nothing to recover from a fully transparent pixel.
fn unscale_u8(a: u8, b: u8) -> u8 {
    if b == 0 {
        return 0;
    }
    let divisor = u32::from(b);
    // Adding half the divisor rounds to nearest instead of truncating.
    let scaled = u32::from(a) * 255 + divisor / 2;
    // `a > b` is not a valid premultiplied value; clamp rather than wrap.
    (scaled / divisor).min(255) as u8
}
impl PixFmt for Rgba8 {
    /// PNG color type matching this pixel format.
    fn color_type() -> ColorType {
        ColorType::RGBA
    }

    /// Blend `clr` over every pixel in `pix`, using `mask` as per-pixel coverage.
    ///
    /// On x86 / x86_64 the SSSE3 implementation is used when the running CPU
    /// supports it; every other case goes through the scalar fallback.
    fn over(pix: &mut [Self], mask: &[u8], clr: Self) {
        debug_assert_eq!(pix.len(), mask.len());
        // `over_x86` and `is_x86_feature_detected!` only exist on x86 targets,
        // so the call site has to be compiled out elsewhere; a runtime `if`
        // would still leave unresolved names on other architectures.
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            if is_x86_feature_detected!("ssse3") {
                // SAFETY: SSSE3 support was confirmed by the runtime check
                // above, which is the CPU feature `over_x86` relies on.
                unsafe { over_x86(pix, mask, clr) };
                return;
            }
        }
        over_fallback(pix, mask, clr);
    }

    /// Divide the color channels of every pixel in `pix` by its alpha.
    fn divide_alpha(pix: &mut [Self]) {
        for p in pix.iter_mut() {
            *p = p.divide_alpha();
        }
    }
}
#[cfg(any(target_arch="x86", target_arch="x86_64"))]
unsafe fn over_x86(pix: &mut [Rgba8], mask: &[u8], clr: Rgba8) {
debug_assert_eq!(pix.len(), mask.len());
let len = pix.len();
let clr = _mm_set1_epi32(clr.into());
let src = mask.as_ptr();
let dst = pix.as_mut_ptr();
for i in (0..len).step_by(4) {
let off = i as isize;
let dst = dst.offset(off) as *mut __m128i;
let src = src.offset(off) as *const i32;
let alpha = swizzle_mask_x86(_mm_set1_epi32(*src));
let bot = _mm_loadu_si128(dst);
let out = over_alpha_u8x16_x86(clr, bot, alpha);
_mm_storeu_si128(dst, out);
}
}
/// Spread the four lowest bytes of `v` so that each one fills a 32-bit lane.
///
/// Output bytes 0..4 are input byte 0, bytes 4..8 are input byte 1, bytes
/// 8..12 are input byte 2 and bytes 12..16 are input byte 3.
///
/// # Safety
///
/// The running CPU must support SSSE3 (`_mm_shuffle_epi8`).
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "ssse3")]
unsafe fn swizzle_mask_x86(v: __m128i) -> __m128i {
    // `_mm_set_epi8` lists bytes from the highest (15) down to the lowest (0).
    let spread = _mm_set_epi8(3, 3, 3, 3,
                              2, 2, 2, 2,
                              1, 1, 1, 1,
                              0, 0, 0, 0);
    _mm_shuffle_epi8(v, spread)
}
/// Blend sixteen 8-bit channels of `t` (top) over `b` (bottom) by coverage `a`.
///
/// Each half of the registers is widened to 16-bit lanes and evaluated as
/// `b + scale((a >> 1) * (t - b))`; the two halves are then packed back to
/// bytes with unsigned saturation.
#[cfg(any(target_arch="x86", target_arch="x86_64"))]
unsafe fn over_alpha_u8x16_x86(t: __m128i, b: __m128i, a: __m128i) -> __m128i {
    let zero = _mm_setzero_si128();

    // Low eight bytes, zero-extended to 16-bit lanes.
    let lo = {
        let top = _mm_unpacklo_epi8(t, zero);
        let bot = _mm_unpacklo_epi8(b, zero);
        // Halving the coverage to 7 bits keeps `alpha * (top - bot)` within i16.
        let alpha = _mm_srli_epi16(_mm_unpacklo_epi8(a, zero), 1);
        let delta = _mm_mullo_epi16(alpha, _mm_sub_epi16(top, bot));
        _mm_add_epi16(bot, scale_i16_to_u8_x86(delta))
    };

    // High eight bytes, handled the same way.
    let hi = {
        let top = _mm_unpackhi_epi8(t, zero);
        let bot = _mm_unpackhi_epi8(b, zero);
        let alpha = _mm_srli_epi16(_mm_unpackhi_epi8(a, zero), 1);
        let delta = _mm_mullo_epi16(alpha, _mm_sub_epi16(top, bot));
        _mm_add_epi16(bot, scale_i16_to_u8_x86(delta))
    };

    _mm_packus_epi16(lo, hi)
}
/// Scale each signed 16-bit lane down, computing `(v + 1 + (v >> 8)) >> 7`.
///
/// Both shifts are arithmetic, so negative lanes keep their sign.
#[cfg(any(target_arch="x86", target_arch="x86_64"))]
unsafe fn scale_i16_to_u8_x86(v: __m128i) -> __m128i {
    let one = _mm_set1_epi16(1);
    let correction = _mm_srai_epi16(v, 8);
    let biased = _mm_add_epi16(_mm_add_epi16(v, one), correction);
    _mm_srai_epi16(biased, 7)
}
/// Scalar blend of `clr` over each pixel in `pix`, weighted by the matching `mask` byte.
///
/// Pixels and mask bytes are paired with `zip`, so if the slices differ in
/// length only the common prefix is blended.
fn over_fallback(pix: &mut [Rgba8], mask: &[u8], clr: Rgba8) {
    for (bot, &coverage) in pix.iter_mut().zip(mask) {
        *bot = clr.over_alpha(*bot, coverage);
    }
}