use pix::{Associated, Ch8, Mask8, Rgb, Translucent};
use crate::Blend;
#[cfg(all(target_arch = "x86", feature = "simd"))]
use std::arch::x86::*;
#[cfg(all(target_arch = "x86_64", feature = "simd"))]
use std::arch::x86_64::*;
impl Blend for Rgb<Ch8, Translucent<Ch8>, Associated> {
    /// Composite `clr` over every pixel of `dst`, weighted by the
    /// matching coverage byte in `mask` (0 = keep `dst`, 255 = full
    /// `clr`). Extra elements in the longer of the two slices are
    /// left untouched.
    fn mask_over(dst: &mut [Self], mask: &[u8], clr: Self) {
        #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"),
                  feature = "simd"))]
        {
            if is_x86_feature_detected!("ssse3") {
                let count = dst.len().min(mask.len());
                if count >= 4 {
                    // SAFETY: the runtime check above guarantees the
                    // SSSE3 instructions `over_x86` is compiled with
                    // are available on this CPU.
                    unsafe { over_x86(dst, mask, clr) }
                }
                // The SIMD pass only handles whole groups of 4
                // pixels; finish the 0-3 trailing pixels one by one.
                let done = count & !3;
                if done < count {
                    Self::mask_over_fallback(&mut dst[done..], &mask[done..], clr);
                }
                return;
            }
        }
        Blend::mask_over_fallback(dst, mask, clr);
    }

    /// Scalar path: composite `src` over each pixel individually.
    fn mask_over_fallback(dst: &mut [Self], mask: &[u8], src: Self) {
        for (pixel, &coverage) in dst.iter_mut().zip(mask.iter()) {
            *pixel = src.with_alpha_over(*pixel, coverage);
        }
    }
}
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"),
          feature = "simd"))]
#[target_feature(enable = "ssse3")]
/// SSSE3 path: composite `clr` over `pix` four pixels (one 128-bit
/// register) at a time, weighted by the per-pixel coverage bytes in
/// `mask`. Only whole groups of 4 pixels are processed; the caller is
/// responsible for any remainder.
///
/// # Safety
///
/// The caller must verify at runtime that SSSE3 is available before
/// calling this function.
unsafe fn over_x86(pix: &mut [Rgb<Ch8>], mask: &[u8], clr: Rgb<Ch8>) {
    // Pixel count covered by BOTH slices, rounded down to a multiple
    // of 4 so each iteration fills exactly one 16-byte register.
    let len = pix.len().min(mask.len());
    let len = (len >> 2) << 2;
    // Broadcast the source color into all four 32-bit pixel lanes
    // (relies on the crate's Rgb -> i32 packing).
    let clr = _mm_set1_epi32(clr.into());
    let src = mask.as_ptr();
    let dst = pix.as_mut_ptr();
    for i in (0..len).step_by(4) {
        let off = i as isize;
        let dst = dst.offset(off) as *mut __m128i;
        // `mask` is a byte slice, so this 4-byte group can sit at any
        // address. A plain `*` dereference of a `*const i32` here
        // would be UB on unaligned data, so read it unaligned.
        let m = (src.offset(off) as *const i32).read_unaligned();
        // Replicate each coverage byte across its pixel's 4 channels.
        let alpha = swizzle_mask_x86(_mm_set1_epi32(m));
        let bot = _mm_loadu_si128(dst);
        let out = over_alpha_u8x16_x86(clr, bot, alpha);
        _mm_storeu_si128(dst, out);
    }
}
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"),
          feature = "simd"))]
#[target_feature(enable = "ssse3")]
/// Spread four coverage bytes across a whole register: source byte
/// `n` (n in 0..4) is replicated into destination bytes `4n..4n + 4`,
/// giving each pixel's four channels the same alpha weight.
unsafe fn swizzle_mask_x86(v: __m128i) -> __m128i {
    // Shuffle control written in memory (low-to-high byte) order.
    let ctrl = _mm_setr_epi8(
        0, 0, 0, 0,
        1, 1, 1, 1,
        2, 2, 2, 2,
        3, 3, 3, 3,
    );
    _mm_shuffle_epi8(v, ctrl)
}
/// Composite 16 `u8` channels of `t` (top) over `b` (bottom), using
/// the per-channel weights in `a`: each lane computes approximately
/// `b + (a / 255) * (t - b)`, a linear interpolation from bottom
/// toward top.
///
/// NB: despite the `_even`/`_odd` names, `unpacklo`/`unpackhi` split
/// the register into its LOW 8 bytes and HIGH 8 bytes respectively;
/// each half is zero-extended to eight 16-bit lanes so the signed
/// arithmetic below has headroom.
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"),
feature = "simd"))]
#[target_feature(enable = "ssse3")]
unsafe fn over_alpha_u8x16_x86(t: __m128i, b: __m128i, a: __m128i) -> __m128i {
    // Low half of each input, widened to 16-bit lanes.
    let t_even = _mm_unpacklo_epi8(t, _mm_setzero_si128());
    let b_even = _mm_unpacklo_epi8(b, _mm_setzero_si128());
    let a_even = _mm_unpacklo_epi8(a, _mm_setzero_si128());
    // Halve alpha first so a * (t - b) fits in i16
    // (127 * 255 = 32385 < 32767); scale_i16_to_u8_x86 compensates
    // by dividing by ~127.5 instead of 255.
    let a_even = _mm_srli_epi16(a_even, 1);
    let even = _mm_mullo_epi16(a_even, _mm_sub_epi16(t_even, b_even));
    let even = scale_i16_to_u8_x86(even);
    let even = _mm_add_epi16(b_even, even);
    // High half: identical computation.
    let t_odd = _mm_unpackhi_epi8(t, _mm_setzero_si128());
    let b_odd = _mm_unpackhi_epi8(b, _mm_setzero_si128());
    let a_odd = _mm_unpackhi_epi8(a, _mm_setzero_si128());
    let a_odd = _mm_srli_epi16(a_odd, 1);
    let odd = _mm_mullo_epi16(a_odd, _mm_sub_epi16(t_odd, b_odd));
    let odd = scale_i16_to_u8_x86(odd);
    let odd = _mm_add_epi16(b_odd, odd);
    // Saturate each i16 lane back to u8 and reassemble the halves
    // (`even` becomes the low 8 bytes, `odd` the high 8).
    _mm_packus_epi16(even, odd)
}
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"),
          feature = "simd"))]
#[target_feature(enable = "ssse3")]
/// Scale 16-bit intermediate products back toward the u8 range:
/// computes `(v + (v >> 8) + 1) >> 7` per lane with arithmetic
/// shifts, a rounded approximation of dividing by 127.5.
unsafe fn scale_i16_to_u8_x86(v: __m128i) -> __m128i {
    // High byte of each lane; adding it back makes the divisor
    // behave like 255/2 rather than a plain power of two.
    let correction = _mm_srai_epi16(v, 8);
    // Add the correction plus a rounding bias of 1, then divide by
    // 128. Wrapping 16-bit adds are associative, so this grouping
    // matches ((v + 1) + (v >> 8)) exactly.
    let sum = _mm_add_epi16(_mm_add_epi16(v, correction), _mm_set1_epi16(1));
    _mm_srai_epi16(sum, 7)
}