#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
/// A 128-bit integer vector with every bit cleared.
// SAFETY: `[u8; 16]` and `__m128i` are both 16 bytes wide, and the all-zero
// bit pattern is a valid value for an integer vector.
const ZEROED: __m128i = unsafe { core::mem::transmute::<[u8; 16], __m128i>([0_u8; 16]) };
/// Borrows a 128-bit vector as a mutable slice of its 16 individual bytes.
///
/// Writes through the returned slice modify the vector in place.
#[inline]
fn m128i_as_mut_u8s(m: &mut __m128i) -> &mut [u8] {
    const BYTE_COUNT: usize = core::mem::size_of::<__m128i>() / core::mem::size_of::<u8>();
    let first_byte: *mut u8 = (m as *mut __m128i).cast();
    // SAFETY: the pointer comes from an exclusive reference, so it is non-null,
    // valid for `size_of::<__m128i>()` bytes for the returned lifetime, and not
    // aliased. `u8` has no alignment requirement and accepts any bit pattern.
    unsafe { core::slice::from_raw_parts_mut(first_byte, BYTE_COUNT) }
}
/// Borrows a 128-bit vector as a mutable slice of its eight `i16` lanes.
///
/// Writes through the returned slice modify the vector in place.
#[inline]
fn m128i_as_mut_i16s(m: &mut __m128i) -> &mut [i16] {
    const LANE_COUNT: usize = core::mem::size_of::<__m128i>() / core::mem::size_of::<i16>();
    let first_lane: *mut i16 = (m as *mut __m128i).cast();
    // SAFETY: the pointer comes from an exclusive reference, so it is non-null,
    // valid for the whole vector for the returned lifetime, and not aliased.
    // `__m128i` is at least as strictly aligned as `i16`, the lane count covers
    // exactly the vector's bytes, and every bit pattern is a valid `i16`.
    unsafe { core::slice::from_raw_parts_mut(first_lane, LANE_COUNT) }
}
/// Bitwise select between two vectors.
///
/// For every bit position the result takes the bit of `t` where `cond` has a
/// 1 and the bit of `f` where `cond` has a 0.
///
/// # Safety
///
/// The executing CPU must support SSE2.
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn blend_tf_sse2(cond: __m128i, t: __m128i, f: __m128i) -> __m128i {
    // Bits in which the two candidates disagree.
    let differing = _mm_xor_si128(f, t);
    // Keep only the disagreements the mask asks us to resolve in favour of `t`.
    let flips = _mm_and_si128(differing, cond);
    // Flipping those bits in `f` turns them into the bits of `t`.
    _mm_xor_si128(f, flips)
}
/// Lane-wise absolute value of eight `i16` values.
///
/// Negation is computed with wrapping subtraction from zero, so a lane holding
/// `i16::MIN` is returned unchanged.
///
/// # Safety
///
/// The executing CPU must support SSE2.
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn i16_abs_sse2(x: __m128i) -> __m128i {
    let negated = _mm_sub_epi16(ZEROED, x);
    // All-ones in every lane where `0 > x`, i.e. where `x` is negative.
    let sign_mask = _mm_cmpgt_epi16(ZEROED, x);
    blend_tf_sse2(sign_mask, negated, x)
}
/// Lane-wise signed `a <= b` over eight `i16` values.
///
/// Each result lane is all-ones when the comparison holds and zero otherwise.
///
/// # Safety
///
/// The executing CPU must support SSE2.
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn i16_le_sse2(a: __m128i, b: __m128i) -> __m128i {
    // SSE2 has no direct "less or equal" compare, so combine the two cases.
    let equal = _mm_cmpeq_epi16(a, b);
    let strictly_less = _mm_cmpgt_epi16(b, a);
    _mm_or_si128(strictly_less, equal)
}
/// Reconstructs a row in place by adding each pixel's left neighbour to it.
///
/// The row is processed in pixels of `BYTES_PER_PIXEL` bytes. Every byte is
/// replaced by the wrapping sum of itself and the corresponding byte of the
/// already-reconstructed pixel to its left; the first pixel sees zeros.
///
/// # Panics
///
/// Panics if `BYTES_PER_PIXEL` is greater than 8. With debug assertions
/// enabled it also panics if the row length is not a multiple of
/// `BYTES_PER_PIXEL`.
///
/// # Safety
///
/// The executing CPU must support SSE2.
#[target_feature(enable = "sse2")]
pub unsafe fn recon_sub<const BYTES_PER_PIXEL: usize>(filtered_row: &mut [u8]) {
    assert!(BYTES_PER_PIXEL <= 8);
    debug_assert_eq!(filtered_row.len() % BYTES_PER_PIXEL, 0);

    // The reconstructed pixel to the left of the one being processed.
    let mut left: __m128i = ZEROED;
    for pixel in filtered_row.chunks_exact_mut(BYTES_PER_PIXEL) {
        // Stage the pixel in the low bytes of a vector; the unused bytes stay zero.
        let mut current: __m128i = ZEROED;
        m128i_as_mut_u8s(&mut current)[..BYTES_PER_PIXEL].copy_from_slice(pixel);

        current = unsafe { _mm_add_epi8(current, left) };

        pixel.copy_from_slice(&m128i_as_mut_u8s(&mut current)[..BYTES_PER_PIXEL]);
        left = current;
    }
}
/// Reconstructs a row in place by adding the byte directly above to each byte.
///
/// Every byte of `filtered_row` is replaced by the wrapping sum of itself and
/// the byte at the same index in `previous_row`.
///
/// # Panics
///
/// With debug assertions enabled, panics if the two rows differ in length.
///
/// # Safety
///
/// The executing CPU must support SSE2.
#[target_feature(enable = "sse2")]
pub unsafe fn recon_up(filtered_row: &mut [u8], previous_row: &[u8]) {
    debug_assert_eq!(filtered_row.len(), previous_row.len());
    for (current, &above) in filtered_row.iter_mut().zip(previous_row) {
        *current = current.wrapping_add(above);
    }
}
/// Reconstructs a row in place using the average of the left and upper pixels.
///
/// The rows are processed in pixels of `BYTES_PER_PIXEL` bytes. For every byte
/// the predictor is `(left + above) >> 1`, computed in 16-bit lanes so the sum
/// cannot overflow, where `left` is the already-reconstructed byte one pixel to
/// the left (zero for the first pixel) and `above` is the byte at the same
/// index in `previous_row`. The predictor is added to the byte with wrapping.
///
/// # Panics
///
/// Panics if `BYTES_PER_PIXEL` is greater than 8. With debug assertions
/// enabled it also panics if the row length is not a multiple of
/// `BYTES_PER_PIXEL` or if the two rows differ in length.
///
/// # Safety
///
/// The executing CPU must support SSE2.
#[target_feature(enable = "sse2")]
pub unsafe fn recon_average<const BYTES_PER_PIXEL: usize>(
    filtered_row: &mut [u8],
    previous_row: &[u8],
) {
    assert!(BYTES_PER_PIXEL <= 8);
    debug_assert_eq!(filtered_row.len() % BYTES_PER_PIXEL, 0);
    debug_assert_eq!(filtered_row.len(), previous_row.len());

    // Reconstructed pixel to the left, widened to one i16 lane per byte.
    let mut left: __m128i = ZEROED;
    let pixels = filtered_row.chunks_exact_mut(BYTES_PER_PIXEL);
    let pixels_above = previous_row.chunks_exact(BYTES_PER_PIXEL);
    for (pixel, above_bytes) in pixels.zip(pixels_above) {
        // Current pixel stays packed as bytes in the low half of the vector.
        let mut current: __m128i = ZEROED;
        m128i_as_mut_u8s(&mut current)[..BYTES_PER_PIXEL].copy_from_slice(pixel);

        // Upper pixel is widened to i16 lanes so it can be summed with `left`.
        let mut above: __m128i = ZEROED;
        for (lane, &byte) in m128i_as_mut_i16s(&mut above).iter_mut().zip(above_bytes) {
            *lane = i16::from(byte);
        }

        let sum = _mm_add_epi16(left, above);
        let halved = _mm_srai_epi16(sum, 1);
        // Narrow the 16-bit averages back to bytes before the wrapping add.
        let predictor = _mm_packus_epi16(halved, ZEROED);
        current = _mm_add_epi8(current, predictor);

        pixel.copy_from_slice(&m128i_as_mut_u8s(&mut current)[..BYTES_PER_PIXEL]);
        left = _mm_unpacklo_epi8(current, ZEROED);
    }
}
/// Average reconstruction for a row that has no row above it.
///
/// Behaves like an average of the left and upper pixels where the upper pixel
/// is all zeros: each byte is replaced by the wrapping sum of itself and
/// `left >> 1`, where `left` is the already-reconstructed byte one pixel to
/// the left (zero for the first pixel).
///
/// # Panics
///
/// Panics if `BYTES_PER_PIXEL` is greater than 8. With debug assertions
/// enabled it also panics if the row length is not a multiple of
/// `BYTES_PER_PIXEL`.
///
/// # Safety
///
/// The executing CPU must support SSE2.
#[target_feature(enable = "sse2")]
pub unsafe fn recon_average_top<const BYTES_PER_PIXEL: usize>(filtered_row: &mut [u8]) {
    assert!(BYTES_PER_PIXEL <= 8);
    debug_assert_eq!(filtered_row.len() % BYTES_PER_PIXEL, 0);

    // Reconstructed pixel to the left, widened to one i16 lane per byte.
    let mut left: __m128i = ZEROED;
    for pixel in filtered_row.chunks_exact_mut(BYTES_PER_PIXEL) {
        let mut current: __m128i = ZEROED;
        m128i_as_mut_u8s(&mut current)[..BYTES_PER_PIXEL].copy_from_slice(pixel);

        let halved_left = _mm_srai_epi16(left, 1);
        // Narrow the 16-bit halves back to bytes before the wrapping add.
        let predictor = _mm_packus_epi16(halved_left, ZEROED);
        current = _mm_add_epi8(current, predictor);

        pixel.copy_from_slice(&m128i_as_mut_u8s(&mut current)[..BYTES_PER_PIXEL]);
        left = _mm_unpacklo_epi8(current, ZEROED);
    }
}
/// Reconstructs a row in place using the Paeth predictor.
///
/// The rows are processed in pixels of `BYTES_PER_PIXEL` bytes. For every byte
/// three neighbours are considered, all in 16-bit lanes: `left` (the
/// already-reconstructed byte one pixel to the left), `above` (same index in
/// `previous_row`) and `upper_left` (the byte of `previous_row` one pixel to
/// the left). With `estimate = left + above - upper_left`, the neighbour
/// closest to the estimate is chosen, ties being resolved in the order left,
/// above, upper-left. The chosen value is added to the byte with wrapping.
/// For the first pixel `left` and `upper_left` are zero.
///
/// # Panics
///
/// Panics if `BYTES_PER_PIXEL` is greater than 8. With debug assertions
/// enabled it also panics if the row length is not a multiple of
/// `BYTES_PER_PIXEL` or if the two rows differ in length.
///
/// # Safety
///
/// The executing CPU must support SSE2.
#[target_feature(enable = "sse2")]
pub unsafe fn recon_paeth<const BYTES_PER_PIXEL: usize>(
    filtered_row: &mut [u8],
    previous_row: &[u8],
) {
    assert!(BYTES_PER_PIXEL <= 8);
    debug_assert_eq!(filtered_row.len() % BYTES_PER_PIXEL, 0);
    debug_assert_eq!(filtered_row.len(), previous_row.len());

    // Both neighbours are kept widened to one i16 lane per byte.
    let mut left: __m128i = ZEROED;
    let mut upper_left: __m128i = ZEROED;
    let pixels = filtered_row.chunks_exact_mut(BYTES_PER_PIXEL);
    let pixels_above = previous_row.chunks_exact(BYTES_PER_PIXEL);
    for (pixel, above_bytes) in pixels.zip(pixels_above) {
        // Current pixel stays packed as bytes in the low half of the vector.
        let mut current: __m128i = ZEROED;
        m128i_as_mut_u8s(&mut current)[..BYTES_PER_PIXEL].copy_from_slice(pixel);

        // Upper pixel is widened to i16 lanes for the signed distance math.
        let mut above: __m128i = ZEROED;
        for (lane, &byte) in m128i_as_mut_i16s(&mut above).iter_mut().zip(above_bytes) {
            *lane = i16::from(byte);
        }

        let estimate = _mm_sub_epi16(_mm_add_epi16(left, above), upper_left);
        let dist_left = i16_abs_sse2(_mm_sub_epi16(estimate, left));
        let dist_above = i16_abs_sse2(_mm_sub_epi16(estimate, above));
        let dist_upper_left = i16_abs_sse2(_mm_sub_epi16(estimate, upper_left));

        // `left` wins when it is at least as close as both other candidates.
        let left_wins = _mm_and_si128(
            i16_le_sse2(dist_left, dist_above),
            i16_le_sse2(dist_left, dist_upper_left),
        );
        // Otherwise `above` wins ties against `upper_left`.
        let above_wins = i16_le_sse2(dist_above, dist_upper_left);
        let runner_up = blend_tf_sse2(above_wins, above, upper_left);
        let predictor16 = blend_tf_sse2(left_wins, left, runner_up);

        // Narrow the chosen 16-bit neighbours back to bytes before the add.
        let predictor = _mm_packus_epi16(predictor16, ZEROED);
        current = _mm_add_epi8(current, predictor);

        pixel.copy_from_slice(&m128i_as_mut_u8s(&mut current)[..BYTES_PER_PIXEL]);
        left = _mm_unpacklo_epi8(current, ZEROED);
        upper_left = above;
    }
}