use core::arch::x86_64::*;
use crate::convolution::optimisations::Normalizer16;
use crate::pixels::U8x2;
use crate::{simd_utils, ImageView, ImageViewMut};
/// Horizontally convolves a two-channel 8-bit (`U8x2`) image with the SSE4.1 kernels.
///
/// Rows are processed in groups of four by `horiz_convolution_four_rows`; the
/// trailing `dst_view.height() % 4` rows are processed one by one by
/// `horiz_convolution_one_row`.
///
/// * `src_view` - source image; presumably destination row `y` is computed
///   from source row `y + offset` (depends on the row iterators of the view).
/// * `dst_view` - destination image, one output pixel per coefficient chunk.
/// * `offset` - index of the first source row to read.
/// * `normalizer` - fixed-point coefficient chunks, one chunk per destination column.
///
/// NOTE(review): no CPU feature check happens here; the caller is expected to
/// have selected this implementation only when SSE4.1 is available.
#[inline]
pub(crate) fn horiz_convolution(
    src_view: &impl ImageView<Pixel = U8x2>,
    dst_view: &mut impl ImageViewMut<Pixel = U8x2>,
    offset: u32,
    normalizer: &Normalizer16,
) {
    let height = dst_view.height();

    // Bulk of the image: four rows per kernel call.
    let row_quads = src_view
        .iter_4_rows(offset, height + offset)
        .zip(dst_view.iter_4_rows_mut());
    for (src_quad, dst_quad) in row_quads {
        // SAFETY: relies on the caller having verified SSE4.1 support and on
        // `normalizer` being built for the widths of these views; neither is
        // checked in this function.
        unsafe {
            horiz_convolution_four_rows(src_quad, dst_quad, normalizer);
        }
    }

    // Index of the first row that did not fit into a group of four.
    let first_leftover = height / 4 * 4;
    let leftover_rows = src_view
        .iter_rows(first_leftover + offset)
        .zip(dst_view.iter_rows_mut(first_leftover));
    for (src_line, dst_line) in leftover_rows {
        // SAFETY: same preconditions as for the four-row kernel above.
        unsafe {
            horiz_convolution_one_row(src_line, dst_line, normalizer);
        }
    }
}
/// Convolves four source rows into four destination rows in lock-step.
///
/// For each coefficient chunk of `normalizer` (the chunk index is the
/// destination column) the source pixels starting at `chunk.start` are
/// multiplied by the chunk's `i16` coefficients and accumulated in 32-bit
/// lanes. Coefficients are consumed 8, then 4, then 2, then 1 at a time.
///
/// Accumulator layout per row: the low 64 bits hold two partial sums of the
/// first pixel component (`L`), the high 64 bits two partial sums of the
/// second component (`A`); `set_dst_pixel` folds them together.
///
/// # Safety
///
/// - The CPU must support SSE4.1.
/// - `normalizer.chunks().len()` must not exceed the length of any row in
///   `dst_rows`, because destination pixels are written without bounds checks.
/// - Presumably every row in `src_rows` must contain at least
///   `chunk.start + chunk.values().len()` pixels for every chunk: the
///   `simd_utils` load helpers are assumed to be unchecked (confirm there).
/// - `normalizer.precision()` must be at least 2 (it is used as `precision - 2`).
#[inline]
#[target_feature(enable = "sse4.1")]
unsafe fn horiz_convolution_four_rows(
    src_rows: [&[U8x2]; 4],
    dst_rows: [&mut [U8x2]; 4],
    normalizer: &Normalizer16,
) {
    let precision = normalizer.precision();
    // Two 32-bit lanes are summed per component at the end, so every lane
    // starts with half of the total `1 << (precision - 1)` bias.
    let bias = _mm_set1_epi32(1 << (precision - 2));

    // Source bytes are interleaved as L0 A0 L1 A1 ... (byte 0, 1, 2, 3, ...).
    // Mask for pixels 0..4, widening u8 -> i16 (index -1 yields a zero byte):
    //   low  64 bits: L0 L1 L2 L3   (bytes 0, 2, 4, 6)
    //   high 64 bits: A0 A1 A2 A3   (bytes 1, 3, 5, 7)
    #[rustfmt::skip]
    let sh_pixels_0_3 = _mm_set_epi8(
        -1, 7, -1, 5, -1, 3, -1, 1,
        -1, 6, -1, 4, -1, 2, -1, 0,
    );
    // Same arrangement for pixels 4..8 (bytes 8..16).
    #[rustfmt::skip]
    let sh_pixels_4_7 = _mm_set_epi8(
        -1, 15, -1, 13, -1, 11, -1, 9,
        -1, 14, -1, 12, -1, 10, -1, 8,
    );

    for (dst_x, chunk) in normalizer.chunks().iter().enumerate() {
        let mut x = chunk.start as usize;
        let weights = chunk.values();
        let mut acc: [__m128i; 4] = [bias; 4];

        // Eight coefficients / eight pixels per step.
        let octets = weights.chunks_exact(8);
        let tail_lt8 = octets.remainder();
        for k8 in octets {
            let k_first = simd_utils::ptr_i16_to_set1_epi64x(k8, 0);
            let k_second = simd_utils::ptr_i16_to_set1_epi64x(k8, 4);
            for (sum, &row) in acc.iter_mut().zip(src_rows.iter()) {
                let px = simd_utils::loadu_si128(row, x);
                let part_a = _mm_madd_epi16(_mm_shuffle_epi8(px, sh_pixels_0_3), k_first);
                let part_b = _mm_madd_epi16(_mm_shuffle_epi8(px, sh_pixels_4_7), k_second);
                *sum = _mm_add_epi32(_mm_add_epi32(*sum, part_a), part_b);
            }
            x += 8;
        }

        // Four coefficients / four pixels per step.
        let quads = tail_lt8.chunks_exact(4);
        let tail_lt4 = quads.remainder();
        for k4 in quads {
            let k_vec = simd_utils::ptr_i16_to_set1_epi64x(k4, 0);
            for (sum, &row) in acc.iter_mut().zip(src_rows.iter()) {
                let px = simd_utils::loadl_epi64(row, x);
                let product = _mm_madd_epi16(_mm_shuffle_epi8(px, sh_pixels_0_3), k_vec);
                *sum = _mm_add_epi32(*sum, product);
            }
            x += 4;
        }

        // Two coefficients / two pixels per step.
        let pairs = tail_lt4.chunks_exact(2);
        let tail_lt2 = pairs.remainder();
        for k2 in pairs {
            let k_vec = simd_utils::mm_load_and_clone_i16x2(k2);
            for (sum, &row) in acc.iter_mut().zip(src_rows.iter()) {
                let px = simd_utils::loadl_epi32(row, x);
                let product = _mm_madd_epi16(_mm_shuffle_epi8(px, sh_pixels_0_3), k_vec);
                *sum = _mm_add_epi32(*sum, product);
            }
            x += 2;
        }

        // At most one coefficient is left.
        if let Some(&last) = tail_lt2.first() {
            // The coefficient is sign-extended into every 32-bit lane; only the
            // low i16 of each lane meets a non-zero pixel value in the multiply.
            let k_vec = _mm_set1_epi32(last as i32);
            for (sum, &row) in acc.iter_mut().zip(src_rows.iter()) {
                let px = simd_utils::loadl_epi16(row, x);
                let product = _mm_madd_epi16(_mm_shuffle_epi8(px, sh_pixels_0_3), k_vec);
                *sum = _mm_add_epi32(*sum, product);
            }
        }

        for (i, &sum) in acc.iter().enumerate() {
            set_dst_pixel(sum, dst_rows[i], dst_x, normalizer);
        }
    }
}
/// Reduces one accumulator register to a destination pixel and stores it.
///
/// The two 32-bit lanes in the low 64 bits of `raw` are added (saturating) to
/// form the first component, the two lanes in the high 64 bits form the
/// second component. Each sum is passed through `normalizer.clip` and the
/// pair is written to `d_row[dst_x]`.
///
/// # Safety
///
/// - The CPU must support SSE4.1.
/// - `dst_x` must be less than `d_row.len()`; the write is not bounds-checked.
#[inline]
#[target_feature(enable = "sse4.1")]
unsafe fn set_dst_pixel(raw: __m128i, d_row: &mut [U8x2], dst_x: usize, normalizer: &Normalizer16) {
    // Saturating sum of the upper and lower 32-bit halves of a packed 64-bit value.
    let fold_lanes = |packed: i64| -> i32 {
        let upper = (packed >> 32) as i32;
        let lower = (packed & 0xffffffff) as i32;
        upper.saturating_add(lower)
    };

    let l_sum = fold_lanes(_mm_extract_epi64::<0>(raw));
    let a_sum = fold_lanes(_mm_extract_epi64::<1>(raw));

    let l_out = normalizer.clip(l_sum);
    let a_out = normalizer.clip(a_sum);
    d_row.get_unchecked_mut(dst_x).0 = [l_out, a_out];
}
/// Convolves a single source row into a single destination row.
///
/// For each coefficient chunk of `normalizer` (the chunk index is the
/// destination column) the source pixels starting at `chunk.start` are
/// multiplied by the chunk's `i16` coefficients. Coefficients are consumed
/// 8, then 4 at a time with SIMD loads; the last 1..=3 are gathered with
/// scalar reads into one more multiply-add.
///
/// Accumulator layout: 32-bit lanes 0 and 2 hold partial sums of the first
/// pixel component (`L`), lanes 1 and 3 partial sums of the second (`A`).
///
/// # Safety
///
/// - The CPU must support SSE4.1.
/// - `normalizer.chunks().len()` must not exceed `dst_row.len()`; destination
///   pixels are written without bounds checks.
/// - `chunk.start + chunk.values().len()` must not exceed `src_row.len()` for
///   any chunk; the tail reads are unchecked and the `simd_utils` loads are
///   presumably unchecked as well (confirm in that module).
/// - `normalizer.precision()` must be at least 2 (it is used as `precision - 2`).
#[inline]
#[target_feature(enable = "sse4.1")]
unsafe fn horiz_convolution_one_row(
    src_row: &[U8x2],
    dst_row: &mut [U8x2],
    normalizer: &Normalizer16,
) {
    let precision = normalizer.precision();

    // Source bytes are interleaved as L0 A0 L1 A1 ... (byte 0, 1, 2, 3, ...).
    // Pixels 0..4 widened u8 -> i16 (index -1 yields a zero byte), as i16 lanes
    // from low to high: L0 L1 | A0 A1 | L2 L3 | A2 A3.
    // Used both for the first half of an 8-pixel load and for a 4-pixel load.
    #[rustfmt::skip]
    let px_mask_0_3 = _mm_set_epi8(
        -1, 7, -1, 5, -1, 6, -1, 4,
        -1, 3, -1, 1, -1, 2, -1, 0,
    );
    // Coefficients 0..4 duplicated to line up with the mask above:
    // C0 C1 | C0 C1 | C2 C3 | C2 C3.
    #[rustfmt::skip]
    let k_mask_0_3 = _mm_set_epi8(
        7, 6, 5, 4, 7, 6, 5, 4,
        3, 2, 1, 0, 3, 2, 1, 0,
    );
    // Pixels 4..8: L4 L5 | A4 A5 | L6 L7 | A6 A7.
    #[rustfmt::skip]
    let px_mask_4_7 = _mm_set_epi8(
        -1, 15, -1, 13, -1, 14, -1, 12,
        -1, 11, -1, 9, -1, 10, -1, 8,
    );
    // Coefficients 4..8: C4 C5 | C4 C5 | C6 C7 | C6 C7.
    #[rustfmt::skip]
    let k_mask_4_7 = _mm_set_epi8(
        15, 14, 13, 12, 15, 14, 13, 12,
        11, 10, 9, 8, 11, 10, 9, 8,
    );

    for (dst_x, chunk) in normalizer.chunks().iter().enumerate() {
        let mut x = chunk.start as usize;
        let weights = chunk.values();

        // Two lanes are summed per component at the end, so every lane starts
        // with half of the total `1 << (precision - 1)` bias.
        let mut acc = _mm_set1_epi32(1 << (precision - 2));

        // Eight coefficients / eight pixels per step.
        let octets = weights.chunks_exact(8);
        let tail_lt8 = octets.remainder();
        for k8 in octets {
            let k_raw = simd_utils::loadu_si128(k8, 0);
            let px_raw = simd_utils::loadu_si128(src_row, x);

            let first_half = _mm_madd_epi16(
                _mm_shuffle_epi8(px_raw, px_mask_0_3),
                _mm_shuffle_epi8(k_raw, k_mask_0_3),
            );
            acc = _mm_add_epi32(acc, first_half);

            let second_half = _mm_madd_epi16(
                _mm_shuffle_epi8(px_raw, px_mask_4_7),
                _mm_shuffle_epi8(k_raw, k_mask_4_7),
            );
            acc = _mm_add_epi32(acc, second_half);

            x += 8;
        }

        // Four coefficients / four pixels per step.
        let quads = tail_lt8.chunks_exact(4);
        let tail_lt4 = quads.remainder();
        for k4 in quads {
            let (k0, k1, k2, k3) = (k4[0], k4[1], k4[2], k4[3]);
            // Lanes low -> high: k0 k1 | k0 k1 | k2 k3 | k2 k3.
            let k_vec = _mm_set_epi16(k3, k2, k3, k2, k1, k0, k1, k0);
            let px_raw = simd_utils::loadl_epi64(src_row, x);
            let product = _mm_madd_epi16(_mm_shuffle_epi8(px_raw, px_mask_0_3), k_vec);
            acc = _mm_add_epi32(acc, product);
            x += 4;
        }

        // One to three coefficients left: gather them with scalar reads.
        if !tail_lt4.is_empty() {
            let mut l_vals: [i16; 3] = [0; 3];
            let mut a_vals: [i16; 3] = [0; 3];
            let mut k_vals: [i16; 3] = [0; 3];
            for (i, &weight) in tail_lt4.iter().enumerate() {
                let [l_byte, a_byte]: [u8; 2] = src_row.get_unchecked(x + i).0;
                l_vals[i] = l_byte as i16;
                a_vals[i] = a_byte as i16;
                k_vals[i] = weight;
            }

            // Lanes low -> high: L0 L1 | A0 A1 | L2 0 | A2 0.
            let px_vec = _mm_set_epi16(
                0, a_vals[2], 0, l_vals[2], a_vals[1], a_vals[0], l_vals[1], l_vals[0],
            );
            // Lanes low -> high: k0 k1 | k0 k1 | k2 0 | k2 0.
            let k_vec = _mm_set_epi16(
                0, k_vals[2], 0, k_vals[2], k_vals[1], k_vals[0], k_vals[1], k_vals[0],
            );
            acc = _mm_add_epi32(acc, _mm_madd_epi16(px_vec, k_vec));
        }

        // Fold lanes 1 + 3 into the second component and lanes 0 + 2 into the first.
        let low_half = _mm_extract_epi64::<0>(acc);
        let high_half = _mm_extract_epi64::<1>(acc);
        let a_sum = ((low_half >> 32) as i32).saturating_add((high_half >> 32) as i32);
        let l_sum =
            ((low_half & 0xffffffff) as i32).saturating_add((high_half & 0xffffffff) as i32);

        let a_out = normalizer.clip(a_sum);
        let l_out = normalizer.clip(l_sum);
        dst_row.get_unchecked_mut(dst_x).0 = [l_out, a_out];
    }
}