use crate::filter_weights::FilterBounds;
use std::arch::aarch64::*;
/// Applies a 4-tap vertical filter to a strip of 16 adjacent `u16` samples.
///
/// Row `k` (for `k` in `0..4`) is read from `src` starting at
/// `src_stride * (bounds.start + k) + v_dx` and multiplied by lane `k` of
/// `weights`. Every accumulator starts from `initial_store`; the sums are
/// narrowed with a saturating right shift by `crate::support::PRECISION`,
/// limited to `v_max_colors` and written to `dst`.
///
/// Source reads are unchecked: the caller must guarantee that each of the
/// four rows has at least 16 readable elements at the computed offset.
#[inline]
#[target_feature(enable = "neon")]
fn convolve_4tap_16(
    dst: &mut [u16; 16],
    bounds: &FilterBounds,
    src: &[u16],
    src_stride: usize,
    weights: int16x4_t,
    initial_store: int32x4_t,
    v_max_colors: uint16x8_t,
    v_dx: usize,
) {
    // One 32-bit accumulator per group of four output samples.
    let mut acc0 = initial_store;
    let mut acc1 = initial_store;
    let mut acc2 = initial_store;
    let mut acc3 = initial_store;

    // Loads source row `bounds.start + $tap` and multiply-accumulates it with
    // weight lane `$tap`. A macro is used because the lane is a const generic.
    macro_rules! accumulate_row {
        ($tap:literal) => {{
            // SAFETY: relies on the caller's guarantee that 16 elements are
            // readable at this row offset.
            let (left, right) = unsafe {
                let row = src.get_unchecked((src_stride * (bounds.start + $tap) + v_dx)..);
                (
                    vreinterpretq_s16_u16(vld1q_u16(row.as_ptr())),
                    vreinterpretq_s16_u16(vld1q_u16(row.get_unchecked(8..).as_ptr())),
                )
            };
            acc0 = vmlal_lane_s16::<$tap>(acc0, vget_low_s16(left), weights);
            acc1 = vmlal_high_lane_s16::<$tap>(acc1, left, weights);
            acc2 = vmlal_lane_s16::<$tap>(acc2, vget_low_s16(right), weights);
            acc3 = vmlal_high_lane_s16::<$tap>(acc3, right, weights);
        }};
    }

    accumulate_row!(0);
    accumulate_row!(1);
    accumulate_row!(2);
    accumulate_row!(3);

    // Narrow back to 16 bits (saturating at zero for negative sums) and clamp
    // to the supplied maximum.
    let first_half = vminq_u16(
        vcombine_u16(
            vqshrun_n_s32::<{ crate::support::PRECISION }>(acc0),
            vqshrun_n_s32::<{ crate::support::PRECISION }>(acc1),
        ),
        v_max_colors,
    );
    let second_half = vminq_u16(
        vcombine_u16(
            vqshrun_n_s32::<{ crate::support::PRECISION }>(acc2),
            vqshrun_n_s32::<{ crate::support::PRECISION }>(acc3),
        ),
        v_max_colors,
    );

    // SAFETY: `dst` holds exactly 16 elements, so both 8-lane stores fit.
    unsafe {
        vst1q_u16(dst.as_mut_ptr(), first_half);
        vst1q_u16(dst[8..].as_mut_ptr(), second_half);
    }
}
/// Applies a 4-tap vertical filter to a strip of 8 adjacent `u16` samples.
///
/// Row `k` (for `k` in `0..4`) is read from `src` starting at
/// `src_stride * (bounds.start + k) + v_dx` and multiplied by lane `k` of
/// `weights`. Both accumulators start from `initial_store`; the sums are
/// narrowed with a saturating right shift by `crate::support::PRECISION`,
/// limited to `v_max_colors` and written to `dst`.
///
/// Source reads are unchecked: the caller must guarantee that each of the
/// four rows has at least 8 readable elements at the computed offset.
#[inline]
#[target_feature(enable = "neon")]
fn convolve_4tap_8(
    dst: &mut [u16; 8],
    bounds: &FilterBounds,
    src: &[u16],
    src_stride: usize,
    weights: int16x4_t,
    initial_store: int32x4_t,
    v_max_colors: uint16x8_t,
    v_dx: usize,
) {
    // Low and high halves of the 8-sample strip accumulate separately.
    let mut acc_lo = initial_store;
    let mut acc_hi = initial_store;

    // Loads source row `bounds.start + $tap` and multiply-accumulates it with
    // weight lane `$tap`. A macro is used because the lane is a const generic.
    macro_rules! accumulate_row {
        ($tap:literal) => {{
            // SAFETY: relies on the caller's guarantee that 8 elements are
            // readable at this row offset.
            let pixels = unsafe {
                let row = src.get_unchecked((src_stride * (bounds.start + $tap) + v_dx)..);
                vreinterpretq_s16_u16(vld1q_u16(row.as_ptr()))
            };
            acc_lo = vmlal_lane_s16::<$tap>(acc_lo, vget_low_s16(pixels), weights);
            acc_hi = vmlal_high_lane_s16::<$tap>(acc_hi, pixels, weights);
        }};
    }

    accumulate_row!(0);
    accumulate_row!(1);
    accumulate_row!(2);
    accumulate_row!(3);

    // Narrow back to 16 bits (saturating at zero for negative sums) and clamp
    // to the supplied maximum.
    let narrowed_lo = vqshrun_n_s32::<{ crate::support::PRECISION }>(acc_lo);
    let narrowed_hi = vqshrun_n_s32::<{ crate::support::PRECISION }>(acc_hi);
    let clamped = vminq_u16(vcombine_u16(narrowed_lo, narrowed_hi), v_max_colors);

    // SAFETY: `dst` holds exactly 8 elements, matching the 8-lane store.
    unsafe {
        vst1q_u16(dst.as_mut_ptr(), clamped);
    }
}
/// Applies a 4-tap vertical filter to a strip of 4 adjacent `u16` samples.
///
/// Row `k` (for `k` in `0..4`) is read from `src` starting at
/// `src_stride * (bounds.start + k) + v_dx` and multiplied by lane `k` of
/// `weights`. The accumulator starts from `initial_store`; the sum is narrowed
/// with a saturating right shift by `crate::support::PRECISION` (the same
/// constant the 8- and 16-wide 4-tap kernels use), limited to the low half of
/// `v_max_colors` and written to `dst`.
///
/// Source reads are unchecked: the caller must guarantee that each of the
/// four rows has at least 4 readable elements at the computed offset.
#[inline]
#[target_feature(enable = "neon")]
fn convolve_4tap_4(
    dst: &mut [u16; 4],
    bounds: &FilterBounds,
    src: &[u16],
    src_stride: usize,
    weights: int16x4_t,
    initial_store: int32x4_t,
    v_max_colors: uint16x8_t,
    v_dx: usize,
) {
    let mut store0 = initial_store;
    // SAFETY: relies on the caller's guarantee that 4 elements are readable
    // at each of the four row offsets computed below.
    unsafe {
        let src0 = src.get_unchecked((src_stride * bounds.start + v_dx)..);
        store0 = vmlal_lane_s16::<0>(
            store0,
            vreinterpret_s16_u16(vld1_u16(src0.as_ptr())),
            weights,
        );
        let src1 = src.get_unchecked((src_stride * (bounds.start + 1) + v_dx)..);
        store0 = vmlal_lane_s16::<1>(
            store0,
            vreinterpret_s16_u16(vld1_u16(src1.as_ptr())),
            weights,
        );
        let src2 = src.get_unchecked((src_stride * (bounds.start + 2) + v_dx)..);
        store0 = vmlal_lane_s16::<2>(
            store0,
            vreinterpret_s16_u16(vld1_u16(src2.as_ptr())),
            weights,
        );
        let src3 = src.get_unchecked((src_stride * (bounds.start + 3) + v_dx)..);
        store0 = vmlal_lane_s16::<3>(
            store0,
            vreinterpret_s16_u16(vld1_u16(src3.as_ptr())),
            weights,
        );
    }

    // Use the shared precision constant instead of a literal so this kernel
    // cannot drift from the wider 4-tap kernels.
    let narrowed = vqshrun_n_s32::<{ crate::support::PRECISION }>(store0);
    let clamped = vmin_u16(narrowed, vget_low_u16(v_max_colors));

    // SAFETY: `dst` holds exactly 4 elements, matching the 4-lane store.
    unsafe {
        vst1_u16(dst.as_mut_ptr(), clamped);
    }
}
/// Vertically convolves every 16-sample chunk of an output row.
///
/// Chunk `x` covers source columns `cx + x * 16 ..`; for each filter tap `j`
/// in `0..bounds.size` the row at `src_stride * (bounds.start + j)` is
/// multiplied by `weights[j]` and accumulated on top of a rounding constant
/// of `1 << 14`. Sums are narrowed with a saturating right shift by 15 and
/// clamped to `(1 << bit_depth) - 1`.
///
/// A filter of exactly four taps is delegated to `convolve_4tap_16`. Other
/// sizes are consumed four taps at a time, then two, then one.
///
/// Returns `cx` advanced by 16 for every processed chunk.
///
/// Source and weight reads are unchecked: the caller must guarantee that
/// `weights` holds at least `bounds.size` elements and that every addressed
/// source row has 16 readable elements at each chunk offset.
#[inline(never)]
#[target_feature(enable = "neon")]
fn convolve_chunks_16(
    chunks: &mut [[u16; 16]],
    bounds: &FilterBounds,
    src: &[u16],
    src_stride: usize,
    weights: &[i16],
    bit_depth: u32,
    cx: usize,
) -> usize {
    let max_colors = (1u32 << bit_depth) - 1;
    let mut cx = cx;
    let bounds_size = bounds.size;
    const PRECISION: i32 = 15;
    const ROUNDING_CONST: i32 = 1 << (PRECISION - 1);
    let initial_store = vdupq_n_s32(ROUNDING_CONST);
    let v_max_colors = vdupq_n_u16(max_colors as u16);
    let v_px = cx;

    // Fast path: exactly four taps fit one weight vector.
    if bounds_size == 4 {
        // SAFETY: relies on the caller's guarantee that `weights` holds at
        // least `bounds.size` (= 4) elements.
        let weights4 = unsafe { vld1_s16(weights.as_ptr()) };
        for (x, dst) in chunks.iter_mut().enumerate() {
            convolve_4tap_16(
                dst,
                bounds,
                src,
                src_stride,
                weights4,
                initial_store,
                v_max_colors,
                v_px + x * 16,
            );
            cx += 16;
        }
        return cx;
    }

    for (x, dst) in chunks.iter_mut().enumerate() {
        let mut store0 = initial_store;
        let mut store1 = initial_store;
        let mut store2 = initial_store;
        let mut store3 = initial_store;
        let v_dx = v_px + x * 16;
        let mut j = 0usize;

        // Four taps per iteration.
        while j + 4 <= bounds_size {
            let py = bounds.start + j;
            // SAFETY (all unsafe blocks in this loop): relies on the caller's
            // guarantee about readable source rows and the weights length.
            let src_ptr = unsafe { src.get_unchecked((src_stride * py + v_dx)..) };
            let weights = unsafe { vld1_s16(weights.get_unchecked(j..).as_ptr()) };
            let item_row0 = unsafe { vreinterpretq_s16_u16(vld1q_u16(src_ptr.as_ptr())) };
            let item_row1 =
                unsafe { vreinterpretq_s16_u16(vld1q_u16(src_ptr.get_unchecked(8..).as_ptr())) };
            store0 = vmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), weights);
            store1 = vmlal_high_lane_s16::<0>(store1, item_row0, weights);
            store2 = vmlal_lane_s16::<0>(store2, vget_low_s16(item_row1), weights);
            store3 = vmlal_high_lane_s16::<0>(store3, item_row1, weights);
            let item_row0 = unsafe {
                vreinterpretq_s16_u16(vld1q_u16(src_ptr.get_unchecked(src_stride..).as_ptr()))
            };
            let item_row1 = unsafe {
                vreinterpretq_s16_u16(vld1q_u16(src_ptr.get_unchecked(src_stride + 8..).as_ptr()))
            };
            store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row0), weights);
            store1 = vmlal_high_lane_s16::<1>(store1, item_row0, weights);
            store2 = vmlal_lane_s16::<1>(store2, vget_low_s16(item_row1), weights);
            store3 = vmlal_high_lane_s16::<1>(store3, item_row1, weights);
            let item_row0 = unsafe {
                vreinterpretq_s16_u16(vld1q_u16(src_ptr.get_unchecked(src_stride * 2..).as_ptr()))
            };
            let item_row1 = unsafe {
                vreinterpretq_s16_u16(vld1q_u16(
                    src_ptr.get_unchecked(src_stride * 2 + 8..).as_ptr(),
                ))
            };
            store0 = vmlal_lane_s16::<2>(store0, vget_low_s16(item_row0), weights);
            store1 = vmlal_high_lane_s16::<2>(store1, item_row0, weights);
            store2 = vmlal_lane_s16::<2>(store2, vget_low_s16(item_row1), weights);
            store3 = vmlal_high_lane_s16::<2>(store3, item_row1, weights);
            let item_row0 = unsafe {
                vreinterpretq_s16_u16(vld1q_u16(src_ptr.get_unchecked(src_stride * 3..).as_ptr()))
            };
            let item_row1 = unsafe {
                vreinterpretq_s16_u16(vld1q_u16(
                    src_ptr.get_unchecked(src_stride * 3 + 8..).as_ptr(),
                ))
            };
            store0 = vmlal_lane_s16::<3>(store0, vget_low_s16(item_row0), weights);
            store1 = vmlal_high_lane_s16::<3>(store1, item_row0, weights);
            store2 = vmlal_lane_s16::<3>(store2, vget_low_s16(item_row1), weights);
            store3 = vmlal_high_lane_s16::<3>(store3, item_row1, weights);
            j += 4;
        }

        // Two taps per iteration.
        while j + 2 <= bounds_size {
            let py = bounds.start + j;
            // SAFETY (all unsafe blocks in this loop): relies on the caller's
            // guarantee about readable source rows and the weights length.
            let src_ptr = unsafe { src.get_unchecked((src_stride * py + v_dx)..) };
            // Fetch two adjacent i16 weights as one 32-bit value into lanes
            // 0 and 1 (lanes 2 and 3 stay zero). `weights` is only guaranteed
            // to be 2-byte aligned, so the 32-bit read must be unaligned
            // rather than a typed `u32` load through a cast pointer.
            let weights = unsafe {
                let pair = weights
                    .get_unchecked(j..)
                    .as_ptr()
                    .cast::<u32>()
                    .read_unaligned();
                vreinterpret_s16_u32(vset_lane_u32::<0>(pair, vdup_n_u32(0)))
            };
            let item_row0 = unsafe { vreinterpretq_s16_u16(vld1q_u16(src_ptr.as_ptr())) };
            let item_row1 =
                unsafe { vreinterpretq_s16_u16(vld1q_u16(src_ptr.get_unchecked(8..).as_ptr())) };
            store0 = vmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), weights);
            store1 = vmlal_high_lane_s16::<0>(store1, item_row0, weights);
            store2 = vmlal_lane_s16::<0>(store2, vget_low_s16(item_row1), weights);
            store3 = vmlal_high_lane_s16::<0>(store3, item_row1, weights);
            let item_row0 = unsafe {
                vreinterpretq_s16_u16(vld1q_u16(src_ptr.get_unchecked(src_stride..).as_ptr()))
            };
            let item_row1 = unsafe {
                vreinterpretq_s16_u16(vld1q_u16(src_ptr.get_unchecked(src_stride + 8..).as_ptr()))
            };
            store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row0), weights);
            store1 = vmlal_high_lane_s16::<1>(store1, item_row0, weights);
            store2 = vmlal_lane_s16::<1>(store2, vget_low_s16(item_row1), weights);
            store3 = vmlal_high_lane_s16::<1>(store3, item_row1, weights);
            j += 2;
        }

        // Remaining taps, one at a time, with the weight broadcast to all lanes.
        let weights = &weights[j..bounds_size];
        let base_y = bounds.start + j;
        for (j, &k_weight) in weights.iter().enumerate() {
            let py = base_y + j;
            // SAFETY (all unsafe blocks in this loop): relies on the caller's
            // guarantee about readable source rows.
            let src_ptr = unsafe { src.get_unchecked((src_stride * py + v_dx)..) };
            let v_weight = vdupq_n_s16(k_weight);
            let item_row0 = unsafe { vreinterpretq_s16_u16(vld1q_u16(src_ptr.as_ptr())) };
            let item_row1 =
                unsafe { vreinterpretq_s16_u16(vld1q_u16(src_ptr.get_unchecked(8..).as_ptr())) };
            store0 = vmlal_s16(store0, vget_low_s16(item_row0), vget_low_s16(v_weight));
            store1 = vmlal_high_s16(store1, item_row0, v_weight);
            store2 = vmlal_s16(store2, vget_low_s16(item_row1), vget_low_s16(v_weight));
            store3 = vmlal_high_s16(store3, item_row1, v_weight);
        }

        // Narrow back to 16 bits (saturating at zero for negative sums) and
        // clamp to the bit-depth maximum.
        let store0 = vqshrun_n_s32::<PRECISION>(store0);
        let store1 = vqshrun_n_s32::<PRECISION>(store1);
        let store2 = vqshrun_n_s32::<PRECISION>(store2);
        let store3 = vqshrun_n_s32::<PRECISION>(store3);
        let item0 = vminq_u16(vcombine_u16(store0, store1), v_max_colors);
        let item1 = vminq_u16(vcombine_u16(store2, store3), v_max_colors);
        // SAFETY: `dst` holds exactly 16 elements, so both 8-lane stores fit.
        unsafe {
            vst1q_u16(dst.as_mut_ptr(), item0);
            vst1q_u16(dst.get_unchecked_mut(8..).as_mut_ptr(), item1);
        }
        cx += 16;
    }
    cx
}
/// Vertically convolves every 8-sample chunk of an output row.
///
/// Chunk `x` covers source columns `cx + x * 8 ..`; each entry `j` of
/// `weights` multiplies the row at `src_stride * (bounds.start + j)` and is
/// accumulated on top of a rounding constant of `1 << 14`. Sums are narrowed
/// with a saturating right shift by 15 and clamped to `(1 << bit_depth) - 1`.
///
/// A filter of exactly four taps (`bounds.size == 4`) is delegated to
/// `convolve_4tap_8`.
///
/// Returns `cx` advanced by 8 for every processed chunk.
///
/// Source reads are unchecked: the caller must guarantee that every addressed
/// source row has 8 readable elements at each chunk offset.
#[inline(never)]
#[target_feature(enable = "neon")]
fn convolve_chunks_8(
    chunks: &mut [[u16; 8]],
    bounds: &FilterBounds,
    src: &[u16],
    src_stride: usize,
    weights: &[i16],
    bit_depth: u32,
    cx: usize,
) -> usize {
    const PRECISION: i32 = 15;
    const ROUNDING_CONST: i32 = 1 << (PRECISION - 1);

    let peak = (1u32 << bit_depth) - 1;
    let rounding = vdupq_n_s32(ROUNDING_CONST);
    let v_peak = vdupq_n_u16(peak as u16);
    let first_column = cx;
    // Running end-of-processed-columns counter; this is the return value.
    let mut next_cx = cx;

    if bounds.size == 4 {
        // SAFETY: relies on the caller passing at least four weights when
        // `bounds.size` is 4.
        let taps = unsafe { vld1_s16(weights.as_ptr()) };
        for (chunk_index, out) in chunks.iter_mut().enumerate() {
            let column = first_column + chunk_index * 8;
            convolve_4tap_8(
                out, bounds, src, src_stride, taps, rounding, v_peak, column,
            );
            next_cx += 8;
        }
        return next_cx;
    }

    for (chunk_index, out) in chunks.iter_mut().enumerate() {
        let column = first_column + chunk_index * 8;
        let mut acc_lo = rounding;
        let mut acc_hi = rounding;

        for (tap, weight) in weights.iter().copied().enumerate() {
            let row_start = src_stride * (bounds.start + tap) + column;
            // SAFETY: relies on the caller's guarantee that 8 elements are
            // readable at this row offset.
            let pixels = unsafe {
                let row = src.get_unchecked(row_start..);
                vreinterpretq_s16_u16(vld1q_u16(row.as_ptr()))
            };
            let broadcast = vdupq_n_s16(weight);
            acc_lo = vmlal_s16(acc_lo, vget_low_s16(pixels), vget_low_s16(broadcast));
            acc_hi = vmlal_high_s16(acc_hi, pixels, broadcast);
        }

        // Narrow back to 16 bits and clamp to the bit-depth maximum.
        let narrowed_lo = vqshrun_n_s32::<PRECISION>(acc_lo);
        let narrowed_hi = vqshrun_n_s32::<PRECISION>(acc_hi);
        let clamped = vminq_u16(vcombine_u16(narrowed_lo, narrowed_hi), v_peak);

        // SAFETY: `out` holds exactly 8 elements, matching the 8-lane store.
        unsafe {
            vst1q_u16(out.as_mut_ptr(), clamped);
        }
        next_cx += 8;
    }
    next_cx
}
/// Vertically convolves every 4-sample chunk of an output row.
///
/// Chunk `x` covers source columns `cx + x * 4 ..`; each entry `j` of
/// `weights` multiplies the row at `src_stride * (bounds.start + j)` and is
/// accumulated on top of a rounding constant of `1 << 14`. Sums are narrowed
/// with a saturating right shift by 15 and clamped to `(1 << bit_depth) - 1`.
///
/// A filter of exactly four taps (`bounds.size == 4`) is delegated to
/// `convolve_4tap_4`.
///
/// Returns `cx` advanced by 4 for every processed chunk.
///
/// Source reads are unchecked: the caller must guarantee that every addressed
/// source row has 4 readable elements at each chunk offset.
#[inline(never)]
#[target_feature(enable = "neon")]
fn convolve_chunks_4(
    chunks: &mut [[u16; 4]],
    bounds: &FilterBounds,
    src: &[u16],
    src_stride: usize,
    weights: &[i16],
    bit_depth: u32,
    cx: usize,
) -> usize {
    const PRECISION: i32 = 15;
    const ROUNDING_CONST: i32 = 1 << (PRECISION - 1);

    let peak = (1u32 << bit_depth) - 1;
    let rounding = vdupq_n_s32(ROUNDING_CONST);
    let v_peak = vdupq_n_u16(peak as u16);
    let first_column = cx;
    // Running end-of-processed-columns counter; this is the return value.
    let mut next_cx = cx;

    if bounds.size == 4 {
        // SAFETY: relies on the caller passing at least four weights when
        // `bounds.size` is 4.
        let taps = unsafe { vld1_s16(weights.as_ptr()) };
        for (chunk_index, out) in chunks.iter_mut().enumerate() {
            let column = first_column + chunk_index * 4;
            convolve_4tap_4(
                out, bounds, src, src_stride, taps, rounding, v_peak, column,
            );
            next_cx += 4;
        }
        return next_cx;
    }

    for (chunk_index, out) in chunks.iter_mut().enumerate() {
        let column = first_column + chunk_index * 4;
        let mut acc = rounding;

        for (tap, weight) in weights.iter().copied().enumerate() {
            let row_start = src_stride * (bounds.start + tap) + column;
            // SAFETY: relies on the caller's guarantee that 4 elements are
            // readable at this row offset.
            let pixels = unsafe {
                let row = src.get_unchecked(row_start..);
                vreinterpret_s16_u16(vld1_u16(row.as_ptr()))
            };
            acc = vmlal_s16(acc, pixels, vdup_n_s16(weight));
        }

        // Narrow back to 16 bits and clamp to the bit-depth maximum.
        let narrowed = vqshrun_n_s32::<PRECISION>(acc);
        let clamped = vmin_u16(narrowed, vget_low_u16(v_peak));

        // SAFETY: `out` holds exactly 4 elements, matching the 4-lane store.
        unsafe {
            vst1_u16(out.as_mut_ptr(), clamped);
        }
        next_cx += 4;
    }
    next_cx
}
/// Vertically convolves `src` rows into one output row `dst` using 16-bit
/// fixed-point weights (NEON implementation).
///
/// For every output column `x` in `0..dst.len()` the result is
/// `sum(src[src_stride * (bounds.start + j) + x] * weights[j])` over
/// `j` in `0..bounds.size`, plus a rounding constant of `1 << 14`, shifted
/// right by 15 and clamped to `0..=(1 << bit_depth) - 1`.
///
/// The row is processed in 16-, 8- and 4-sample SIMD chunks, followed by a
/// scalar loop for the last (at most three) columns.
///
/// The first parameter is unused.
///
/// # Panics
///
/// Panics if `weights` holds fewer than `bounds.size` elements. In debug
/// builds it also panics if `src` is too short for the reads described above.
///
/// Source reads are otherwise unchecked: in release builds the caller must
/// guarantee `src` is long enough.
pub(crate) fn convolve_column_lb_u16(
    _: usize,
    bounds: &FilterBounds,
    src: &[u16],
    dst: &mut [u16],
    src_stride: usize,
    weights: &[i16],
    bit_depth: u32,
) {
    const PRECISION: i32 = 15;
    const ROUNDING: i32 = 1 << (PRECISION - 1);

    let max_colors = (1u32 << bit_depth) - 1;
    let bounds_size = bounds.size;
    let weights = &weights[..bounds_size];

    // The kernels below index `src` without bounds checks; state the required
    // length once so violations are caught in debug builds.
    debug_assert!(
        dst.is_empty()
            || bounds_size == 0
            || src.len() >= src_stride * (bounds.start + bounds_size - 1) + dst.len(),
        "source buffer is too short for the requested column convolution"
    );

    let (chunks16, rem) = dst.as_chunks_mut::<16>();
    // SAFETY: the callee requires NEON, which the original code likewise
    // assumes is available here; `weights` has exactly `bounds.size` elements
    // and `src` is assumed long enough (see the debug assertion above).
    let cx = unsafe { convolve_chunks_16(chunks16, bounds, src, src_stride, weights, bit_depth, 0) };

    let (chunks8, rem) = rem.as_chunks_mut::<8>();
    // SAFETY: same preconditions as for the 16-wide kernel.
    let cx = unsafe { convolve_chunks_8(chunks8, bounds, src, src_stride, weights, bit_depth, cx) };

    let (chunks4, tail) = rem.as_chunks_mut::<4>();
    // SAFETY: same preconditions as for the 16-wide kernel.
    let cx = unsafe { convolve_chunks_4(chunks4, bounds, src, src_stride, weights, bit_depth, cx) };

    // Scalar tail: fewer than four columns remain.
    for (x, dst) in tail.iter_mut().enumerate() {
        let v_px = cx + x;
        let mut store0 = ROUNDING;
        for (j, &k_weight) in weights.iter().enumerate() {
            let offset = src_stride * (bounds.start + j) + v_px;
            // SAFETY: `offset` is within `src` under the length precondition
            // asserted above.
            let px = unsafe { *src.get_unchecked(offset) };
            // Wrapping arithmetic mirrors the non-trapping SIMD accumulators.
            store0 = store0.wrapping_add(i32::from(px).wrapping_mul(i32::from(k_weight)));
        }
        *dst = (store0 >> PRECISION).clamp(0, max_colors as i32) as u16;
    }
}