#![allow(deprecated)] #![cfg_attr(not(feature = "unchecked"), forbid(unsafe_code))]
#![cfg_attr(feature = "unchecked", deny(unsafe_code))]
#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::*;
#[cfg(target_arch = "aarch64")]
use archmage::{Arm64, SimdToken, arcane};
#[cfg(target_arch = "aarch64")]
use safe_unaligned_simd::aarch64 as safe_simd;
use crate::include::common::bitdepth::BitDepth;
use crate::include::common::bitdepth::DynPixel;
use crate::include::dav1d::headers::Rav1dFilterMode;
#[cfg(target_arch = "aarch64")]
use crate::include::dav1d::headers::Rav1dPixelLayoutSubSampled;
use crate::include::dav1d::picture::PicOffset;
use crate::src::ffi_safe::FFISafe;
use crate::src::internal::COMPINTER_LEN;
use crate::src::internal::SCRATCH_INTER_INTRA_BUF_LEN;
use crate::src::internal::SCRATCH_LAP_LEN;
#[cfg(target_arch = "aarch64")]
use crate::src::internal::SEG_MASK_LEN;
use crate::src::levels::Filter2d;
use crate::src::safe_simd::pixel_access::Flex;
use crate::src::strided::Strided as _;
use crate::src::tables::dav1d_mc_subpel_filters;
#[cfg(target_arch = "aarch64")]
#[arcane]
fn avg_8bpc_inner(
_token: Arm64,
dst: &mut [u8],
dst_stride: usize,
tmp1: &[i16],
tmp2: &[i16],
w: usize,
h: usize,
) {
let mut dst = dst.flex_mut();
let tmp1 = tmp1.flex();
let tmp2 = tmp2.flex();
for row in 0..h {
let tmp1_row = &tmp1[row * w..][..w];
let tmp2_row = &tmp2[row * w..][..w];
let dst_row = &mut dst[row * dst_stride..][..w];
let mut col = 0;
while col + 16 <= w {
let t1_lo = safe_simd::vld1q_s16(tmp1_row[col..][..8].try_into().unwrap());
let t1_hi = safe_simd::vld1q_s16(tmp1_row[col + 8..][..8].try_into().unwrap());
let t2_lo = safe_simd::vld1q_s16(tmp2_row[col..][..8].try_into().unwrap());
let t2_hi = safe_simd::vld1q_s16(tmp2_row[col + 8..][..8].try_into().unwrap());
let sum_lo = vaddq_s16(t1_lo, t2_lo);
let sum_hi = vaddq_s16(t1_hi, t2_hi);
let avg_lo = vqrdmulhq_n_s16(sum_lo, 2048);
let avg_hi = vqrdmulhq_n_s16(sum_hi, 2048);
let packed_lo = vqmovun_s16(avg_lo);
let packed_hi = vqmovun_s16(avg_hi);
let result = vcombine_u8(packed_lo, packed_hi);
let dst_arr: &mut [u8; 16] = (&mut dst_row[col..col + 16]).try_into().unwrap();
safe_simd::vst1q_u8(dst_arr, result);
col += 16;
}
while col + 8 <= w {
let t1 = safe_simd::vld1q_s16(tmp1_row[col..][..8].try_into().unwrap());
let t2 = safe_simd::vld1q_s16(tmp2_row[col..][..8].try_into().unwrap());
let sum = vaddq_s16(t1, t2);
let avg = vqrdmulhq_n_s16(sum, 2048);
let packed = vqmovun_s16(avg);
let dst_arr: &mut [u8; 8] = (&mut dst_row[col..col + 8]).try_into().unwrap();
safe_simd::vst1_u8(dst_arr, packed);
col += 8;
}
while col < w {
let sum = tmp1_row[col] as i32 + tmp2_row[col] as i32;
let avg = ((sum * 1024 + 16384) >> 15).clamp(0, 255) as u8;
dst_row[col] = avg;
col += 1;
}
}
}
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn avg_8bpc_neon(
dst_ptr: *mut DynPixel,
dst_stride: isize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
_bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
) {
let w = w as usize;
let h = h as usize;
let (token, dst) = unsafe {
let token = unsafe { Arm64::forge_token_dangerously() };
let dst = std::slice::from_raw_parts_mut(dst_ptr as *mut u8, h * dst_stride.unsigned_abs());
(token, dst)
};
avg_8bpc_inner(
token,
dst,
dst_stride as usize,
tmp1.as_slice(),
tmp2.as_slice(),
w,
h,
);
}
#[cfg(target_arch = "aarch64")]
#[arcane]
fn avg_16bpc_inner(
_token: Arm64,
dst: &mut [u16],
dst_stride: usize,
tmp1: &[i16],
tmp2: &[i16],
w: usize,
h: usize,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let tmp1 = tmp1.flex();
let tmp2 = tmp2.flex();
let intermediate_bits = 4;
for row in 0..h {
let tmp1_row = &tmp1[row * w..][..w];
let tmp2_row = &tmp2[row * w..][..w];
let dst_row = &mut dst[row * dst_stride..][..w];
let mut col = 0;
while col + 8 <= w {
let t1 = safe_simd::vld1q_s16(tmp1_row[col..][..8].try_into().unwrap());
let t2 = safe_simd::vld1q_s16(tmp2_row[col..][..8].try_into().unwrap());
let t1_lo = vmovl_s16(vget_low_s16(t1));
let t1_hi = vmovl_s16(vget_high_s16(t1));
let t2_lo = vmovl_s16(vget_low_s16(t2));
let t2_hi = vmovl_s16(vget_high_s16(t2));
let sum_lo = vaddq_s32(t1_lo, t2_lo);
let sum_hi = vaddq_s32(t1_hi, t2_hi);
let rnd = vdupq_n_s32(1 << intermediate_bits);
let sum_lo_rnd = vaddq_s32(sum_lo, rnd);
let sum_hi_rnd = vaddq_s32(sum_hi, rnd);
let avg_lo = vshrq_n_s32::<5>(sum_lo_rnd);
let avg_hi = vshrq_n_s32::<5>(sum_hi_rnd);
let avg_narrow_lo = vqmovn_s32(avg_lo);
let avg_narrow_hi = vqmovn_s32(avg_hi);
let avg_16 = vcombine_s16(avg_narrow_lo, avg_narrow_hi);
let zero = vdupq_n_s16(0);
let max = vdupq_n_s16(bitdepth_max as i16);
let clamped = vmaxq_s16(vminq_s16(avg_16, max), zero);
let dst_arr: &mut [u16; 8] = (&mut dst_row[col..col + 8]).try_into().unwrap();
safe_simd::vst1q_u16(dst_arr, vreinterpretq_u16_s16(clamped));
col += 8;
}
while col < w {
let sum = tmp1_row[col] as i32 + tmp2_row[col] as i32;
let rnd = 1 << intermediate_bits;
let avg = ((sum + rnd) >> (intermediate_bits + 1)).clamp(0, bitdepth_max);
dst_row[col] = avg as u16;
col += 1;
}
}
}
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn avg_16bpc_neon(
dst_ptr: *mut DynPixel,
dst_stride: isize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
) {
let w = w as usize;
let h = h as usize;
let dst_stride_u16 = (dst_stride / 2) as usize;
let (token, dst) = unsafe {
let token = unsafe { Arm64::forge_token_dangerously() };
let dst = std::slice::from_raw_parts_mut(dst_ptr as *mut u16, h * dst_stride_u16);
(token, dst)
};
avg_16bpc_inner(
token,
dst,
dst_stride_u16,
tmp1.as_slice(),
tmp2.as_slice(),
w,
h,
bitdepth_max,
);
}
#[cfg(target_arch = "aarch64")]
#[arcane]
fn w_avg_8bpc_inner(
_token: Arm64,
dst: &mut [u8],
dst_stride: usize,
tmp1: &[i16],
tmp2: &[i16],
w: usize,
h: usize,
weight: i32,
) {
let mut dst = dst.flex_mut();
let tmp1 = tmp1.flex();
let tmp2 = tmp2.flex();
for row in 0..h {
let tmp1_row = &tmp1[row * w..][..w];
let tmp2_row = &tmp2[row * w..][..w];
let dst_row = &mut dst[row * dst_stride..][..w];
let mut col = 0;
while col + 8 <= w {
let t1 = safe_simd::vld1q_s16(tmp1_row[col..][..8].try_into().unwrap());
let t2 = safe_simd::vld1q_s16(tmp2_row[col..][..8].try_into().unwrap());
let diff = vsubq_s16(t1, t2);
let diff_lo = vmovl_s16(vget_low_s16(diff));
let diff_hi = vmovl_s16(vget_high_s16(diff));
let weight_vec = vdupq_n_s32(weight);
let weighted_lo = vmulq_s32(diff_lo, weight_vec);
let weighted_hi = vmulq_s32(diff_hi, weight_vec);
let rnd = vdupq_n_s32(8);
let shifted_lo = vshrq_n_s32::<4>(vaddq_s32(weighted_lo, rnd));
let shifted_hi = vshrq_n_s32::<4>(vaddq_s32(weighted_hi, rnd));
let shifted_16 = vcombine_s16(vmovn_s32(shifted_lo), vmovn_s32(shifted_hi));
let sum = vaddq_s16(shifted_16, t2);
let scaled = vqrdmulhq_n_s16(sum, 2048);
let packed = vqmovun_s16(scaled);
let dst_arr: &mut [u8; 8] = (&mut dst_row[col..col + 8]).try_into().unwrap();
safe_simd::vst1_u8(dst_arr, packed);
col += 8;
}
while col < w {
let diff = tmp1_row[col] as i32 - tmp2_row[col] as i32;
let weighted = ((diff * weight + 8) >> 4) + tmp2_row[col] as i32;
let scaled = ((weighted * 1024 + 16384) >> 15).clamp(0, 255);
dst_row[col] = scaled as u8;
col += 1;
}
}
}
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn w_avg_8bpc_neon(
dst_ptr: *mut DynPixel,
dst_stride: isize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
weight: i32,
_bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
) {
let w = w as usize;
let h = h as usize;
let (token, dst) = unsafe {
let token = unsafe { Arm64::forge_token_dangerously() };
let dst = std::slice::from_raw_parts_mut(dst_ptr as *mut u8, h * dst_stride.unsigned_abs());
(token, dst)
};
w_avg_8bpc_inner(
token,
dst,
dst_stride as usize,
tmp1.as_slice(),
tmp2.as_slice(),
w,
h,
weight,
);
}
#[cfg(target_arch = "aarch64")]
#[arcane]
fn w_avg_16bpc_inner(
_token: Arm64,
dst: &mut [u16],
dst_stride: usize,
tmp1: &[i16],
tmp2: &[i16],
w: usize,
h: usize,
weight: i32,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let tmp1 = tmp1.flex();
let tmp2 = tmp2.flex();
let intermediate_bits = 4;
for row in 0..h {
let tmp1_row = &tmp1[row * w..][..w];
let tmp2_row = &tmp2[row * w..][..w];
let dst_row = &mut dst[row * dst_stride..][..w];
let mut col = 0;
while col + 4 <= w {
let t1_16 = safe_simd::vld1_s16(tmp1_row[col..][..4].try_into().unwrap());
let t2_16 = safe_simd::vld1_s16(tmp2_row[col..][..4].try_into().unwrap());
let t1 = vmovl_s16(t1_16);
let t2 = vmovl_s16(t2_16);
let diff = vsubq_s32(t1, t2);
let weight_vec = vdupq_n_s32(weight);
let weighted = vmulq_s32(diff, weight_vec);
let rnd = vdupq_n_s32(8);
let shifted = vshrq_n_s32::<4>(vaddq_s32(weighted, rnd));
let sum = vaddq_s32(shifted, t2);
let rnd2 = vdupq_n_s32(1 << intermediate_bits);
let result = vshrq_n_s32::<5>(vaddq_s32(sum, rnd2));
let zero = vdupq_n_s32(0);
let max = vdupq_n_s32(bitdepth_max);
let clamped = vmaxq_s32(vminq_s32(result, max), zero);
let narrow = vmovn_s32(clamped);
let dst_arr: &mut [u16; 4] = (&mut dst_row[col..col + 4]).try_into().unwrap();
safe_simd::vst1_u16(dst_arr, vreinterpret_u16_s16(narrow));
col += 4;
}
while col < w {
let diff = tmp1_row[col] as i32 - tmp2_row[col] as i32;
let weighted = ((diff * weight + 8) >> 4) + tmp2_row[col] as i32;
let rnd = 1 << intermediate_bits;
let result = ((weighted + rnd) >> (intermediate_bits + 1)).clamp(0, bitdepth_max);
dst_row[col] = result as u16;
col += 1;
}
}
}
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn w_avg_16bpc_neon(
dst_ptr: *mut DynPixel,
dst_stride: isize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
weight: i32,
bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
) {
let w = w as usize;
let h = h as usize;
let dst_stride_u16 = (dst_stride / 2) as usize;
let (token, dst) = unsafe {
let token = unsafe { Arm64::forge_token_dangerously() };
let dst = std::slice::from_raw_parts_mut(dst_ptr as *mut u16, h * dst_stride_u16);
(token, dst)
};
w_avg_16bpc_inner(
token,
dst,
dst_stride_u16,
tmp1.as_slice(),
tmp2.as_slice(),
w,
h,
weight,
bitdepth_max,
);
}
#[cfg(target_arch = "aarch64")]
#[arcane]
fn mask_8bpc_inner(
_token: Arm64,
dst: &mut [u8],
dst_stride: usize,
tmp1: &[i16],
tmp2: &[i16],
w: usize,
h: usize,
mask: &[u8],
) {
let mut dst = dst.flex_mut();
let tmp1 = tmp1.flex();
let tmp2 = tmp2.flex();
let mask = mask.flex();
for row in 0..h {
let tmp1_row = &tmp1[row * w..][..w];
let tmp2_row = &tmp2[row * w..][..w];
let mask_row = &mask[row * w..][..w];
let dst_row = &mut dst[row * dst_stride..][..w];
let mut col = 0;
while col + 8 <= w {
let t1 = safe_simd::vld1q_s16(tmp1_row[col..][..8].try_into().unwrap());
let t2 = safe_simd::vld1q_s16(tmp2_row[col..][..8].try_into().unwrap());
let m = safe_simd::vld1_u8(mask_row[col..][..8].try_into().unwrap());
let m16 = vreinterpretq_s16_u16(vmovl_u8(m));
let diff = vsubq_s16(t1, t2);
let diff_lo = vmovl_s16(vget_low_s16(diff));
let diff_hi = vmovl_s16(vget_high_s16(diff));
let m_lo = vmovl_s16(vget_low_s16(m16));
let m_hi = vmovl_s16(vget_high_s16(m16));
let weighted_lo = vmulq_s32(diff_lo, m_lo);
let weighted_hi = vmulq_s32(diff_hi, m_hi);
let rnd = vdupq_n_s32(32);
let shifted_lo = vshrq_n_s32::<6>(vaddq_s32(weighted_lo, rnd));
let shifted_hi = vshrq_n_s32::<6>(vaddq_s32(weighted_hi, rnd));
let shifted_16 = vcombine_s16(vmovn_s32(shifted_lo), vmovn_s32(shifted_hi));
let sum = vaddq_s16(shifted_16, t2);
let scaled = vqrdmulhq_n_s16(sum, 2048);
let packed = vqmovun_s16(scaled);
let dst_arr: &mut [u8; 8] = (&mut dst_row[col..col + 8]).try_into().unwrap();
safe_simd::vst1_u8(dst_arr, packed);
col += 8;
}
while col < w {
let diff = tmp1_row[col] as i32 - tmp2_row[col] as i32;
let m = mask_row[col] as i32;
let weighted = ((diff * m + 32) >> 6) + tmp2_row[col] as i32;
let scaled = ((weighted * 1024 + 16384) >> 15).clamp(0, 255);
dst_row[col] = scaled as u8;
col += 1;
}
}
}
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn mask_8bpc_neon(
dst_ptr: *mut DynPixel,
dst_stride: isize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
mask_ptr: *const u8,
_bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
) {
let w = w as usize;
let h = h as usize;
let (token, dst, mask) = unsafe {
let token = unsafe { Arm64::forge_token_dangerously() };
let dst = std::slice::from_raw_parts_mut(dst_ptr as *mut u8, h * dst_stride.unsigned_abs());
let mask = std::slice::from_raw_parts(mask_ptr, w * h);
(token, dst, mask)
};
mask_8bpc_inner(
token,
dst,
dst_stride as usize,
tmp1.as_slice(),
tmp2.as_slice(),
w,
h,
mask,
);
}
#[cfg(target_arch = "aarch64")]
#[arcane]
fn mask_16bpc_inner(
_token: Arm64,
dst: &mut [u16],
dst_stride: usize,
tmp1: &[i16],
tmp2: &[i16],
w: usize,
h: usize,
mask: &[u8],
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let tmp1 = tmp1.flex();
let tmp2 = tmp2.flex();
let mask = mask.flex();
let intermediate_bits = 4;
for row in 0..h {
let tmp1_row = &tmp1[row * w..][..w];
let tmp2_row = &tmp2[row * w..][..w];
let mask_row = &mask[row * w..][..w];
let dst_row = &mut dst[row * dst_stride..][..w];
let mut col = 0;
while col + 4 <= w {
let t1_16 = safe_simd::vld1_s16(tmp1_row[col..][..4].try_into().unwrap());
let t2_16 = safe_simd::vld1_s16(tmp2_row[col..][..4].try_into().unwrap());
let t1 = vmovl_s16(t1_16);
let t2 = vmovl_s16(t2_16);
let m_bytes: [u8; 8] = [
mask_row[col],
mask_row[col + 1],
mask_row[col + 2],
mask_row[col + 3],
0,
0,
0,
0,
];
let m8 = safe_simd::vld1_u8(&m_bytes);
let m16 = vmovl_u8(m8);
let m32 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(m16)));
let diff = vsubq_s32(t1, t2);
let weighted = vmulq_s32(diff, m32);
let rnd = vdupq_n_s32(32);
let shifted = vshrq_n_s32::<6>(vaddq_s32(weighted, rnd));
let sum = vaddq_s32(shifted, t2);
let rnd2 = vdupq_n_s32(1 << intermediate_bits);
let result = vshrq_n_s32::<5>(vaddq_s32(sum, rnd2));
let zero = vdupq_n_s32(0);
let max = vdupq_n_s32(bitdepth_max);
let clamped = vmaxq_s32(vminq_s32(result, max), zero);
let narrow = vmovn_s32(clamped);
let dst_arr: &mut [u16; 4] = (&mut dst_row[col..col + 4]).try_into().unwrap();
safe_simd::vst1_u16(dst_arr, vreinterpret_u16_s16(narrow));
col += 4;
}
while col < w {
let diff = tmp1_row[col] as i32 - tmp2_row[col] as i32;
let m = mask_row[col] as i32;
let weighted = ((diff * m + 32) >> 6) + tmp2_row[col] as i32;
let rnd = 1 << intermediate_bits;
let result = ((weighted + rnd) >> (intermediate_bits + 1)).clamp(0, bitdepth_max);
dst_row[col] = result as u16;
col += 1;
}
}
}
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn mask_16bpc_neon(
dst_ptr: *mut DynPixel,
dst_stride: isize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
mask_ptr: *const u8,
bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
) {
let w = w as usize;
let h = h as usize;
let dst_stride_u16 = (dst_stride / 2) as usize;
let (token, dst, mask) = unsafe {
let token = unsafe { Arm64::forge_token_dangerously() };
let dst = std::slice::from_raw_parts_mut(dst_ptr as *mut u16, h * dst_stride_u16);
let mask = std::slice::from_raw_parts(mask_ptr, w * h);
(token, dst, mask)
};
mask_16bpc_inner(
token,
dst,
dst_stride_u16,
tmp1.as_slice(),
tmp2.as_slice(),
w,
h,
mask,
bitdepth_max,
);
}
#[cfg(target_arch = "aarch64")]
#[arcane]
fn blend_8bpc_inner(
_token: Arm64,
dst: &mut [u8],
dst_stride: usize,
tmp: &[i16],
w: usize,
h: usize,
mask: &[u8],
) {
let mut dst = dst.flex_mut();
let tmp = tmp.flex();
let mask = mask.flex();
for row in 0..h {
let tmp_row = &tmp[row * w..][..w];
let mask_row = &mask[row * w..][..w];
let dst_row = &mut dst[row * dst_stride..][..w];
let mut col = 0;
while col + 8 <= w {
let d = safe_simd::vld1_u8(dst_row[col..][..8].try_into().unwrap());
let d16 = vreinterpretq_s16_u16(vmovl_u8(d));
let t = safe_simd::vld1q_s16(tmp_row[col..][..8].try_into().unwrap());
let m = safe_simd::vld1_u8(mask_row[col..][..8].try_into().unwrap());
let m16 = vreinterpretq_s16_u16(vmovl_u8(m));
let d_scaled = vshlq_n_s16::<4>(d16);
let diff = vsubq_s16(t, d_scaled);
let diff_lo = vmovl_s16(vget_low_s16(diff));
let diff_hi = vmovl_s16(vget_high_s16(diff));
let m_lo = vmovl_s16(vget_low_s16(m16));
let m_hi = vmovl_s16(vget_high_s16(m16));
let weighted_lo = vmulq_s32(diff_lo, m_lo);
let weighted_hi = vmulq_s32(diff_hi, m_hi);
let rnd = vdupq_n_s32(32);
let shifted_lo = vshrq_n_s32::<6>(vaddq_s32(weighted_lo, rnd));
let shifted_hi = vshrq_n_s32::<6>(vaddq_s32(weighted_hi, rnd));
let shifted_16 = vcombine_s16(vmovn_s32(shifted_lo), vmovn_s32(shifted_hi));
let sum = vaddq_s16(shifted_16, d_scaled);
let result = vshrq_n_s16::<4>(vaddq_s16(sum, vdupq_n_s16(8)));
let packed = vqmovun_s16(result);
let dst_arr: &mut [u8; 8] = (&mut dst_row[col..col + 8]).try_into().unwrap();
safe_simd::vst1_u8(dst_arr, packed);
col += 8;
}
while col < w {
let d = dst_row[col] as i32;
let t = tmp_row[col] as i32;
let m = mask_row[col] as i32;
let d_scaled = d << 4;
let diff = t - d_scaled;
let weighted = (diff * m + 32) >> 6;
let result = ((d_scaled + weighted + 8) >> 4).clamp(0, 255);
dst_row[col] = result as u8;
col += 1;
}
}
}
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn blend_8bpc_neon(
dst_ptr: *mut DynPixel,
dst_stride: isize,
tmp: *const [DynPixel; SCRATCH_INTER_INTRA_BUF_LEN],
w: i32,
h: i32,
mask_ptr: *const u8,
_dst: *const FFISafe<PicOffset>,
) {
let w = w as usize;
let h = h as usize;
let (dst, tmp, mask) = unsafe {
let dst = std::slice::from_raw_parts_mut(dst_ptr as *mut u8, h * dst_stride.unsigned_abs());
let tmp = std::slice::from_raw_parts(tmp as *const u8, w * h);
let mask = std::slice::from_raw_parts(mask_ptr, w * h);
(dst, tmp, mask)
};
for row in 0..h {
let dst_row = &mut dst[row * dst_stride.unsigned_abs()..][..w];
let tmp_row = &tmp[row * w..][..w];
let mask_row = &mask[row * w..][..w];
for col in 0..w {
let d = dst_row[col] as u32;
let t = tmp_row[col] as u32;
let m = mask_row[col] as u32;
dst_row[col] = ((d * (64 - m) + t * m + 32) >> 6) as u8;
}
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
fn blend_16bpc_inner(
_token: Arm64,
dst: &mut [u16],
dst_stride: usize,
tmp: &[i16],
w: usize,
h: usize,
mask: &[u8],
) {
let mut dst = dst.flex_mut();
let tmp = tmp.flex();
let mask = mask.flex();
for row in 0..h {
let tmp_row = &tmp[row * w..][..w];
let mask_row = &mask[row * w..][..w];
let dst_row = &mut dst[row * dst_stride..][..w];
let mut col = 0;
while col + 4 <= w {
let d_u16 = safe_simd::vld1_u16(dst_row[col..][..4].try_into().unwrap());
let d = vreinterpretq_s32_u32(vmovl_u16(d_u16));
let t_16 = safe_simd::vld1_s16(tmp_row[col..][..4].try_into().unwrap());
let t = vmovl_s16(t_16);
let m_bytes: [u8; 8] = [
mask_row[col],
mask_row[col + 1],
mask_row[col + 2],
mask_row[col + 3],
0,
0,
0,
0,
];
let m8 = safe_simd::vld1_u8(&m_bytes);
let m16 = vmovl_u8(m8);
let m = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(m16)));
let diff = vsubq_s32(t, d);
let weighted = vmulq_s32(diff, m);
let rnd = vdupq_n_s32(32);
let shifted = vshrq_n_s32::<6>(vaddq_s32(weighted, rnd));
let result = vaddq_s32(shifted, d);
let zero = vdupq_n_s32(0);
let max = vdupq_n_s32(65535);
let clamped = vmaxq_s32(vminq_s32(result, max), zero);
let narrow = vmovn_u32(vreinterpretq_u32_s32(clamped));
let dst_arr: &mut [u16; 4] = (&mut dst_row[col..col + 4]).try_into().unwrap();
safe_simd::vst1_u16(dst_arr, narrow);
col += 4;
}
while col < w {
let d = dst_row[col] as i32;
let t = tmp_row[col] as i32;
let m = mask_row[col] as i32;
let diff = t - d;
let weighted = (diff * m + 32) >> 6;
let result = (d + weighted).clamp(0, 65535);
dst_row[col] = result as u16;
col += 1;
}
}
}
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn blend_16bpc_neon(
dst_ptr: *mut DynPixel,
dst_stride: isize,
tmp: *const [DynPixel; SCRATCH_INTER_INTRA_BUF_LEN],
w: i32,
h: i32,
mask_ptr: *const u8,
_dst: *const FFISafe<PicOffset>,
) {
let w = w as usize;
let h = h as usize;
let dst_stride_u16 = (dst_stride / 2) as usize;
let (dst, tmp, mask) = unsafe {
let dst = std::slice::from_raw_parts_mut(dst_ptr as *mut u16, h * dst_stride_u16);
let tmp = std::slice::from_raw_parts(tmp as *const u16, w * h);
let mask = std::slice::from_raw_parts(mask_ptr, w * h);
(dst, tmp, mask)
};
for row in 0..h {
let dst_row = &mut dst[row * dst_stride_u16..][..w];
let tmp_row = &tmp[row * w..][..w];
let mask_row = &mask[row * w..][..w];
for col in 0..w {
let d = dst_row[col] as u32;
let t = tmp_row[col] as u32;
let m = mask_row[col] as u32;
dst_row[col] = ((d * (64 - m) + t * m + 32) >> 6) as u16;
}
}
}
#[cfg(test)]
mod tests {
#[test]
fn test_arm_token_available() {
#[cfg(target_arch = "aarch64")]
{
use archmage::{Arm64, SimdToken};
assert!(Arm64::summon().is_some());
}
}
#[test]
#[cfg(target_arch = "aarch64")]
fn test_arm_token_permutations() {
use archmage::testing::{CompileTimePolicy, for_each_token_permutation};
use archmage::{Arm64, SimdToken};
let mut had_enabled = false;
let mut had_disabled = false;
let report = for_each_token_permutation(CompileTimePolicy::WarnStderr, |_perm| {
if Arm64::summon().is_some() {
had_enabled = true;
} else {
had_disabled = true;
}
});
eprintln!("ARM permutations: {}", report.permutations_run);
assert!(report.permutations_run >= 1);
assert!(had_enabled, "token was never enabled");
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
fn blend_v_8bpc_inner(
_token: Arm64,
dst: &mut [u8],
dst_stride: usize,
tmp: &[i16],
w: usize,
h: usize,
obmc_masks: &[u8],
) {
let mut dst = dst.flex_mut();
let tmp = tmp.flex();
let obmc_masks = obmc_masks.flex();
for row in 0..h {
let tmp_row = &tmp[row * w..][..w];
let dst_row = &mut dst[row * dst_stride..][..w];
let mask = obmc_masks[row];
let mut col = 0;
while col + 8 <= w {
let d = safe_simd::vld1_u8(dst_row[col..][..8].try_into().unwrap());
let d16 = vreinterpretq_s16_u16(vmovl_u8(d));
let t = safe_simd::vld1q_s16(tmp_row[col..][..8].try_into().unwrap());
let m16 = vdupq_n_s16(mask as i16);
let d_scaled = vshlq_n_s16::<4>(d16);
let diff = vsubq_s16(t, d_scaled);
let diff_lo = vmovl_s16(vget_low_s16(diff));
let diff_hi = vmovl_s16(vget_high_s16(diff));
let m_lo = vmovl_s16(vget_low_s16(m16));
let m_hi = vmovl_s16(vget_high_s16(m16));
let weighted_lo = vmulq_s32(diff_lo, m_lo);
let weighted_hi = vmulq_s32(diff_hi, m_hi);
let rnd = vdupq_n_s32(32);
let shifted_lo = vshrq_n_s32::<6>(vaddq_s32(weighted_lo, rnd));
let shifted_hi = vshrq_n_s32::<6>(vaddq_s32(weighted_hi, rnd));
let shifted_16 = vcombine_s16(vmovn_s32(shifted_lo), vmovn_s32(shifted_hi));
let sum = vaddq_s16(shifted_16, d_scaled);
let result = vshrq_n_s16::<4>(vaddq_s16(sum, vdupq_n_s16(8)));
let packed = vqmovun_s16(result);
let dst_arr: &mut [u8; 8] = (&mut dst_row[col..col + 8]).try_into().unwrap();
safe_simd::vst1_u8(dst_arr, packed);
col += 8;
}
while col < w {
let d = dst_row[col] as i32;
let t = tmp_row[col] as i32;
let m = mask as i32;
let d_scaled = d << 4;
let diff = t - d_scaled;
let weighted = (diff * m + 32) >> 6;
let result = ((d_scaled + weighted + 8) >> 4).clamp(0, 255);
dst_row[col] = result as u8;
col += 1;
}
}
}
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn blend_v_8bpc_neon(
dst_ptr: *mut DynPixel,
dst_stride: isize,
tmp: *const [DynPixel; SCRATCH_LAP_LEN],
w: i32,
h: i32,
_dst: *const FFISafe<PicOffset>,
) {
use crate::src::tables::dav1d_obmc_masks;
let w = w as usize;
let h = h as usize;
let (dst, tmp) = unsafe {
let dst = std::slice::from_raw_parts_mut(dst_ptr as *mut u8, h * dst_stride.unsigned_abs());
let tmp = std::slice::from_raw_parts(tmp as *const u8, w * h);
(dst, tmp)
};
let mask = &dav1d_obmc_masks[w..];
let dst_w = w * 3 >> 2;
for row in 0..h {
let dst_row = &mut dst[row * dst_stride.unsigned_abs()..][..dst_w];
let tmp_row = &tmp[row * w..][..dst_w];
for col in 0..dst_w {
let d = dst_row[col] as u32;
let t = tmp_row[col] as u32;
let m = mask[col] as u32;
dst_row[col] = ((d * (64 - m) + t * m + 32) >> 6) as u8;
}
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
fn blend_v_16bpc_inner(
_token: Arm64,
dst: &mut [u16],
dst_stride: usize,
tmp: &[i16],
w: usize,
h: usize,
obmc_masks: &[u8],
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let tmp = tmp.flex();
let obmc_masks = obmc_masks.flex();
for row in 0..h {
let tmp_row = &tmp[row * w..][..w];
let dst_row = &mut dst[row * dst_stride..][..w];
let mask = obmc_masks[row];
let mut col = 0;
while col + 4 <= w {
let d_u16 = safe_simd::vld1_u16(dst_row[col..][..4].try_into().unwrap());
let d = vreinterpretq_s32_u32(vmovl_u16(d_u16));
let t_16 = safe_simd::vld1_s16(tmp_row[col..][..4].try_into().unwrap());
let t = vmovl_s16(t_16);
let m = vdupq_n_s32(mask as i32);
let diff = vsubq_s32(t, d);
let weighted = vmulq_s32(diff, m);
let rnd = vdupq_n_s32(32);
let shifted = vshrq_n_s32::<6>(vaddq_s32(weighted, rnd));
let result = vaddq_s32(shifted, d);
let zero = vdupq_n_s32(0);
let max = vdupq_n_s32(bitdepth_max);
let clamped = vmaxq_s32(vminq_s32(result, max), zero);
let narrow = vmovn_u32(vreinterpretq_u32_s32(clamped));
let dst_arr: &mut [u16; 4] = (&mut dst_row[col..col + 4]).try_into().unwrap();
safe_simd::vst1_u16(dst_arr, narrow);
col += 4;
}
while col < w {
let d = dst_row[col] as i32;
let t = tmp_row[col] as i32;
let m = mask as i32;
let diff = t - d;
let weighted = (diff * m + 32) >> 6;
let result = (d + weighted).clamp(0, bitdepth_max);
dst_row[col] = result as u16;
col += 1;
}
}
}
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn blend_v_16bpc_neon(
dst_ptr: *mut DynPixel,
dst_stride: isize,
tmp: *const [DynPixel; SCRATCH_LAP_LEN],
w: i32,
h: i32,
_dst: *const FFISafe<PicOffset>,
) {
use crate::src::tables::dav1d_obmc_masks;
let w = w as usize;
let h = h as usize;
let dst_stride_u16 = (dst_stride / 2) as usize;
let (dst, tmp) = unsafe {
let dst = std::slice::from_raw_parts_mut(dst_ptr as *mut u16, h * dst_stride_u16);
let tmp = std::slice::from_raw_parts(tmp as *const u16, w * h);
(dst, tmp)
};
let mask = &dav1d_obmc_masks[w..];
let dst_w = w * 3 >> 2;
for row in 0..h {
let dst_row = &mut dst[row * dst_stride_u16..][..dst_w];
let tmp_row = &tmp[row * w..][..dst_w];
for col in 0..dst_w {
let d = dst_row[col] as u32;
let t = tmp_row[col] as u32;
let m = mask[col] as u32;
dst_row[col] = ((d * (64 - m) + t * m + 32) >> 6) as u16;
}
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
fn blend_h_8bpc_inner(
_token: Arm64,
dst: &mut [u8],
dst_stride: usize,
tmp: &[i16],
w: usize,
h: usize,
obmc_masks: &[u8],
) {
let mut dst = dst.flex_mut();
let tmp = tmp.flex();
let obmc_masks = obmc_masks.flex();
for row in 0..h {
let tmp_row = &tmp[row * w..][..w];
let dst_row = &mut dst[row * dst_stride..][..w];
let mut col = 0;
while col + 8 <= w {
let d = safe_simd::vld1_u8(dst_row[col..][..8].try_into().unwrap());
let d16 = vreinterpretq_s16_u16(vmovl_u8(d));
let t = safe_simd::vld1q_s16(tmp_row[col..][..8].try_into().unwrap());
let m = safe_simd::vld1_u8(obmc_masks[col..][..8].try_into().unwrap());
let m16 = vreinterpretq_s16_u16(vmovl_u8(m));
let d_scaled = vshlq_n_s16::<4>(d16);
let diff = vsubq_s16(t, d_scaled);
let diff_lo = vmovl_s16(vget_low_s16(diff));
let diff_hi = vmovl_s16(vget_high_s16(diff));
let m_lo = vmovl_s16(vget_low_s16(m16));
let m_hi = vmovl_s16(vget_high_s16(m16));
let weighted_lo = vmulq_s32(diff_lo, m_lo);
let weighted_hi = vmulq_s32(diff_hi, m_hi);
let rnd = vdupq_n_s32(32);
let shifted_lo = vshrq_n_s32::<6>(vaddq_s32(weighted_lo, rnd));
let shifted_hi = vshrq_n_s32::<6>(vaddq_s32(weighted_hi, rnd));
let shifted_16 = vcombine_s16(vmovn_s32(shifted_lo), vmovn_s32(shifted_hi));
let sum = vaddq_s16(shifted_16, d_scaled);
let result = vshrq_n_s16::<4>(vaddq_s16(sum, vdupq_n_s16(8)));
let packed = vqmovun_s16(result);
let dst_arr: &mut [u8; 8] = (&mut dst_row[col..col + 8]).try_into().unwrap();
safe_simd::vst1_u8(dst_arr, packed);
col += 8;
}
while col < w {
let d = dst_row[col] as i32;
let t = tmp_row[col] as i32;
let m = obmc_masks[col] as i32;
let d_scaled = d << 4;
let diff = t - d_scaled;
let weighted = (diff * m + 32) >> 6;
let result = ((d_scaled + weighted + 8) >> 4).clamp(0, 255);
dst_row[col] = result as u8;
col += 1;
}
}
}
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn blend_h_8bpc_neon(
dst_ptr: *mut DynPixel,
dst_stride: isize,
tmp: *const [DynPixel; SCRATCH_LAP_LEN],
w: i32,
h: i32,
_dst: *const FFISafe<PicOffset>,
) {
use crate::src::tables::dav1d_obmc_masks;
let w = w as usize;
let h = h as usize;
let mask = &dav1d_obmc_masks[h..];
let h_effective = h * 3 >> 2;
let (dst, tmp) = unsafe {
let dst = std::slice::from_raw_parts_mut(
dst_ptr as *mut u8,
h_effective * dst_stride.unsigned_abs(),
);
let tmp = std::slice::from_raw_parts(tmp as *const u8, w * h_effective);
(dst, tmp)
};
for row in 0..h_effective {
let dst_row = &mut dst[row * dst_stride.unsigned_abs()..][..w];
let tmp_row = &tmp[row * w..][..w];
let m = mask[row] as u32;
for col in 0..w {
let d = dst_row[col] as u32;
let t = tmp_row[col] as u32;
dst_row[col] = ((d * (64 - m) + t * m + 32) >> 6) as u8;
}
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
fn blend_h_16bpc_inner(
_token: Arm64,
dst: &mut [u16],
dst_stride: usize,
tmp: &[i16],
w: usize,
h: usize,
obmc_masks: &[u8],
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let tmp = tmp.flex();
let obmc_masks = obmc_masks.flex();
for row in 0..h {
let tmp_row = &tmp[row * w..][..w];
let dst_row = &mut dst[row * dst_stride..][..w];
let mut col = 0;
while col + 4 <= w {
let d_u16 = safe_simd::vld1_u16(dst_row[col..][..4].try_into().unwrap());
let d = vreinterpretq_s32_u32(vmovl_u16(d_u16));
let t_16 = safe_simd::vld1_s16(tmp_row[col..][..4].try_into().unwrap());
let t = vmovl_s16(t_16);
let m_bytes: [u8; 8] = [
obmc_masks[col],
obmc_masks[col + 1],
obmc_masks[col + 2],
obmc_masks[col + 3],
0,
0,
0,
0,
];
let m8 = safe_simd::vld1_u8(&m_bytes);
let m16 = vmovl_u8(m8);
let m = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(m16)));
let diff = vsubq_s32(t, d);
let weighted = vmulq_s32(diff, m);
let rnd = vdupq_n_s32(32);
let shifted = vshrq_n_s32::<6>(vaddq_s32(weighted, rnd));
let result = vaddq_s32(shifted, d);
let zero = vdupq_n_s32(0);
let max = vdupq_n_s32(bitdepth_max);
let clamped = vmaxq_s32(vminq_s32(result, max), zero);
let narrow = vmovn_u32(vreinterpretq_u32_s32(clamped));
let dst_arr: &mut [u16; 4] = (&mut dst_row[col..col + 4]).try_into().unwrap();
safe_simd::vst1_u16(dst_arr, narrow);
col += 4;
}
while col < w {
let d = dst_row[col] as i32;
let t = tmp_row[col] as i32;
let m = obmc_masks[col] as i32;
let diff = t - d;
let weighted = (diff * m + 32) >> 6;
let result = (d + weighted).clamp(0, bitdepth_max);
dst_row[col] = result as u16;
col += 1;
}
}
}
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn blend_h_16bpc_neon(
dst_ptr: *mut DynPixel,
dst_stride: isize,
tmp: *const [DynPixel; SCRATCH_LAP_LEN],
w: i32,
h: i32,
_dst: *const FFISafe<PicOffset>,
) {
use crate::src::tables::dav1d_obmc_masks;
let w = w as usize;
let h = h as usize;
let mask = &dav1d_obmc_masks[h..];
let h_effective = h * 3 >> 2;
let dst_stride_u16 = (dst_stride / 2) as usize;
let (dst, tmp) = unsafe {
let dst = std::slice::from_raw_parts_mut(dst_ptr as *mut u16, h_effective * dst_stride_u16);
let tmp = std::slice::from_raw_parts(tmp as *const u16, w * h_effective);
(dst, tmp)
};
for row in 0..h_effective {
let dst_row = &mut dst[row * dst_stride_u16..][..w];
let tmp_row = &tmp[row * w..][..w];
let m = mask[row] as u32;
for col in 0..w {
let d = dst_row[col] as u32;
let t = tmp_row[col] as u32;
dst_row[col] = ((d * (64 - m) + t * m + 32) >> 6) as u16;
}
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
fn w_mask_8bpc_inner(
_token: Arm64,
dst: &mut [u8],
dst_stride: usize,
tmp1: &[i16],
tmp2: &[i16],
w: usize,
h: usize,
mask: &mut [u8],
sign: u8,
ss_hor: bool,
ss_ver: bool,
) {
let mut dst = dst.flex_mut();
let tmp1 = tmp1.flex();
let tmp2 = tmp2.flex();
let mut mask = mask.flex_mut();
let intermediate_bits = 4i32;
let sh = intermediate_bits + 6;
let rnd = (32 << intermediate_bits) + 8192 * 64; let mask_sh = 8 + intermediate_bits - 4; let mask_rnd = 1i32 << (mask_sh - 5);
let mask_w = if ss_hor { w >> 1 } else { w };
for y in 0..h {
let tmp1_row = &tmp1[y * w..][..w];
let tmp2_row = &tmp2[y * w..][..w];
let dst_row = &mut dst[y * dst_stride..][..w];
let mut mask_row = if ss_ver && (y & 1) != 0 {
None
} else {
let mask_y = if ss_ver { y >> 1 } else { y };
Some(&mut mask[mask_y * mask_w..][..mask_w])
};
let mut col = 0;
while col + 8 <= w {
let t1 = safe_simd::vld1q_s16(tmp1_row[col..][..8].try_into().unwrap());
let t2 = safe_simd::vld1q_s16(tmp2_row[col..][..8].try_into().unwrap());
let diff = vsubq_s16(t1, t2);
let abs_diff = vabsq_s16(diff);
let abs_32_lo = vmovl_s16(vget_low_s16(abs_diff));
let abs_32_hi = vmovl_s16(vget_high_s16(abs_diff));
let mask_rnd_vec = vdupq_n_s32(mask_rnd);
let m_lo = vaddq_s32(abs_32_lo, mask_rnd_vec);
let m_hi = vaddq_s32(abs_32_hi, mask_rnd_vec);
let m_shifted_lo = vshrq_n_s32::<8>(m_lo);
let m_shifted_hi = vshrq_n_s32::<8>(m_hi);
let m_lo = vaddq_s32(m_shifted_lo, vdupq_n_s32(38));
let m_hi = vaddq_s32(m_shifted_hi, vdupq_n_s32(38));
let m_lo = vminq_s32(m_lo, vdupq_n_s32(64));
let m_hi = vminq_s32(m_hi, vdupq_n_s32(64));
let m_16 = vcombine_s16(vmovn_s32(m_lo), vmovn_s32(m_hi));
let m_final = if sign != 0 {
vsubq_s16(vdupq_n_s16(64), m_16)
} else {
m_16
};
let inv_m = vsubq_s16(vdupq_n_s16(64), m_final);
let t1_lo = vmovl_s16(vget_low_s16(t1));
let t1_hi = vmovl_s16(vget_high_s16(t1));
let t2_lo = vmovl_s16(vget_low_s16(t2));
let t2_hi = vmovl_s16(vget_high_s16(t2));
let m_lo_32 = vmovl_s16(vget_low_s16(m_final));
let m_hi_32 = vmovl_s16(vget_high_s16(m_final));
let inv_m_lo_32 = vmovl_s16(vget_low_s16(inv_m));
let inv_m_hi_32 = vmovl_s16(vget_high_s16(inv_m));
let rnd_vec = vdupq_n_s32(rnd);
let blend_lo = vaddq_s32(
vaddq_s32(vmulq_s32(t1_lo, m_lo_32), vmulq_s32(t2_lo, inv_m_lo_32)),
rnd_vec,
);
let blend_hi = vaddq_s32(
vaddq_s32(vmulq_s32(t1_hi, m_hi_32), vmulq_s32(t2_hi, inv_m_hi_32)),
rnd_vec,
);
let result_lo = vshrq_n_s32::<10>(blend_lo);
let result_hi = vshrq_n_s32::<10>(blend_hi);
let zero = vdupq_n_s32(0);
let max_val = vdupq_n_s32(255);
let result_lo = vmaxq_s32(vminq_s32(result_lo, max_val), zero);
let result_hi = vmaxq_s32(vminq_s32(result_hi, max_val), zero);
let narrow_lo = vmovn_s32(result_lo);
let narrow_hi = vmovn_s32(result_hi);
let narrow_16 = vcombine_s16(narrow_lo, narrow_hi);
let result_u8 = vqmovun_s16(narrow_16);
let dst_arr: &mut [u8; 8] = (&mut dst_row[col..col + 8]).try_into().unwrap();
safe_simd::vst1_u8(dst_arr, result_u8);
if let Some(ref mut mask_row) = mask_row {
if !ss_hor {
let m_narrow = vqmovun_s16(m_16);
let mask_arr: &mut [u8; 8] = (&mut mask_row[col..col + 8]).try_into().unwrap();
safe_simd::vst1_u8(mask_arr, m_narrow);
} else {
let mask_idx = col >> 1;
let m0 = vgetq_lane_s16::<0>(m_16) as i32;
let m1 = vgetq_lane_s16::<1>(m_16) as i32;
mask_row[mask_idx] = ((m0 + m1 + 1) >> 1) as u8;
let m2 = vgetq_lane_s16::<2>(m_16) as i32;
let m3 = vgetq_lane_s16::<3>(m_16) as i32;
mask_row[mask_idx + 1] = ((m2 + m3 + 1) >> 1) as u8;
let m4 = vgetq_lane_s16::<4>(m_16) as i32;
let m5 = vgetq_lane_s16::<5>(m_16) as i32;
mask_row[mask_idx + 2] = ((m4 + m5 + 1) >> 1) as u8;
let m6 = vgetq_lane_s16::<6>(m_16) as i32;
let m7 = vgetq_lane_s16::<7>(m_16) as i32;
mask_row[mask_idx + 3] = ((m6 + m7 + 1) >> 1) as u8;
}
}
col += 8;
}
while col < w {
let t1 = tmp1_row[col] as i32;
let t2 = tmp2_row[col] as i32;
let diff = t1 - t2;
let abs_diff = diff.abs();
let mut m = 38 + ((abs_diff + mask_rnd) >> mask_sh);
m = m.min(64);
let m_final = if sign != 0 { 64 - m } else { m };
let inv_m = 64 - m_final;
let blend = (t1 * m_final + t2 * inv_m + rnd) >> sh;
dst_row[col] = blend.clamp(0, 255) as u8;
if let Some(ref mut mask_row) = mask_row {
if !ss_hor {
mask_row[col] = m as u8;
} else if (col & 1) == 0 {
let mask_idx = col >> 1;
if col + 1 < w {
let t1_next = tmp1_row[col + 1] as i32;
let t2_next = tmp2_row[col + 1] as i32;
let diff_next = (t1_next - t2_next).abs();
let m_next = (38 + ((diff_next + mask_rnd) >> mask_sh)).min(64);
mask_row[mask_idx] = ((m + m_next + 1) >> 1) as u8;
} else {
mask_row[mask_idx] = m as u8;
}
}
}
col += 1;
}
}
}
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn w_mask_444_8bpc_neon(
dst_ptr: *mut DynPixel,
dst_stride: isize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
mask: &mut [u8; SEG_MASK_LEN],
sign: i32,
_bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
) {
let w = w as usize;
let h = h as usize;
let dst = unsafe {
std::slice::from_raw_parts_mut(dst_ptr as *mut u8, h * dst_stride.unsigned_abs())
};
let token = unsafe { Arm64::forge_token_dangerously() };
w_mask_8bpc_inner(
token,
dst,
dst_stride as usize,
tmp1.as_slice(),
tmp2.as_slice(),
w,
h,
mask.as_mut_slice(),
sign as u8,
false,
false,
);
}
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn w_mask_422_8bpc_neon(
dst_ptr: *mut DynPixel,
dst_stride: isize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
mask: &mut [u8; SEG_MASK_LEN],
sign: i32,
_bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
) {
let w = w as usize;
let h = h as usize;
let dst = unsafe {
std::slice::from_raw_parts_mut(dst_ptr as *mut u8, h * dst_stride.unsigned_abs())
};
let token = unsafe { Arm64::forge_token_dangerously() };
w_mask_8bpc_inner(
token,
dst,
dst_stride as usize,
tmp1.as_slice(),
tmp2.as_slice(),
w,
h,
mask.as_mut_slice(),
sign as u8,
true,
false,
);
}
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn w_mask_420_8bpc_neon(
dst_ptr: *mut DynPixel,
dst_stride: isize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
mask: &mut [u8; SEG_MASK_LEN],
sign: i32,
_bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
) {
let w = w as usize;
let h = h as usize;
let dst = unsafe {
std::slice::from_raw_parts_mut(dst_ptr as *mut u8, h * dst_stride.unsigned_abs())
};
let token = unsafe { Arm64::forge_token_dangerously() };
w_mask_8bpc_inner(
token,
dst,
dst_stride as usize,
tmp1.as_slice(),
tmp2.as_slice(),
w,
h,
mask.as_mut_slice(),
sign as u8,
true,
true,
);
}
#[cfg(target_arch = "aarch64")]
#[arcane]
fn put_bilin_8bpc_inner(
_token: Arm64,
dst: &mut [u8],
dst_stride: usize,
src: &[u8],
src_stride: usize,
w: usize,
h: usize,
mx: i32,
my: i32,
) {
let mut dst = dst.flex_mut();
let src = src.flex();
match (mx, my) {
(0, 0) => {
for y in 0..h {
let src_row = &src[y * src_stride..][..w];
let dst_row = &mut dst[y * dst_stride..][..w];
dst_row.copy_from_slice(src_row);
}
}
(0, _) => {
let my = my as i16;
let coeff0 = 16 - my;
let coeff1 = my;
for y in 0..h {
let src_row0 = &src[y * src_stride..][..w];
let src_row1 = &src[(y + 1) * src_stride..][..w];
let dst_row = &mut dst[y * dst_stride..][..w];
let mut x = 0;
while x + 8 <= w {
let r0 = safe_simd::vld1_u8(src_row0[x..][..8].try_into().unwrap());
let r1 = safe_simd::vld1_u8(src_row1[x..][..8].try_into().unwrap());
let r0_16 = vreinterpretq_s16_u16(vmovl_u8(r0));
let r1_16 = vreinterpretq_s16_u16(vmovl_u8(r1));
let c0 = vdupq_n_s16(coeff0);
let c1 = vdupq_n_s16(coeff1);
let mul0 = vmulq_s16(r0_16, c0);
let mul1 = vmulq_s16(r1_16, c1);
let sum = vaddq_s16(mul0, mul1);
let rnd = vdupq_n_s16(8);
let result = vshrq_n_s16::<4>(vaddq_s16(sum, rnd));
let packed = vqmovun_s16(result);
let dst_arr: &mut [u8; 8] = (&mut dst_row[x..x + 8]).try_into().unwrap();
safe_simd::vst1_u8(dst_arr, packed);
x += 8;
}
while x < w {
let r0 = src_row0[x] as i32;
let r1 = src_row1[x] as i32;
let pixel = coeff0 as i32 * r0 + coeff1 as i32 * r1;
dst_row[x] = ((pixel + 8) >> 4).clamp(0, 255) as u8;
x += 1;
}
}
}
(_, 0) => {
let mx = mx as i16;
let coeff0 = 16 - mx;
let coeff1 = mx;
for y in 0..h {
let src_row = &src[y * src_stride..][..w + 1];
let dst_row = &mut dst[y * dst_stride..][..w];
let mut x = 0;
while x + 8 <= w {
let s0 = safe_simd::vld1_u8(src_row[x..][..8].try_into().unwrap());
let s1 = safe_simd::vld1_u8(src_row[x + 1..][..8].try_into().unwrap());
let s0_16 = vreinterpretq_s16_u16(vmovl_u8(s0));
let s1_16 = vreinterpretq_s16_u16(vmovl_u8(s1));
let c0 = vdupq_n_s16(coeff0);
let c1 = vdupq_n_s16(coeff1);
let mul0 = vmulq_s16(s0_16, c0);
let mul1 = vmulq_s16(s1_16, c1);
let sum = vaddq_s16(mul0, mul1);
let rnd = vdupq_n_s16(8);
let result = vshrq_n_s16::<4>(vaddq_s16(sum, rnd));
let packed = vqmovun_s16(result);
let dst_arr: &mut [u8; 8] = (&mut dst_row[x..x + 8]).try_into().unwrap();
safe_simd::vst1_u8(dst_arr, packed);
x += 8;
}
while x < w {
let s0 = src_row[x] as i32;
let s1 = src_row[x + 1] as i32;
let pixel = coeff0 as i32 * s0 + coeff1 as i32 * s1;
dst_row[x] = ((pixel + 8) >> 4).clamp(0, 255) as u8;
x += 1;
}
}
}
(_, _) => {
let mx = mx as i16;
let my = my as i16;
let h_coeff0 = 16 - mx;
let h_coeff1 = mx;
let v_coeff0 = 16 - my;
let v_coeff1 = my;
let mid_stride = w + 16;
let mut mid = vec![0i16; mid_stride * (h + 1)];
for y in 0..h + 1 {
let src_row = &src[y * src_stride..];
let mid_row = &mut mid[y * mid_stride..][..w];
for x in 0..w {
let s0 = src_row[x] as i32;
let s1 = src_row[x + 1] as i32;
let pixel = h_coeff0 as i32 * s0 + h_coeff1 as i32 * s1;
mid_row[x] = pixel as i16;
}
}
for y in 0..h {
let mid_row0 = &mid[y * mid_stride..][..w];
let mid_row1 = &mid[(y + 1) * mid_stride..][..w];
let dst_row = &mut dst[y * dst_stride..][..w];
let mut x = 0;
while x + 8 <= w {
let r0 = safe_simd::vld1q_s16(mid_row0[x..][..8].try_into().unwrap());
let r1 = safe_simd::vld1q_s16(mid_row1[x..][..8].try_into().unwrap());
let r0_lo = vmovl_s16(vget_low_s16(r0));
let r0_hi = vmovl_s16(vget_high_s16(r0));
let r1_lo = vmovl_s16(vget_low_s16(r1));
let r1_hi = vmovl_s16(vget_high_s16(r1));
let c0 = vdupq_n_s32(v_coeff0 as i32);
let c1 = vdupq_n_s32(v_coeff1 as i32);
let sum_lo = vaddq_s32(vmulq_s32(r0_lo, c0), vmulq_s32(r1_lo, c1));
let sum_hi = vaddq_s32(vmulq_s32(r0_hi, c0), vmulq_s32(r1_hi, c1));
let rnd = vdupq_n_s32(128);
let result_lo = vshrq_n_s32::<8>(vaddq_s32(sum_lo, rnd));
let result_hi = vshrq_n_s32::<8>(vaddq_s32(sum_hi, rnd));
let zero = vdupq_n_s32(0);
let max_val = vdupq_n_s32(255);
let result_lo = vmaxq_s32(vminq_s32(result_lo, max_val), zero);
let result_hi = vmaxq_s32(vminq_s32(result_hi, max_val), zero);
let narrow_lo = vmovn_s32(result_lo);
let narrow_hi = vmovn_s32(result_hi);
let narrow_16 = vcombine_s16(narrow_lo, narrow_hi);
let result_u8 = vqmovun_s16(narrow_16);
let dst_arr: &mut [u8; 8] = (&mut dst_row[x..x + 8]).try_into().unwrap();
safe_simd::vst1_u8(dst_arr, result_u8);
x += 8;
}
while x < w {
let r0 = mid_row0[x] as i32;
let r1 = mid_row1[x] as i32;
let pixel = v_coeff0 as i32 * r0 + v_coeff1 as i32 * r1;
dst_row[x] = ((pixel + 128) >> 8).clamp(0, 255) as u8;
x += 1;
}
}
}
}
}
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn put_bilin_8bpc_neon(
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
_bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
_src: *const FFISafe<PicOffset>,
) {
let w = w as usize;
let h = h as usize;
let (token, src, dst) = unsafe {
let token = unsafe { Arm64::forge_token_dangerously() };
let src = std::slice::from_raw_parts(
src_ptr as *const u8,
(h + 1) * src_stride.unsigned_abs() + w + 1,
);
let dst = std::slice::from_raw_parts_mut(dst_ptr as *mut u8, h * dst_stride.unsigned_abs());
(token, src, dst)
};
put_bilin_8bpc_inner(
token,
dst,
dst_stride as usize,
src,
src_stride as usize,
w,
h,
mx,
my,
);
}
#[cfg(target_arch = "aarch64")]
#[arcane]
fn prep_bilin_8bpc_inner(
_token: Arm64,
tmp: &mut [i16],
src: &[u8],
src_stride: usize,
w: usize,
h: usize,
mx: i32,
my: i32,
) {
let mut tmp = tmp.flex_mut();
let src = src.flex();
const PREP_BIAS: i16 = 8192;
match (mx, my) {
(0, 0) => {
for y in 0..h {
let src_row = &src[y * src_stride..][..w];
let tmp_row = &mut tmp[y * w..][..w];
let mut x = 0;
while x + 8 <= w {
let s = safe_simd::vld1_u8(src_row[x..][..8].try_into().unwrap());
let s16 = vreinterpretq_s16_u16(vmovl_u8(s));
let scaled = vshlq_n_s16::<4>(s16);
let biased = vsubq_s16(scaled, vdupq_n_s16(PREP_BIAS));
let tmp_arr: &mut [i16; 8] = (&mut tmp_row[x..x + 8]).try_into().unwrap();
safe_simd::vst1q_s16(tmp_arr, biased);
x += 8;
}
while x < w {
let pixel = src_row[x] as i16;
tmp_row[x] = (pixel << 4) - PREP_BIAS;
x += 1;
}
}
}
(0, _) => {
let my = my as i16;
let coeff0 = 16 - my;
let coeff1 = my;
for y in 0..h {
let src_row0 = &src[y * src_stride..][..w];
let src_row1 = &src[(y + 1) * src_stride..][..w];
let tmp_row = &mut tmp[y * w..][..w];
let mut x = 0;
while x + 8 <= w {
let r0 = safe_simd::vld1_u8(src_row0[x..][..8].try_into().unwrap());
let r1 = safe_simd::vld1_u8(src_row1[x..][..8].try_into().unwrap());
let r0_16 = vreinterpretq_s16_u16(vmovl_u8(r0));
let r1_16 = vreinterpretq_s16_u16(vmovl_u8(r1));
let c0 = vdupq_n_s16(coeff0);
let c1 = vdupq_n_s16(coeff1);
let mul0 = vmulq_s16(r0_16, c0);
let mul1 = vmulq_s16(r1_16, c1);
let sum = vaddq_s16(mul0, mul1);
let biased = vsubq_s16(sum, vdupq_n_s16(PREP_BIAS));
let tmp_arr: &mut [i16; 8] = (&mut tmp_row[x..x + 8]).try_into().unwrap();
safe_simd::vst1q_s16(tmp_arr, biased);
x += 8;
}
while x < w {
let r0 = src_row0[x] as i32;
let r1 = src_row1[x] as i32;
let pixel = coeff0 as i32 * r0 + coeff1 as i32 * r1;
tmp_row[x] = (pixel - PREP_BIAS as i32) as i16;
x += 1;
}
}
}
(_, 0) => {
let mx = mx as i16;
let coeff0 = 16 - mx;
let coeff1 = mx;
for y in 0..h {
let src_row = &src[y * src_stride..][..w + 1];
let tmp_row = &mut tmp[y * w..][..w];
let mut x = 0;
while x + 8 <= w {
let s0 = safe_simd::vld1_u8(src_row[x..][..8].try_into().unwrap());
let s1 = safe_simd::vld1_u8(src_row[x + 1..][..8].try_into().unwrap());
let s0_16 = vreinterpretq_s16_u16(vmovl_u8(s0));
let s1_16 = vreinterpretq_s16_u16(vmovl_u8(s1));
let c0 = vdupq_n_s16(coeff0);
let c1 = vdupq_n_s16(coeff1);
let mul0 = vmulq_s16(s0_16, c0);
let mul1 = vmulq_s16(s1_16, c1);
let sum = vaddq_s16(mul0, mul1);
let biased = vsubq_s16(sum, vdupq_n_s16(PREP_BIAS));
let tmp_arr: &mut [i16; 8] = (&mut tmp_row[x..x + 8]).try_into().unwrap();
safe_simd::vst1q_s16(tmp_arr, biased);
x += 8;
}
while x < w {
let s0 = src_row[x] as i32;
let s1 = src_row[x + 1] as i32;
let pixel = coeff0 as i32 * s0 + coeff1 as i32 * s1;
tmp_row[x] = (pixel - PREP_BIAS as i32) as i16;
x += 1;
}
}
}
(_, _) => {
let mx = mx as i16;
let my = my as i16;
let h_coeff0 = 16 - mx;
let h_coeff1 = mx;
let v_coeff0 = 16 - my;
let v_coeff1 = my;
let mid_stride = w + 16;
let mut mid = vec![0i16; mid_stride * (h + 1)];
for y in 0..h + 1 {
let src_row = &src[y * src_stride..];
let mid_row = &mut mid[y * mid_stride..][..w];
for x in 0..w {
let s0 = src_row[x] as i32;
let s1 = src_row[x + 1] as i32;
let pixel = h_coeff0 as i32 * s0 + h_coeff1 as i32 * s1;
mid_row[x] = pixel as i16;
}
}
for y in 0..h {
let mid_row0 = &mid[y * mid_stride..][..w];
let mid_row1 = &mid[(y + 1) * mid_stride..][..w];
let tmp_row = &mut tmp[y * w..][..w];
let mut x = 0;
while x + 8 <= w {
let r0 = safe_simd::vld1q_s16(mid_row0[x..][..8].try_into().unwrap());
let r1 = safe_simd::vld1q_s16(mid_row1[x..][..8].try_into().unwrap());
let r0_lo = vmovl_s16(vget_low_s16(r0));
let r0_hi = vmovl_s16(vget_high_s16(r0));
let r1_lo = vmovl_s16(vget_low_s16(r1));
let r1_hi = vmovl_s16(vget_high_s16(r1));
let c0 = vdupq_n_s32(v_coeff0 as i32);
let c1 = vdupq_n_s32(v_coeff1 as i32);
let sum_lo = vaddq_s32(vmulq_s32(r0_lo, c0), vmulq_s32(r1_lo, c1));
let sum_hi = vaddq_s32(vmulq_s32(r0_hi, c0), vmulq_s32(r1_hi, c1));
let result_lo = vshrq_n_s32::<4>(sum_lo);
let result_hi = vshrq_n_s32::<4>(sum_hi);
let narrow_lo = vmovn_s32(result_lo);
let narrow_hi = vmovn_s32(result_hi);
let narrow_16 = vcombine_s16(narrow_lo, narrow_hi);
let biased = vsubq_s16(narrow_16, vdupq_n_s16(PREP_BIAS));
let tmp_arr: &mut [i16; 8] = (&mut tmp_row[x..x + 8]).try_into().unwrap();
safe_simd::vst1q_s16(tmp_arr, biased);
x += 8;
}
while x < w {
let r0 = mid_row0[x] as i32;
let r1 = mid_row1[x] as i32;
let pixel = v_coeff0 as i32 * r0 + v_coeff1 as i32 * r1;
tmp_row[x] = ((pixel >> 4) - PREP_BIAS as i32) as i16;
x += 1;
}
}
}
}
}
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn prep_bilin_8bpc_neon(
tmp: *mut i16,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
_bitdepth_max: i32,
_src: *const FFISafe<PicOffset>,
) {
let w = w as usize;
let h = h as usize;
let (token, src, tmp_slice) = unsafe {
let token = unsafe { Arm64::forge_token_dangerously() };
let src = std::slice::from_raw_parts(
src_ptr as *const u8,
(h + 1) * src_stride.unsigned_abs() + w + 1,
);
let tmp_slice = std::slice::from_raw_parts_mut(tmp, h * w);
(token, src, tmp_slice)
};
prep_bilin_8bpc_inner(token, tmp_slice, src, src_stride as usize, w, h, mx, my);
}
#[cfg(target_arch = "aarch64")]
#[arcane]
fn put_bilin_16bpc_inner(
_token: Arm64,
dst: &mut [u16],
dst_stride: usize,
src: &[u16],
src_stride: usize,
w: usize,
h: usize,
mx: i32,
my: i32,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let src = src.flex();
match (mx, my) {
(0, 0) => {
for y in 0..h {
let src_row = &src[y * src_stride..][..w];
let dst_row = &mut dst[y * dst_stride..][..w];
dst_row.copy_from_slice(src_row);
}
}
(0, _) => {
let coeff0 = 16 - my;
let coeff1 = my;
for y in 0..h {
let src_row0 = &src[y * src_stride..][..w];
let src_row1 = &src[(y + 1) * src_stride..][..w];
let dst_row = &mut dst[y * dst_stride..][..w];
for x in 0..w {
let r0 = src_row0[x] as i32;
let r1 = src_row1[x] as i32;
let pixel = coeff0 * r0 + coeff1 * r1;
dst_row[x] = ((pixel + 8) >> 4).clamp(0, bitdepth_max) as u16;
}
}
}
(_, 0) => {
let coeff0 = 16 - mx;
let coeff1 = mx;
for y in 0..h {
let src_row = &src[y * src_stride..][..w + 1];
let dst_row = &mut dst[y * dst_stride..][..w];
for x in 0..w {
let s0 = src_row[x] as i32;
let s1 = src_row[x + 1] as i32;
let pixel = coeff0 * s0 + coeff1 * s1;
dst_row[x] = ((pixel + 8) >> 4).clamp(0, bitdepth_max) as u16;
}
}
}
(_, _) => {
let h_coeff0 = 16 - mx;
let h_coeff1 = mx;
let v_coeff0 = 16 - my;
let v_coeff1 = my;
let mid_stride = w + 16;
let mut mid = vec![0i32; mid_stride * (h + 1)];
for y in 0..h + 1 {
let src_row = &src[y * src_stride..];
let mid_row = &mut mid[y * mid_stride..][..w];
for x in 0..w {
let s0 = src_row[x] as i32;
let s1 = src_row[x + 1] as i32;
mid_row[x] = h_coeff0 * s0 + h_coeff1 * s1;
}
}
for y in 0..h {
let mid_row0 = &mid[y * mid_stride..][..w];
let mid_row1 = &mid[(y + 1) * mid_stride..][..w];
let dst_row = &mut dst[y * dst_stride..][..w];
for x in 0..w {
let r0 = mid_row0[x];
let r1 = mid_row1[x];
let pixel = v_coeff0 * r0 + v_coeff1 * r1;
dst_row[x] = ((pixel + 128) >> 8).clamp(0, bitdepth_max) as u16;
}
}
}
}
}
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn put_bilin_16bpc_neon(
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
_src: *const FFISafe<PicOffset>,
) {
let w = w as usize;
let h = h as usize;
let dst_stride_u16 = (dst_stride / 2) as usize;
let src_stride_u16 = (src_stride / 2) as usize;
let (token, dst, src) = unsafe {
let token = unsafe { Arm64::forge_token_dangerously() };
let dst = std::slice::from_raw_parts_mut(dst_ptr as *mut u16, h * dst_stride_u16);
let src =
std::slice::from_raw_parts(src_ptr as *const u16, (h + 1) * src_stride_u16 + w + 1);
(token, dst, src)
};
put_bilin_16bpc_inner(
token,
dst,
dst_stride_u16,
src,
src_stride_u16,
w,
h,
mx,
my,
bitdepth_max,
);
}
const PREP_BIAS_16BPC: i32 = 8192;
#[cfg(target_arch = "aarch64")]
#[arcane]
fn prep_bilin_16bpc_inner(
_token: Arm64,
tmp: &mut [i16],
src: &[u16],
src_stride: usize,
w: usize,
h: usize,
mx: i32,
my: i32,
) {
let mut tmp = tmp.flex_mut();
let src = src.flex();
match (mx, my) {
(0, 0) => {
for y in 0..h {
let src_row = &src[y * src_stride..][..w];
let tmp_row = &mut tmp[y * w..][..w];
for x in 0..w {
tmp_row[x] = (src_row[x] as i32 - PREP_BIAS_16BPC) as i16;
}
}
}
(0, _) => {
let coeff0 = 16 - my;
let coeff1 = my;
for y in 0..h {
let src_row0 = &src[y * src_stride..][..w];
let src_row1 = &src[(y + 1) * src_stride..][..w];
let tmp_row = &mut tmp[y * w..][..w];
for x in 0..w {
let r0 = src_row0[x] as i32;
let r1 = src_row1[x] as i32;
let pixel = coeff0 * r0 + coeff1 * r1;
tmp_row[x] = ((pixel >> 4) - PREP_BIAS_16BPC) as i16;
}
}
}
(_, 0) => {
let coeff0 = 16 - mx;
let coeff1 = mx;
for y in 0..h {
let src_row = &src[y * src_stride..][..w + 1];
let tmp_row = &mut tmp[y * w..][..w];
for x in 0..w {
let s0 = src_row[x] as i32;
let s1 = src_row[x + 1] as i32;
let pixel = coeff0 * s0 + coeff1 * s1;
tmp_row[x] = ((pixel >> 4) - PREP_BIAS_16BPC) as i16;
}
}
}
(_, _) => {
let h_coeff0 = 16 - mx;
let h_coeff1 = mx;
let v_coeff0 = 16 - my;
let v_coeff1 = my;
let mid_stride = w + 16;
let mut mid = vec![0i32; mid_stride * (h + 1)];
for y in 0..h + 1 {
let src_row = &src[y * src_stride..];
let mid_row = &mut mid[y * mid_stride..][..w];
for x in 0..w {
let s0 = src_row[x] as i32;
let s1 = src_row[x + 1] as i32;
mid_row[x] = h_coeff0 * s0 + h_coeff1 * s1;
}
}
for y in 0..h {
let mid_row0 = &mid[y * mid_stride..][..w];
let mid_row1 = &mid[(y + 1) * mid_stride..][..w];
let tmp_row = &mut tmp[y * w..][..w];
for x in 0..w {
let r0 = mid_row0[x];
let r1 = mid_row1[x];
let pixel = v_coeff0 * r0 + v_coeff1 * r1;
tmp_row[x] = ((pixel >> 8) - PREP_BIAS_16BPC) as i16;
}
}
}
}
}
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn prep_bilin_16bpc_neon(
tmp: *mut i16,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
_bitdepth_max: i32,
_src: *const FFISafe<PicOffset>,
) {
let w = w as usize;
let h = h as usize;
let src_stride_u16 = (src_stride / 2) as usize;
let (token, src, tmp_slice) = unsafe {
let token = unsafe { Arm64::forge_token_dangerously() };
let src =
std::slice::from_raw_parts(src_ptr as *const u16, (h + 1) * src_stride_u16 + w + 1);
let tmp_slice = std::slice::from_raw_parts_mut(tmp, h * w);
(token, src, tmp_slice)
};
prep_bilin_16bpc_inner(token, tmp_slice, src, src_stride_u16, w, h, mx, my);
}
#[cfg(target_arch = "aarch64")]
fn w_mask_16bpc_inner(
dst: &mut [u16],
dst_stride: usize,
tmp1: &[i16],
tmp2: &[i16],
w: usize,
h: usize,
mask: &mut [u8],
sign: u8,
bitdepth_max: i32,
ss_hor: bool,
ss_ver: bool,
) {
let bitdepth = if bitdepth_max == 1023 { 10u32 } else { 12u32 };
let intermediate_bits = 4i32;
let sh = intermediate_bits + 6;
let rnd = (32i32 << intermediate_bits) + 8192 * 64;
let mask_sh = (bitdepth as i32 + intermediate_bits - 4) as u32;
let mask_rnd = 1u16 << (mask_sh - 5);
let mask_w = if ss_hor { w >> 1 } else { w };
for y in 0..h {
let tmp1_row = &tmp1[y * w..][..w];
let tmp2_row = &tmp2[y * w..][..w];
let dst_row = &mut dst[y * dst_stride..][..w];
let mut mask_row = if ss_ver && (y & 1) != 0 {
None
} else {
let mask_y = if ss_ver { y >> 1 } else { y };
Some(&mut mask[mask_y * mask_w..][..mask_w])
};
let mut col = 0;
while col < w {
let t1 = tmp1_row[col] as i32;
let t2 = tmp2_row[col] as i32;
let diff = t1.abs_diff(t2) as u16;
let m = std::cmp::min(38 + ((diff.saturating_add(mask_rnd)) >> mask_sh), 64) as u8;
let m_final = if sign != 0 { 64 - m } else { m };
let inv_m = 64 - m_final;
let pixel = (t1 * m_final as i32 + t2 * inv_m as i32 + rnd) >> sh;
dst_row[col] = pixel.clamp(0, bitdepth_max) as u16;
if let Some(ref mut mask_row) = mask_row {
if !ss_hor {
mask_row[col] = m;
} else if (col & 1) == 0 {
let mask_idx = col >> 1;
if col + 1 < w {
let t1_next = tmp1_row[col + 1] as i32;
let t2_next = tmp2_row[col + 1] as i32;
let diff_next = t1_next.abs_diff(t2_next) as u16;
let m_next = std::cmp::min(
38 + ((diff_next.saturating_add(mask_rnd)) >> mask_sh),
64,
) as u8;
mask_row[mask_idx] = ((m as u16 + m_next as u16 + 1) >> 1) as u8;
} else {
mask_row[mask_idx] = m;
}
}
}
col += 1;
}
}
}
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn w_mask_444_16bpc_neon(
dst_ptr: *mut DynPixel,
dst_stride: isize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
mask: &mut [u8; SEG_MASK_LEN],
sign: i32,
bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
) {
let w = w as usize;
let h = h as usize;
let dst_stride_u16 = (dst_stride / 2) as usize;
let dst = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, h * dst_stride_u16) };
w_mask_16bpc_inner(
dst,
dst_stride_u16,
tmp1.as_slice(),
tmp2.as_slice(),
w,
h,
mask.as_mut_slice(),
sign as u8,
bitdepth_max,
false,
false,
);
}
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn w_mask_422_16bpc_neon(
dst_ptr: *mut DynPixel,
dst_stride: isize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
mask: &mut [u8; SEG_MASK_LEN],
sign: i32,
bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
) {
let w = w as usize;
let h = h as usize;
let dst_stride_u16 = (dst_stride / 2) as usize;
let dst = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, h * dst_stride_u16) };
w_mask_16bpc_inner(
dst,
dst_stride_u16,
tmp1.as_slice(),
tmp2.as_slice(),
w,
h,
mask.as_mut_slice(),
sign as u8,
bitdepth_max,
true,
false,
);
}
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn w_mask_420_16bpc_neon(
dst_ptr: *mut DynPixel,
dst_stride: isize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
mask: &mut [u8; SEG_MASK_LEN],
sign: i32,
bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
) {
let w = w as usize;
let h = h as usize;
let dst_stride_u16 = (dst_stride / 2) as usize;
let dst = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, h * dst_stride_u16) };
w_mask_16bpc_inner(
dst,
dst_stride_u16,
tmp1.as_slice(),
tmp2.as_slice(),
w,
h,
mask.as_mut_slice(),
sign as u8,
bitdepth_max,
true,
true,
);
}
const MID_STRIDE: usize = 128 + 16;
fn get_filter_coeff(m: usize, d: usize, filter_type: Rav1dFilterMode) -> Option<&'static [i8; 8]> {
let m = m.checked_sub(1)?;
let i = if d > 4 {
filter_type as u8
} else {
3 + (filter_type as u8 & 1)
};
Some(&dav1d_mc_subpel_filters[i as usize][m])
}
#[cfg(target_arch = "aarch64")]
#[arcane]
#[allow(clippy::too_many_arguments)]
fn h_filter_8tap_8bpc_neon(
_token: Arm64,
dst: &mut [i16],
src: &[u8],
w: usize,
filter: &[i8; 8],
sh: u8,
) {
let mut dst = dst.flex_mut();
let src = src.flex();
let rnd = (1i16 << sh) >> 1;
let mut col = 0;
while col + 8 <= w {
let c0 = filter[0] as i16;
let c1 = filter[1] as i16;
let c2 = filter[2] as i16;
let c3 = filter[3] as i16;
let c4 = filter[4] as i16;
let c5 = filter[5] as i16;
let c6 = filter[6] as i16;
let c7 = filter[7] as i16;
let s0 = safe_simd::vld1_u8(src[col..][..8].try_into().unwrap());
let s1 = safe_simd::vld1_u8(src[col + 1..][..8].try_into().unwrap());
let s2 = safe_simd::vld1_u8(src[col + 2..][..8].try_into().unwrap());
let s3 = safe_simd::vld1_u8(src[col + 3..][..8].try_into().unwrap());
let s4 = safe_simd::vld1_u8(src[col + 4..][..8].try_into().unwrap());
let s5 = safe_simd::vld1_u8(src[col + 5..][..8].try_into().unwrap());
let s6 = safe_simd::vld1_u8(src[col + 6..][..8].try_into().unwrap());
let s7 = safe_simd::vld1_u8(src[col + 7..][..8].try_into().unwrap());
let s0_16 = vreinterpretq_s16_u16(vmovl_u8(s0));
let s1_16 = vreinterpretq_s16_u16(vmovl_u8(s1));
let s2_16 = vreinterpretq_s16_u16(vmovl_u8(s2));
let s3_16 = vreinterpretq_s16_u16(vmovl_u8(s3));
let s4_16 = vreinterpretq_s16_u16(vmovl_u8(s4));
let s5_16 = vreinterpretq_s16_u16(vmovl_u8(s5));
let s6_16 = vreinterpretq_s16_u16(vmovl_u8(s6));
let s7_16 = vreinterpretq_s16_u16(vmovl_u8(s7));
let mut sum = vmulq_n_s16(s0_16, c0);
sum = vmlaq_n_s16(sum, s1_16, c1);
sum = vmlaq_n_s16(sum, s2_16, c2);
sum = vmlaq_n_s16(sum, s3_16, c3);
sum = vmlaq_n_s16(sum, s4_16, c4);
sum = vmlaq_n_s16(sum, s5_16, c5);
sum = vmlaq_n_s16(sum, s6_16, c6);
sum = vmlaq_n_s16(sum, s7_16, c7);
let rnd_vec = vdupq_n_s16(rnd);
let result = vshrq_n_s16::<2>(vaddq_s16(sum, rnd_vec));
let dst_arr: &mut [i16; 8] = (&mut dst[col..col + 8]).try_into().unwrap();
safe_simd::vst1q_s16(dst_arr, result);
col += 8;
}
while col < w {
let mut sum = 0i32;
for i in 0..8 {
sum += filter[i] as i32 * src[col + i] as i32;
}
dst[col] = ((sum + (rnd as i32)) >> sh) as i16;
col += 1;
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
#[allow(clippy::too_many_arguments)]
fn v_filter_8tap_8bpc_neon(
_token: Arm64,
dst: &mut [u8],
mid: &[[i16; MID_STRIDE]],
w: usize,
filter: &[i8; 8],
sh: u8,
max: u16,
) {
let mut dst = dst.flex_mut();
let rnd = (1i32 << sh) >> 1;
let _ = max;
let mut col = 0;
while col + 8 <= w {
let c0 = filter[0] as i32;
let c1 = filter[1] as i32;
let c2 = filter[2] as i32;
let c3 = filter[3] as i32;
let c4 = filter[4] as i32;
let c5 = filter[5] as i32;
let c6 = filter[6] as i32;
let c7 = filter[7] as i32;
let r0 = safe_simd::vld1q_s16(mid[0][col..][..8].try_into().unwrap());
let r1 = safe_simd::vld1q_s16(mid[1][col..][..8].try_into().unwrap());
let r2 = safe_simd::vld1q_s16(mid[2][col..][..8].try_into().unwrap());
let r3 = safe_simd::vld1q_s16(mid[3][col..][..8].try_into().unwrap());
let r4 = safe_simd::vld1q_s16(mid[4][col..][..8].try_into().unwrap());
let r5 = safe_simd::vld1q_s16(mid[5][col..][..8].try_into().unwrap());
let r6 = safe_simd::vld1q_s16(mid[6][col..][..8].try_into().unwrap());
let r7 = safe_simd::vld1q_s16(mid[7][col..][..8].try_into().unwrap());
let r0_lo = vmovl_s16(vget_low_s16(r0));
let r0_hi = vmovl_s16(vget_high_s16(r0));
let r1_lo = vmovl_s16(vget_low_s16(r1));
let r1_hi = vmovl_s16(vget_high_s16(r1));
let r2_lo = vmovl_s16(vget_low_s16(r2));
let r2_hi = vmovl_s16(vget_high_s16(r2));
let r3_lo = vmovl_s16(vget_low_s16(r3));
let r3_hi = vmovl_s16(vget_high_s16(r3));
let r4_lo = vmovl_s16(vget_low_s16(r4));
let r4_hi = vmovl_s16(vget_high_s16(r4));
let r5_lo = vmovl_s16(vget_low_s16(r5));
let r5_hi = vmovl_s16(vget_high_s16(r5));
let r6_lo = vmovl_s16(vget_low_s16(r6));
let r6_hi = vmovl_s16(vget_high_s16(r6));
let r7_lo = vmovl_s16(vget_low_s16(r7));
let r7_hi = vmovl_s16(vget_high_s16(r7));
let mut sum_lo = vmulq_n_s32(r0_lo, c0);
sum_lo = vmlaq_n_s32(sum_lo, r1_lo, c1);
sum_lo = vmlaq_n_s32(sum_lo, r2_lo, c2);
sum_lo = vmlaq_n_s32(sum_lo, r3_lo, c3);
sum_lo = vmlaq_n_s32(sum_lo, r4_lo, c4);
sum_lo = vmlaq_n_s32(sum_lo, r5_lo, c5);
sum_lo = vmlaq_n_s32(sum_lo, r6_lo, c6);
sum_lo = vmlaq_n_s32(sum_lo, r7_lo, c7);
let mut sum_hi = vmulq_n_s32(r0_hi, c0);
sum_hi = vmlaq_n_s32(sum_hi, r1_hi, c1);
sum_hi = vmlaq_n_s32(sum_hi, r2_hi, c2);
sum_hi = vmlaq_n_s32(sum_hi, r3_hi, c3);
sum_hi = vmlaq_n_s32(sum_hi, r4_hi, c4);
sum_hi = vmlaq_n_s32(sum_hi, r5_hi, c5);
sum_hi = vmlaq_n_s32(sum_hi, r6_hi, c6);
sum_hi = vmlaq_n_s32(sum_hi, r7_hi, c7);
let rnd_vec = vdupq_n_s32(rnd);
sum_lo = vaddq_s32(sum_lo, rnd_vec);
sum_hi = vaddq_s32(sum_hi, rnd_vec);
let result_lo = vshrq_n_s32::<10>(sum_lo);
let result_hi = vshrq_n_s32::<10>(sum_hi);
let result_16 = vcombine_s16(vqmovn_s32(result_lo), vqmovn_s32(result_hi));
let result_8 = vqmovun_s16(result_16);
let dst_arr: &mut [u8; 8] = (&mut dst[col..col + 8]).try_into().unwrap();
safe_simd::vst1_u8(dst_arr, result_8);
col += 8;
}
while col < w {
let mut sum = 0i32;
for i in 0..8 {
sum += filter[i] as i32 * mid[i][col] as i32;
}
dst[col] = ((sum + rnd) >> sh).clamp(0, 255) as u8;
col += 1;
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
fn h_filter_8tap_8bpc_put_neon(
_token: Arm64,
dst: &mut [u8],
src: &[u8],
w: usize,
filter: &[i8; 8],
) {
let mut dst = dst.flex_mut();
let src = src.flex();
let mut col = 0;
while col + 8 <= w {
let c0 = filter[0] as i16;
let c1 = filter[1] as i16;
let c2 = filter[2] as i16;
let c3 = filter[3] as i16;
let c4 = filter[4] as i16;
let c5 = filter[5] as i16;
let c6 = filter[6] as i16;
let c7 = filter[7] as i16;
let s0 = safe_simd::vld1_u8(src[col..][..8].try_into().unwrap());
let s1 = safe_simd::vld1_u8(src[col + 1..][..8].try_into().unwrap());
let s2 = safe_simd::vld1_u8(src[col + 2..][..8].try_into().unwrap());
let s3 = safe_simd::vld1_u8(src[col + 3..][..8].try_into().unwrap());
let s4 = safe_simd::vld1_u8(src[col + 4..][..8].try_into().unwrap());
let s5 = safe_simd::vld1_u8(src[col + 5..][..8].try_into().unwrap());
let s6 = safe_simd::vld1_u8(src[col + 6..][..8].try_into().unwrap());
let s7 = safe_simd::vld1_u8(src[col + 7..][..8].try_into().unwrap());
let s0_16 = vreinterpretq_s16_u16(vmovl_u8(s0));
let s1_16 = vreinterpretq_s16_u16(vmovl_u8(s1));
let s2_16 = vreinterpretq_s16_u16(vmovl_u8(s2));
let s3_16 = vreinterpretq_s16_u16(vmovl_u8(s3));
let s4_16 = vreinterpretq_s16_u16(vmovl_u8(s4));
let s5_16 = vreinterpretq_s16_u16(vmovl_u8(s5));
let s6_16 = vreinterpretq_s16_u16(vmovl_u8(s6));
let s7_16 = vreinterpretq_s16_u16(vmovl_u8(s7));
let mut sum = vmulq_n_s16(s0_16, c0);
sum = vmlaq_n_s16(sum, s1_16, c1);
sum = vmlaq_n_s16(sum, s2_16, c2);
sum = vmlaq_n_s16(sum, s3_16, c3);
sum = vmlaq_n_s16(sum, s4_16, c4);
sum = vmlaq_n_s16(sum, s5_16, c5);
sum = vmlaq_n_s16(sum, s6_16, c6);
sum = vmlaq_n_s16(sum, s7_16, c7);
let rnd_vec = vdupq_n_s16(32);
let result = vshrq_n_s16::<6>(vaddq_s16(sum, rnd_vec));
let result_8 = vqmovun_s16(result);
let dst_arr: &mut [u8; 8] = (&mut dst[col..col + 8]).try_into().unwrap();
safe_simd::vst1_u8(dst_arr, result_8);
col += 8;
}
while col < w {
let mut sum = 0i32;
for i in 0..8 {
sum += filter[i] as i32 * src[col + i] as i32;
}
dst[col] = ((sum + 32) >> 6).clamp(0, 255) as u8;
col += 1;
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
#[allow(clippy::too_many_arguments)]
fn v_filter_8tap_8bpc_direct_neon(
_token: Arm64,
dst: &mut [u8],
src: &[u8],
src_stride: usize,
w: usize,
filter: &[i8; 8],
) {
let mut dst = dst.flex_mut();
let src = src.flex();
let mut col = 0;
while col + 8 <= w {
let c0 = filter[0] as i32;
let c1 = filter[1] as i32;
let c2 = filter[2] as i32;
let c3 = filter[3] as i32;
let c4 = filter[4] as i32;
let c5 = filter[5] as i32;
let c6 = filter[6] as i32;
let c7 = filter[7] as i32;
let r0 = safe_simd::vld1_u8(src[col..][..8].try_into().unwrap());
let r1 = safe_simd::vld1_u8(src[col + src_stride..][..8].try_into().unwrap());
let r2 = safe_simd::vld1_u8(src[col + 2 * src_stride..][..8].try_into().unwrap());
let r3 = safe_simd::vld1_u8(src[col + 3 * src_stride..][..8].try_into().unwrap());
let r4 = safe_simd::vld1_u8(src[col + 4 * src_stride..][..8].try_into().unwrap());
let r5 = safe_simd::vld1_u8(src[col + 5 * src_stride..][..8].try_into().unwrap());
let r6 = safe_simd::vld1_u8(src[col + 6 * src_stride..][..8].try_into().unwrap());
let r7 = safe_simd::vld1_u8(src[col + 7 * src_stride..][..8].try_into().unwrap());
let r0_16 = vreinterpretq_s16_u16(vmovl_u8(r0));
let r1_16 = vreinterpretq_s16_u16(vmovl_u8(r1));
let r2_16 = vreinterpretq_s16_u16(vmovl_u8(r2));
let r3_16 = vreinterpretq_s16_u16(vmovl_u8(r3));
let r4_16 = vreinterpretq_s16_u16(vmovl_u8(r4));
let r5_16 = vreinterpretq_s16_u16(vmovl_u8(r5));
let r6_16 = vreinterpretq_s16_u16(vmovl_u8(r6));
let r7_16 = vreinterpretq_s16_u16(vmovl_u8(r7));
let r0_lo = vmovl_s16(vget_low_s16(r0_16));
let r1_lo = vmovl_s16(vget_low_s16(r1_16));
let r2_lo = vmovl_s16(vget_low_s16(r2_16));
let r3_lo = vmovl_s16(vget_low_s16(r3_16));
let r4_lo = vmovl_s16(vget_low_s16(r4_16));
let r5_lo = vmovl_s16(vget_low_s16(r5_16));
let r6_lo = vmovl_s16(vget_low_s16(r6_16));
let r7_lo = vmovl_s16(vget_low_s16(r7_16));
let r0_hi = vmovl_s16(vget_high_s16(r0_16));
let r1_hi = vmovl_s16(vget_high_s16(r1_16));
let r2_hi = vmovl_s16(vget_high_s16(r2_16));
let r3_hi = vmovl_s16(vget_high_s16(r3_16));
let r4_hi = vmovl_s16(vget_high_s16(r4_16));
let r5_hi = vmovl_s16(vget_high_s16(r5_16));
let r6_hi = vmovl_s16(vget_high_s16(r6_16));
let r7_hi = vmovl_s16(vget_high_s16(r7_16));
let mut sum_lo = vmulq_n_s32(r0_lo, c0);
sum_lo = vmlaq_n_s32(sum_lo, r1_lo, c1);
sum_lo = vmlaq_n_s32(sum_lo, r2_lo, c2);
sum_lo = vmlaq_n_s32(sum_lo, r3_lo, c3);
sum_lo = vmlaq_n_s32(sum_lo, r4_lo, c4);
sum_lo = vmlaq_n_s32(sum_lo, r5_lo, c5);
sum_lo = vmlaq_n_s32(sum_lo, r6_lo, c6);
sum_lo = vmlaq_n_s32(sum_lo, r7_lo, c7);
let mut sum_hi = vmulq_n_s32(r0_hi, c0);
sum_hi = vmlaq_n_s32(sum_hi, r1_hi, c1);
sum_hi = vmlaq_n_s32(sum_hi, r2_hi, c2);
sum_hi = vmlaq_n_s32(sum_hi, r3_hi, c3);
sum_hi = vmlaq_n_s32(sum_hi, r4_hi, c4);
sum_hi = vmlaq_n_s32(sum_hi, r5_hi, c5);
sum_hi = vmlaq_n_s32(sum_hi, r6_hi, c6);
sum_hi = vmlaq_n_s32(sum_hi, r7_hi, c7);
let rnd_vec = vdupq_n_s32(32);
sum_lo = vshrq_n_s32::<6>(vaddq_s32(sum_lo, rnd_vec));
sum_hi = vshrq_n_s32::<6>(vaddq_s32(sum_hi, rnd_vec));
let result_16 = vcombine_s16(vqmovn_s32(sum_lo), vqmovn_s32(sum_hi));
let result_8 = vqmovun_s16(result_16);
let dst_arr: &mut [u8; 8] = (&mut dst[col..col + 8]).try_into().unwrap();
safe_simd::vst1_u8(dst_arr, result_8);
col += 8;
}
while col < w {
let mut sum = 0i32;
for i in 0..8 {
sum += filter[i] as i32 * src[col + i * src_stride] as i32;
}
dst[col] = ((sum + 32) >> 6).clamp(0, 255) as u8;
col += 1;
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
#[allow(clippy::too_many_arguments)]
fn put_8tap_8bpc_inner(
token: Arm64,
dst: &mut [u8],
dst_stride: usize,
src: &[u8],
src_stride: usize,
w: usize,
h: usize,
mx: usize,
my: usize,
h_filter_type: Rav1dFilterMode,
v_filter_type: Rav1dFilterMode,
) {
let mut dst = dst.flex_mut();
let src = src.flex();
let intermediate_bits = 4u8;
let fh = get_filter_coeff(mx, w, h_filter_type);
let fv = get_filter_coeff(my, h, v_filter_type);
match (fh, fv) {
(Some(fh), Some(fv)) => {
let tmp_h = h + 7;
let mut mid = [[0i16; MID_STRIDE]; 135];
for y in 0..tmp_h {
let src_offset = if y >= 3 {
(y - 3) * src_stride
} else {
0usize.wrapping_sub((3 - y) * src_stride)
};
let src_row = &src[src_offset..];
h_filter_8tap_8bpc_neon(
token,
&mut mid[y][..w],
src_row,
w,
fh,
6 - intermediate_bits,
);
}
for y in 0..h {
let dst_row = &mut dst[y * dst_stride..][..w];
v_filter_8tap_8bpc_neon(
token,
dst_row,
&mid[y..],
w,
fv,
6 + intermediate_bits,
255,
);
}
}
(Some(fh), None) => {
for y in 0..h {
let src_row = &src[y * src_stride..];
let dst_row = &mut dst[y * dst_stride..][..w];
h_filter_8tap_8bpc_put_neon(token, dst_row, src_row, w, fh);
}
}
(None, Some(fv)) => {
for y in 0..h {
let src_offset = if y >= 3 {
(y - 3) * src_stride
} else {
0usize.wrapping_sub((3 - y) * src_stride)
};
let src_row = &src[src_offset..];
let dst_row = &mut dst[y * dst_stride..][..w];
v_filter_8tap_8bpc_direct_neon(token, dst_row, src_row, src_stride, w, fv);
}
}
(None, None) => {
for y in 0..h {
let src_row = &src[y * src_stride..][..w];
let dst_row = &mut dst[y * dst_stride..][..w];
dst_row.copy_from_slice(src_row);
}
}
}
}
#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn get_h_filter_type(filter: Filter2d) -> Rav1dFilterMode {
match filter {
Filter2d::Regular8Tap | Filter2d::RegularSmooth8Tap | Filter2d::RegularSharp8Tap => {
Rav1dFilterMode::Regular8Tap
}
Filter2d::Smooth8Tap | Filter2d::SmoothRegular8Tap | Filter2d::SmoothSharp8Tap => {
Rav1dFilterMode::Smooth8Tap
}
Filter2d::Sharp8Tap | Filter2d::SharpRegular8Tap | Filter2d::SharpSmooth8Tap => {
Rav1dFilterMode::Sharp8Tap
}
Filter2d::Bilinear => Rav1dFilterMode::Regular8Tap, }
}
#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn get_v_filter_type(filter: Filter2d) -> Rav1dFilterMode {
match filter {
Filter2d::Regular8Tap | Filter2d::SmoothRegular8Tap | Filter2d::SharpRegular8Tap => {
Rav1dFilterMode::Regular8Tap
}
Filter2d::Smooth8Tap | Filter2d::RegularSmooth8Tap | Filter2d::SharpSmooth8Tap => {
Rav1dFilterMode::Smooth8Tap
}
Filter2d::Sharp8Tap | Filter2d::RegularSharp8Tap | Filter2d::SmoothSharp8Tap => {
Rav1dFilterMode::Sharp8Tap
}
Filter2d::Bilinear => Rav1dFilterMode::Regular8Tap, }
}
macro_rules! define_put_8tap_8bpc {
($name:ident, $filter:expr) => {
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn $name(
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
_bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
_src: *const FFISafe<PicOffset>,
) {
let token = unsafe { Arm64::forge_token_dangerously() };
let w = w as usize;
let h = h as usize;
let mx = mx as usize;
let my = my as usize;
let dst_stride_u = dst_stride as usize;
let src_stride_u = src_stride as usize;
let src_base = (src_ptr as *const u8).offset(-3 * src_stride - 3);
let src_len = (h + 7) * src_stride_u + w + 7;
let src = std::slice::from_raw_parts(src_base, src_len);
let dst_len = h * dst_stride_u + w;
let dst = std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len);
let src_adjusted = &src[3 * src_stride_u + 3..];
put_8tap_8bpc_inner(
token,
dst,
dst_stride_u,
src_adjusted,
src_stride_u,
w,
h,
mx,
my,
get_h_filter_type($filter),
get_v_filter_type($filter),
);
}
};
}
define_put_8tap_8bpc!(put_8tap_regular_8bpc_neon, Filter2d::Regular8Tap);
define_put_8tap_8bpc!(
put_8tap_regular_smooth_8bpc_neon,
Filter2d::RegularSmooth8Tap
);
define_put_8tap_8bpc!(put_8tap_regular_sharp_8bpc_neon, Filter2d::RegularSharp8Tap);
define_put_8tap_8bpc!(
put_8tap_smooth_regular_8bpc_neon,
Filter2d::SmoothRegular8Tap
);
define_put_8tap_8bpc!(put_8tap_smooth_8bpc_neon, Filter2d::Smooth8Tap);
define_put_8tap_8bpc!(put_8tap_smooth_sharp_8bpc_neon, Filter2d::SmoothSharp8Tap);
define_put_8tap_8bpc!(put_8tap_sharp_regular_8bpc_neon, Filter2d::SharpRegular8Tap);
define_put_8tap_8bpc!(put_8tap_sharp_smooth_8bpc_neon, Filter2d::SharpSmooth8Tap);
define_put_8tap_8bpc!(put_8tap_sharp_8bpc_neon, Filter2d::Sharp8Tap);
#[cfg(target_arch = "aarch64")]
#[arcane]
#[allow(clippy::too_many_arguments)]
fn v_filter_8tap_to_i16_neon(
_token: Arm64,
dst: &mut [i16],
mid: &[[i16; MID_STRIDE]],
w: usize,
filter: &[i8; 8],
sh: u8,
) {
let mut dst = dst.flex_mut();
let rnd = (1i32 << sh) >> 1;
let mut col = 0;
while col + 4 <= w {
let c0 = filter[0] as i32;
let c1 = filter[1] as i32;
let c2 = filter[2] as i32;
let c3 = filter[3] as i32;
let c4 = filter[4] as i32;
let c5 = filter[5] as i32;
let c6 = filter[6] as i32;
let c7 = filter[7] as i32;
let r0 = safe_simd::vld1_s16(mid[0][col..][..4].try_into().unwrap());
let r1 = safe_simd::vld1_s16(mid[1][col..][..4].try_into().unwrap());
let r2 = safe_simd::vld1_s16(mid[2][col..][..4].try_into().unwrap());
let r3 = safe_simd::vld1_s16(mid[3][col..][..4].try_into().unwrap());
let r4 = safe_simd::vld1_s16(mid[4][col..][..4].try_into().unwrap());
let r5 = safe_simd::vld1_s16(mid[5][col..][..4].try_into().unwrap());
let r6 = safe_simd::vld1_s16(mid[6][col..][..4].try_into().unwrap());
let r7 = safe_simd::vld1_s16(mid[7][col..][..4].try_into().unwrap());
let r0_32 = vmovl_s16(r0);
let r1_32 = vmovl_s16(r1);
let r2_32 = vmovl_s16(r2);
let r3_32 = vmovl_s16(r3);
let r4_32 = vmovl_s16(r4);
let r5_32 = vmovl_s16(r5);
let r6_32 = vmovl_s16(r6);
let r7_32 = vmovl_s16(r7);
let mut sum = vmulq_n_s32(r0_32, c0);
sum = vmlaq_n_s32(sum, r1_32, c1);
sum = vmlaq_n_s32(sum, r2_32, c2);
sum = vmlaq_n_s32(sum, r3_32, c3);
sum = vmlaq_n_s32(sum, r4_32, c4);
sum = vmlaq_n_s32(sum, r5_32, c5);
sum = vmlaq_n_s32(sum, r6_32, c6);
sum = vmlaq_n_s32(sum, r7_32, c7);
let rnd_vec = vdupq_n_s32(rnd);
sum = vshrq_n_s32::<10>(vaddq_s32(sum, rnd_vec));
let result = vqmovn_s32(sum);
let dst_arr: &mut [i16; 4] = (&mut dst[col..col + 4]).try_into().unwrap();
safe_simd::vst1_s16(dst_arr, result);
col += 4;
}
while col < w {
let mut sum = 0i32;
for i in 0..8 {
sum += filter[i] as i32 * mid[i][col] as i32;
}
dst[col] = ((sum + rnd) >> sh) as i16;
col += 1;
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
#[allow(clippy::too_many_arguments)]
fn prep_8tap_8bpc_inner(
token: Arm64,
tmp: &mut [i16],
src: &[u8],
src_stride: usize,
w: usize,
h: usize,
mx: usize,
my: usize,
h_filter_type: Rav1dFilterMode,
v_filter_type: Rav1dFilterMode,
) {
let mut tmp = tmp.flex_mut();
let src = src.flex();
let intermediate_bits = 4u8;
let fh = get_filter_coeff(mx, w, h_filter_type);
let fv = get_filter_coeff(my, h, v_filter_type);
match (fh, fv) {
(Some(fh), Some(fv)) => {
let tmp_h = h + 7;
let mut mid = [[0i16; MID_STRIDE]; 135];
for y in 0..tmp_h {
let src_offset = if y >= 3 {
(y - 3) * src_stride
} else {
0usize.wrapping_sub((3 - y) * src_stride)
};
let src_row = &src[src_offset..];
h_filter_8tap_8bpc_neon(
token,
&mut mid[y][..w],
src_row,
w,
fh,
6 - intermediate_bits,
);
}
for y in 0..h {
let out_row = &mut tmp[y * w..][..w];
v_filter_8tap_to_i16_neon(token, out_row, &mid[y..], w, fv, 6 + intermediate_bits);
}
}
(Some(fh), None) => {
for y in 0..h {
let src_row = &src[y * src_stride..];
let out_row = &mut tmp[y * w..][..w];
let mut col = 0;
while col + 8 <= w {
let c0 = fh[0] as i16;
let c1 = fh[1] as i16;
let c2 = fh[2] as i16;
let c3 = fh[3] as i16;
let c4 = fh[4] as i16;
let c5 = fh[5] as i16;
let c6 = fh[6] as i16;
let c7 = fh[7] as i16;
let s0 = safe_simd::vld1_u8(src_row[col..][..8].try_into().unwrap());
let s1 = safe_simd::vld1_u8(src_row[col + 1..][..8].try_into().unwrap());
let s2 = safe_simd::vld1_u8(src_row[col + 2..][..8].try_into().unwrap());
let s3 = safe_simd::vld1_u8(src_row[col + 3..][..8].try_into().unwrap());
let s4 = safe_simd::vld1_u8(src_row[col + 4..][..8].try_into().unwrap());
let s5 = safe_simd::vld1_u8(src_row[col + 5..][..8].try_into().unwrap());
let s6 = safe_simd::vld1_u8(src_row[col + 6..][..8].try_into().unwrap());
let s7 = safe_simd::vld1_u8(src_row[col + 7..][..8].try_into().unwrap());
let s0_16 = vreinterpretq_s16_u16(vmovl_u8(s0));
let s1_16 = vreinterpretq_s16_u16(vmovl_u8(s1));
let s2_16 = vreinterpretq_s16_u16(vmovl_u8(s2));
let s3_16 = vreinterpretq_s16_u16(vmovl_u8(s3));
let s4_16 = vreinterpretq_s16_u16(vmovl_u8(s4));
let s5_16 = vreinterpretq_s16_u16(vmovl_u8(s5));
let s6_16 = vreinterpretq_s16_u16(vmovl_u8(s6));
let s7_16 = vreinterpretq_s16_u16(vmovl_u8(s7));
let mut sum = vmulq_n_s16(s0_16, c0);
sum = vmlaq_n_s16(sum, s1_16, c1);
sum = vmlaq_n_s16(sum, s2_16, c2);
sum = vmlaq_n_s16(sum, s3_16, c3);
sum = vmlaq_n_s16(sum, s4_16, c4);
sum = vmlaq_n_s16(sum, s5_16, c5);
sum = vmlaq_n_s16(sum, s6_16, c6);
sum = vmlaq_n_s16(sum, s7_16, c7);
let rnd_vec = vdupq_n_s16(8);
let result = vshrq_n_s16::<4>(vaddq_s16(sum, rnd_vec));
let out_arr: &mut [i16; 8] = (&mut out_row[col..col + 8]).try_into().unwrap();
safe_simd::vst1q_s16(out_arr, result);
col += 8;
}
while col < w {
let mut sum = 0i32;
for i in 0..8 {
sum += fh[i] as i32 * src_row[col + i] as i32;
}
out_row[col] = ((sum + 8) >> intermediate_bits) as i16;
col += 1;
}
}
}
(None, Some(fv)) => {
for y in 0..h {
let out_row = &mut tmp[y * w..][..w];
let mut mid = [[0i16; MID_STRIDE]; 8];
for i in 0..8 {
let src_offset = if y + i >= 3 {
(y + i - 3) * src_stride
} else {
0usize.wrapping_sub((3 - y - i) * src_stride)
};
for x in 0..w {
mid[i][x] = (src[src_offset + x] as i16) << intermediate_bits;
}
}
v_filter_8tap_to_i16_neon(token, out_row, &mid, w, fv, 6);
}
}
(None, None) => {
for y in 0..h {
let src_row = &src[y * src_stride..][..w];
let out_row = &mut tmp[y * w..][..w];
for x in 0..w {
out_row[x] = (src_row[x] as i16) << intermediate_bits;
}
}
}
}
}
macro_rules! define_prep_8tap_8bpc {
($name:ident, $filter:expr) => {
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn $name(
tmp: *mut i16,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
_bitdepth_max: i32,
_src: *const FFISafe<PicOffset>,
) {
let token = unsafe { Arm64::forge_token_dangerously() };
let w = w as usize;
let h = h as usize;
let mx = mx as usize;
let my = my as usize;
let src_stride_u = src_stride as usize;
let src_base = (src_ptr as *const u8).offset(-3 * src_stride - 3);
let src_len = (h + 7) * src_stride_u + w + 7;
let src = std::slice::from_raw_parts(src_base, src_len);
let tmp_len = h * w;
let tmp_slice = std::slice::from_raw_parts_mut(tmp, tmp_len);
let src_adjusted = &src[3 * src_stride_u + 3..];
prep_8tap_8bpc_inner(
token,
tmp_slice,
src_adjusted,
src_stride_u,
w,
h,
mx,
my,
get_h_filter_type($filter),
get_v_filter_type($filter),
);
}
};
}
define_prep_8tap_8bpc!(prep_8tap_regular_8bpc_neon, Filter2d::Regular8Tap);
define_prep_8tap_8bpc!(
prep_8tap_regular_smooth_8bpc_neon,
Filter2d::RegularSmooth8Tap
);
define_prep_8tap_8bpc!(
prep_8tap_regular_sharp_8bpc_neon,
Filter2d::RegularSharp8Tap
);
define_prep_8tap_8bpc!(
prep_8tap_smooth_regular_8bpc_neon,
Filter2d::SmoothRegular8Tap
);
define_prep_8tap_8bpc!(prep_8tap_smooth_8bpc_neon, Filter2d::Smooth8Tap);
define_prep_8tap_8bpc!(prep_8tap_smooth_sharp_8bpc_neon, Filter2d::SmoothSharp8Tap);
define_prep_8tap_8bpc!(
prep_8tap_sharp_regular_8bpc_neon,
Filter2d::SharpRegular8Tap
);
define_prep_8tap_8bpc!(prep_8tap_sharp_smooth_8bpc_neon, Filter2d::SharpSmooth8Tap);
define_prep_8tap_8bpc!(prep_8tap_sharp_8bpc_neon, Filter2d::Sharp8Tap);
#[cfg(target_arch = "aarch64")]
#[arcane]
#[allow(clippy::too_many_arguments)]
fn h_filter_8tap_16bpc_neon(
_token: Arm64,
dst: &mut [i32],
src: &[u16],
w: usize,
filter: &[i8; 8],
sh: u8,
) {
let mut dst = dst.flex_mut();
let src = src.flex();
let rnd = (1i32 << sh) >> 1;
let mut col = 0;
while col + 4 <= w {
let c0 = filter[0] as i32;
let c1 = filter[1] as i32;
let c2 = filter[2] as i32;
let c3 = filter[3] as i32;
let c4 = filter[4] as i32;
let c5 = filter[5] as i32;
let c6 = filter[6] as i32;
let c7 = filter[7] as i32;
let s0 = safe_simd::vld1_u16(src[col..][..4].try_into().unwrap());
let s1 = safe_simd::vld1_u16(src[col + 1..][..4].try_into().unwrap());
let s2 = safe_simd::vld1_u16(src[col + 2..][..4].try_into().unwrap());
let s3 = safe_simd::vld1_u16(src[col + 3..][..4].try_into().unwrap());
let s4 = safe_simd::vld1_u16(src[col + 4..][..4].try_into().unwrap());
let s5 = safe_simd::vld1_u16(src[col + 5..][..4].try_into().unwrap());
let s6 = safe_simd::vld1_u16(src[col + 6..][..4].try_into().unwrap());
let s7 = safe_simd::vld1_u16(src[col + 7..][..4].try_into().unwrap());
let s0_32 = vreinterpretq_s32_u32(vmovl_u16(s0));
let s1_32 = vreinterpretq_s32_u32(vmovl_u16(s1));
let s2_32 = vreinterpretq_s32_u32(vmovl_u16(s2));
let s3_32 = vreinterpretq_s32_u32(vmovl_u16(s3));
let s4_32 = vreinterpretq_s32_u32(vmovl_u16(s4));
let s5_32 = vreinterpretq_s32_u32(vmovl_u16(s5));
let s6_32 = vreinterpretq_s32_u32(vmovl_u16(s6));
let s7_32 = vreinterpretq_s32_u32(vmovl_u16(s7));
let mut sum = vmulq_n_s32(s0_32, c0);
sum = vmlaq_n_s32(sum, s1_32, c1);
sum = vmlaq_n_s32(sum, s2_32, c2);
sum = vmlaq_n_s32(sum, s3_32, c3);
sum = vmlaq_n_s32(sum, s4_32, c4);
sum = vmlaq_n_s32(sum, s5_32, c5);
sum = vmlaq_n_s32(sum, s6_32, c6);
sum = vmlaq_n_s32(sum, s7_32, c7);
let rnd_vec = vdupq_n_s32(rnd);
let result = vshrq_n_s32::<4>(vaddq_s32(sum, rnd_vec));
let dst_arr: &mut [i32; 4] = (&mut dst[col..col + 4]).try_into().unwrap();
safe_simd::vst1q_s32(dst_arr, result);
col += 4;
}
while col < w {
let mut sum = 0i32;
for i in 0..8 {
sum += filter[i] as i32 * src[col + i] as i32;
}
dst[col] = (sum + rnd) >> sh;
col += 1;
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
#[allow(clippy::too_many_arguments)]
fn v_filter_8tap_16bpc_neon(
_token: Arm64,
dst: &mut [u16],
mid: &[[i32; MID_STRIDE]],
w: usize,
filter: &[i8; 8],
sh: u8,
max: u16,
) {
let mut dst = dst.flex_mut();
let rnd = (1i32 << sh) >> 1;
let mut col = 0;
while col + 4 <= w {
let c0 = filter[0] as i32;
let c1 = filter[1] as i32;
let c2 = filter[2] as i32;
let c3 = filter[3] as i32;
let c4 = filter[4] as i32;
let c5 = filter[5] as i32;
let c6 = filter[6] as i32;
let c7 = filter[7] as i32;
let r0 = safe_simd::vld1q_s32(mid[0][col..][..4].try_into().unwrap());
let r1 = safe_simd::vld1q_s32(mid[1][col..][..4].try_into().unwrap());
let r2 = safe_simd::vld1q_s32(mid[2][col..][..4].try_into().unwrap());
let r3 = safe_simd::vld1q_s32(mid[3][col..][..4].try_into().unwrap());
let r4 = safe_simd::vld1q_s32(mid[4][col..][..4].try_into().unwrap());
let r5 = safe_simd::vld1q_s32(mid[5][col..][..4].try_into().unwrap());
let r6 = safe_simd::vld1q_s32(mid[6][col..][..4].try_into().unwrap());
let r7 = safe_simd::vld1q_s32(mid[7][col..][..4].try_into().unwrap());
let mut sum = vmulq_n_s32(r0, c0);
sum = vmlaq_n_s32(sum, r1, c1);
sum = vmlaq_n_s32(sum, r2, c2);
sum = vmlaq_n_s32(sum, r3, c3);
sum = vmlaq_n_s32(sum, r4, c4);
sum = vmlaq_n_s32(sum, r5, c5);
sum = vmlaq_n_s32(sum, r6, c6);
sum = vmlaq_n_s32(sum, r7, c7);
let rnd_vec = vdupq_n_s32(rnd);
sum = vaddq_s32(sum, rnd_vec);
let result = vshrq_n_s32::<8>(sum);
let max_vec = vdupq_n_s32(max as i32);
let zero = vdupq_n_s32(0);
let clamped = vminq_s32(vmaxq_s32(result, zero), max_vec);
let narrow = vqmovun_s32(clamped);
let dst_arr: &mut [u16; 4] = (&mut dst[col..col + 4]).try_into().unwrap();
safe_simd::vst1_u16(dst_arr, narrow);
col += 4;
}
while col < w {
let mut sum = 0i64;
for i in 0..8 {
sum += filter[i] as i64 * mid[i][col] as i64;
}
dst[col] = (((sum + rnd as i64) >> sh) as i32).clamp(0, max as i32) as u16;
col += 1;
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
#[allow(clippy::too_many_arguments)]
fn h_filter_8tap_16bpc_put_neon(
_token: Arm64,
dst: &mut [u16],
src: &[u16],
w: usize,
filter: &[i8; 8],
max: u16,
) {
let mut dst = dst.flex_mut();
let src = src.flex();
let mut col = 0;
while col + 4 <= w {
let c0 = filter[0] as i32;
let c1 = filter[1] as i32;
let c2 = filter[2] as i32;
let c3 = filter[3] as i32;
let c4 = filter[4] as i32;
let c5 = filter[5] as i32;
let c6 = filter[6] as i32;
let c7 = filter[7] as i32;
let s0 = safe_simd::vld1_u16(src[col..][..4].try_into().unwrap());
let s1 = safe_simd::vld1_u16(src[col + 1..][..4].try_into().unwrap());
let s2 = safe_simd::vld1_u16(src[col + 2..][..4].try_into().unwrap());
let s3 = safe_simd::vld1_u16(src[col + 3..][..4].try_into().unwrap());
let s4 = safe_simd::vld1_u16(src[col + 4..][..4].try_into().unwrap());
let s5 = safe_simd::vld1_u16(src[col + 5..][..4].try_into().unwrap());
let s6 = safe_simd::vld1_u16(src[col + 6..][..4].try_into().unwrap());
let s7 = safe_simd::vld1_u16(src[col + 7..][..4].try_into().unwrap());
let s0_32 = vreinterpretq_s32_u32(vmovl_u16(s0));
let s1_32 = vreinterpretq_s32_u32(vmovl_u16(s1));
let s2_32 = vreinterpretq_s32_u32(vmovl_u16(s2));
let s3_32 = vreinterpretq_s32_u32(vmovl_u16(s3));
let s4_32 = vreinterpretq_s32_u32(vmovl_u16(s4));
let s5_32 = vreinterpretq_s32_u32(vmovl_u16(s5));
let s6_32 = vreinterpretq_s32_u32(vmovl_u16(s6));
let s7_32 = vreinterpretq_s32_u32(vmovl_u16(s7));
let mut sum = vmulq_n_s32(s0_32, c0);
sum = vmlaq_n_s32(sum, s1_32, c1);
sum = vmlaq_n_s32(sum, s2_32, c2);
sum = vmlaq_n_s32(sum, s3_32, c3);
sum = vmlaq_n_s32(sum, s4_32, c4);
sum = vmlaq_n_s32(sum, s5_32, c5);
sum = vmlaq_n_s32(sum, s6_32, c6);
sum = vmlaq_n_s32(sum, s7_32, c7);
let rnd_vec = vdupq_n_s32(32);
let result = vshrq_n_s32::<6>(vaddq_s32(sum, rnd_vec));
let max_vec = vdupq_n_s32(max as i32);
let zero = vdupq_n_s32(0);
let clamped = vminq_s32(vmaxq_s32(result, zero), max_vec);
let narrow = vqmovun_s32(clamped);
let dst_arr: &mut [u16; 4] = (&mut dst[col..col + 4]).try_into().unwrap();
safe_simd::vst1_u16(dst_arr, narrow);
col += 4;
}
while col < w {
let mut sum = 0i32;
for i in 0..8 {
sum += filter[i] as i32 * src[col + i] as i32;
}
dst[col] = ((sum + 32) >> 6).clamp(0, max as i32) as u16;
col += 1;
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
#[allow(clippy::too_many_arguments)]
fn v_filter_8tap_16bpc_direct_neon(
_token: Arm64,
dst: &mut [u16],
src: &[u16],
src_stride: usize,
w: usize,
filter: &[i8; 8],
max: u16,
) {
let mut dst = dst.flex_mut();
let src = src.flex();
let mut col = 0;
while col + 4 <= w {
let c0 = filter[0] as i32;
let c1 = filter[1] as i32;
let c2 = filter[2] as i32;
let c3 = filter[3] as i32;
let c4 = filter[4] as i32;
let c5 = filter[5] as i32;
let c6 = filter[6] as i32;
let c7 = filter[7] as i32;
let r0 = safe_simd::vld1_u16(src[col..][..4].try_into().unwrap());
let r1 = safe_simd::vld1_u16(src[col + src_stride..][..4].try_into().unwrap());
let r2 = safe_simd::vld1_u16(src[col + 2 * src_stride..][..4].try_into().unwrap());
let r3 = safe_simd::vld1_u16(src[col + 3 * src_stride..][..4].try_into().unwrap());
let r4 = safe_simd::vld1_u16(src[col + 4 * src_stride..][..4].try_into().unwrap());
let r5 = safe_simd::vld1_u16(src[col + 5 * src_stride..][..4].try_into().unwrap());
let r6 = safe_simd::vld1_u16(src[col + 6 * src_stride..][..4].try_into().unwrap());
let r7 = safe_simd::vld1_u16(src[col + 7 * src_stride..][..4].try_into().unwrap());
let r0_32 = vreinterpretq_s32_u32(vmovl_u16(r0));
let r1_32 = vreinterpretq_s32_u32(vmovl_u16(r1));
let r2_32 = vreinterpretq_s32_u32(vmovl_u16(r2));
let r3_32 = vreinterpretq_s32_u32(vmovl_u16(r3));
let r4_32 = vreinterpretq_s32_u32(vmovl_u16(r4));
let r5_32 = vreinterpretq_s32_u32(vmovl_u16(r5));
let r6_32 = vreinterpretq_s32_u32(vmovl_u16(r6));
let r7_32 = vreinterpretq_s32_u32(vmovl_u16(r7));
let mut sum = vmulq_n_s32(r0_32, c0);
sum = vmlaq_n_s32(sum, r1_32, c1);
sum = vmlaq_n_s32(sum, r2_32, c2);
sum = vmlaq_n_s32(sum, r3_32, c3);
sum = vmlaq_n_s32(sum, r4_32, c4);
sum = vmlaq_n_s32(sum, r5_32, c5);
sum = vmlaq_n_s32(sum, r6_32, c6);
sum = vmlaq_n_s32(sum, r7_32, c7);
let rnd_vec = vdupq_n_s32(32);
sum = vshrq_n_s32::<6>(vaddq_s32(sum, rnd_vec));
let max_vec = vdupq_n_s32(max as i32);
let zero = vdupq_n_s32(0);
let clamped = vminq_s32(vmaxq_s32(sum, zero), max_vec);
let narrow = vqmovun_s32(clamped);
let dst_arr: &mut [u16; 4] = (&mut dst[col..col + 4]).try_into().unwrap();
safe_simd::vst1_u16(dst_arr, narrow);
col += 4;
}
while col < w {
let mut sum = 0i32;
for i in 0..8 {
sum += filter[i] as i32 * src[col + i * src_stride] as i32;
}
dst[col] = ((sum + 32) >> 6).clamp(0, max as i32) as u16;
col += 1;
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
#[allow(clippy::too_many_arguments)]
fn put_8tap_16bpc_inner(
token: Arm64,
dst: &mut [u16],
dst_stride: usize,
src: &[u16],
src_stride: usize,
w: usize,
h: usize,
mx: usize,
my: usize,
h_filter_type: Rav1dFilterMode,
v_filter_type: Rav1dFilterMode,
bitdepth_max: u16,
) {
let mut dst = dst.flex_mut();
let src = src.flex();
let intermediate_bits = 4u8;
let fh = get_filter_coeff(mx, w, h_filter_type);
let fv = get_filter_coeff(my, h, v_filter_type);
match (fh, fv) {
(Some(fh), Some(fv)) => {
let tmp_h = h + 7;
let mut mid = [[0i32; MID_STRIDE]; 135];
for y in 0..tmp_h {
let src_offset = if y >= 3 {
(y - 3) * src_stride
} else {
0usize.wrapping_sub((3 - y) * src_stride)
};
let src_row = &src[src_offset..];
h_filter_8tap_16bpc_neon(
token,
&mut mid[y][..w],
src_row,
w,
fh,
6 - intermediate_bits,
);
}
for y in 0..h {
let dst_row = &mut dst[y * dst_stride..][..w];
v_filter_8tap_16bpc_neon(
token,
dst_row,
&mid[y..],
w,
fv,
6 + intermediate_bits,
bitdepth_max,
);
}
}
(Some(fh), None) => {
for y in 0..h {
let src_row = &src[y * src_stride..];
let dst_row = &mut dst[y * dst_stride..][..w];
h_filter_8tap_16bpc_put_neon(token, dst_row, src_row, w, fh, bitdepth_max);
}
}
(None, Some(fv)) => {
for y in 0..h {
let src_offset = if y >= 3 {
(y - 3) * src_stride
} else {
0usize.wrapping_sub((3 - y) * src_stride)
};
let src_row = &src[src_offset..];
let dst_row = &mut dst[y * dst_stride..][..w];
v_filter_8tap_16bpc_direct_neon(
token,
dst_row,
src_row,
src_stride,
w,
fv,
bitdepth_max,
);
}
}
(None, None) => {
for y in 0..h {
let src_row = &src[y * src_stride..][..w];
let dst_row = &mut dst[y * dst_stride..][..w];
dst_row.copy_from_slice(src_row);
}
}
}
}
macro_rules! define_put_8tap_16bpc {
($name:ident, $filter:expr) => {
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn $name(
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
_src: *const FFISafe<PicOffset>,
) {
let token = unsafe { Arm64::forge_token_dangerously() };
let w = w as usize;
let h = h as usize;
let mx = mx as usize;
let my = my as usize;
let dst_stride_u16 = (dst_stride / 2) as usize;
let src_stride_u16 = (src_stride / 2) as usize;
let src_base = (src_ptr as *const u16).offset(-3 * src_stride_u16 as isize - 3);
let src_len = (h + 7) * src_stride_u16 + w + 7;
let src = std::slice::from_raw_parts(src_base, src_len);
let dst_len = h * dst_stride_u16 + w;
let dst = std::slice::from_raw_parts_mut(dst_ptr as *mut u16, dst_len);
let src_adjusted = &src[3 * src_stride_u16 + 3..];
put_8tap_16bpc_inner(
token,
dst,
dst_stride_u16,
src_adjusted,
src_stride_u16,
w,
h,
mx,
my,
get_h_filter_type($filter),
get_v_filter_type($filter),
bitdepth_max as u16,
);
}
};
}
define_put_8tap_16bpc!(put_8tap_regular_16bpc_neon, Filter2d::Regular8Tap);
define_put_8tap_16bpc!(
put_8tap_regular_smooth_16bpc_neon,
Filter2d::RegularSmooth8Tap
);
define_put_8tap_16bpc!(
put_8tap_regular_sharp_16bpc_neon,
Filter2d::RegularSharp8Tap
);
define_put_8tap_16bpc!(
put_8tap_smooth_regular_16bpc_neon,
Filter2d::SmoothRegular8Tap
);
define_put_8tap_16bpc!(put_8tap_smooth_16bpc_neon, Filter2d::Smooth8Tap);
define_put_8tap_16bpc!(put_8tap_smooth_sharp_16bpc_neon, Filter2d::SmoothSharp8Tap);
define_put_8tap_16bpc!(
put_8tap_sharp_regular_16bpc_neon,
Filter2d::SharpRegular8Tap
);
define_put_8tap_16bpc!(put_8tap_sharp_smooth_16bpc_neon, Filter2d::SharpSmooth8Tap);
define_put_8tap_16bpc!(put_8tap_sharp_16bpc_neon, Filter2d::Sharp8Tap);
#[cfg(target_arch = "aarch64")]
#[arcane]
#[allow(clippy::too_many_arguments)]
fn v_filter_8tap_16bpc_to_i16_neon(
_token: Arm64,
dst: &mut [i16],
mid: &[[i32; MID_STRIDE]],
w: usize,
filter: &[i8; 8],
sh: u8,
) {
let mut dst = dst.flex_mut();
let rnd = (1i32 << sh) >> 1;
let mut col = 0;
while col + 4 <= w {
let c0 = filter[0] as i32;
let c1 = filter[1] as i32;
let c2 = filter[2] as i32;
let c3 = filter[3] as i32;
let c4 = filter[4] as i32;
let c5 = filter[5] as i32;
let c6 = filter[6] as i32;
let c7 = filter[7] as i32;
let r0 = safe_simd::vld1q_s32(mid[0][col..][..4].try_into().unwrap());
let r1 = safe_simd::vld1q_s32(mid[1][col..][..4].try_into().unwrap());
let r2 = safe_simd::vld1q_s32(mid[2][col..][..4].try_into().unwrap());
let r3 = safe_simd::vld1q_s32(mid[3][col..][..4].try_into().unwrap());
let r4 = safe_simd::vld1q_s32(mid[4][col..][..4].try_into().unwrap());
let r5 = safe_simd::vld1q_s32(mid[5][col..][..4].try_into().unwrap());
let r6 = safe_simd::vld1q_s32(mid[6][col..][..4].try_into().unwrap());
let r7 = safe_simd::vld1q_s32(mid[7][col..][..4].try_into().unwrap());
let mut sum = vmulq_n_s32(r0, c0);
sum = vmlaq_n_s32(sum, r1, c1);
sum = vmlaq_n_s32(sum, r2, c2);
sum = vmlaq_n_s32(sum, r3, c3);
sum = vmlaq_n_s32(sum, r4, c4);
sum = vmlaq_n_s32(sum, r5, c5);
sum = vmlaq_n_s32(sum, r6, c6);
sum = vmlaq_n_s32(sum, r7, c7);
let rnd_vec = vdupq_n_s32(rnd);
sum = vshrq_n_s32::<8>(vaddq_s32(sum, rnd_vec));
let result = vqmovn_s32(sum);
let dst_arr: &mut [i16; 4] = (&mut dst[col..col + 4]).try_into().unwrap();
safe_simd::vst1_s16(dst_arr, result);
col += 4;
}
while col < w {
let mut sum = 0i64;
for i in 0..8 {
sum += filter[i] as i64 * mid[i][col] as i64;
}
dst[col] = ((sum + rnd as i64) >> sh) as i16;
col += 1;
}
}
#[cfg(target_arch = "aarch64")]
#[arcane]
#[allow(clippy::too_many_arguments)]
fn prep_8tap_16bpc_inner(
token: Arm64,
tmp: &mut [i16],
src: &[u16],
src_stride: usize,
w: usize,
h: usize,
mx: usize,
my: usize,
h_filter_type: Rav1dFilterMode,
v_filter_type: Rav1dFilterMode,
) {
let mut tmp = tmp.flex_mut();
let src = src.flex();
let intermediate_bits = 4u8;
let fh = get_filter_coeff(mx, w, h_filter_type);
let fv = get_filter_coeff(my, h, v_filter_type);
match (fh, fv) {
(Some(fh), Some(fv)) => {
let tmp_h = h + 7;
let mut mid = [[0i32; MID_STRIDE]; 135];
for y in 0..tmp_h {
let src_offset = if y >= 3 {
(y - 3) * src_stride
} else {
0usize.wrapping_sub((3 - y) * src_stride)
};
let src_row = &src[src_offset..];
h_filter_8tap_16bpc_neon(
token,
&mut mid[y][..w],
src_row,
w,
fh,
6 - intermediate_bits,
);
}
for y in 0..h {
let out_row = &mut tmp[y * w..][..w];
v_filter_8tap_16bpc_to_i16_neon(
token,
out_row,
&mid[y..],
w,
fv,
6 + intermediate_bits,
);
}
}
(Some(fh), None) => {
for y in 0..h {
let src_row = &src[y * src_stride..];
let out_row = &mut tmp[y * w..][..w];
let mut col = 0;
while col + 4 <= w {
let c0 = fh[0] as i32;
let c1 = fh[1] as i32;
let c2 = fh[2] as i32;
let c3 = fh[3] as i32;
let c4 = fh[4] as i32;
let c5 = fh[5] as i32;
let c6 = fh[6] as i32;
let c7 = fh[7] as i32;
let s0 = safe_simd::vld1_u16(src_row[col..][..4].try_into().unwrap());
let s1 = safe_simd::vld1_u16(src_row[col + 1..][..4].try_into().unwrap());
let s2 = safe_simd::vld1_u16(src_row[col + 2..][..4].try_into().unwrap());
let s3 = safe_simd::vld1_u16(src_row[col + 3..][..4].try_into().unwrap());
let s4 = safe_simd::vld1_u16(src_row[col + 4..][..4].try_into().unwrap());
let s5 = safe_simd::vld1_u16(src_row[col + 5..][..4].try_into().unwrap());
let s6 = safe_simd::vld1_u16(src_row[col + 6..][..4].try_into().unwrap());
let s7 = safe_simd::vld1_u16(src_row[col + 7..][..4].try_into().unwrap());
let s0_32 = vreinterpretq_s32_u32(vmovl_u16(s0));
let s1_32 = vreinterpretq_s32_u32(vmovl_u16(s1));
let s2_32 = vreinterpretq_s32_u32(vmovl_u16(s2));
let s3_32 = vreinterpretq_s32_u32(vmovl_u16(s3));
let s4_32 = vreinterpretq_s32_u32(vmovl_u16(s4));
let s5_32 = vreinterpretq_s32_u32(vmovl_u16(s5));
let s6_32 = vreinterpretq_s32_u32(vmovl_u16(s6));
let s7_32 = vreinterpretq_s32_u32(vmovl_u16(s7));
let mut sum = vmulq_n_s32(s0_32, c0);
sum = vmlaq_n_s32(sum, s1_32, c1);
sum = vmlaq_n_s32(sum, s2_32, c2);
sum = vmlaq_n_s32(sum, s3_32, c3);
sum = vmlaq_n_s32(sum, s4_32, c4);
sum = vmlaq_n_s32(sum, s5_32, c5);
sum = vmlaq_n_s32(sum, s6_32, c6);
sum = vmlaq_n_s32(sum, s7_32, c7);
let rnd_vec = vdupq_n_s32(8);
let result = vshrq_n_s32::<4>(vaddq_s32(sum, rnd_vec));
let narrow = vqmovn_s32(result);
let out_arr: &mut [i16; 4] = (&mut out_row[col..col + 4]).try_into().unwrap();
safe_simd::vst1_s16(out_arr, narrow);
col += 4;
}
while col < w {
let mut sum = 0i32;
for i in 0..8 {
sum += fh[i] as i32 * src_row[col + i] as i32;
}
out_row[col] = ((sum + 8) >> intermediate_bits) as i16;
col += 1;
}
}
}
(None, Some(fv)) => {
for y in 0..h {
let out_row = &mut tmp[y * w..][..w];
let mut mid = [[0i32; MID_STRIDE]; 8];
for i in 0..8 {
let src_offset = if y + i >= 3 {
(y + i - 3) * src_stride
} else {
0usize.wrapping_sub((3 - y - i) * src_stride)
};
for x in 0..w {
mid[i][x] = (src[src_offset + x] as i32) << intermediate_bits;
}
}
v_filter_8tap_16bpc_to_i16_neon(token, out_row, &mid, w, fv, 6);
}
}
(None, None) => {
for y in 0..h {
let src_row = &src[y * src_stride..][..w];
let out_row = &mut tmp[y * w..][..w];
for x in 0..w {
out_row[x] = (src_row[x] as i32 >> (10 - intermediate_bits)) as i16;
}
}
}
}
}
macro_rules! define_prep_8tap_16bpc {
($name:ident, $filter:expr) => {
#[cfg(feature = "asm")]
#[cfg(target_arch = "aarch64")]
pub unsafe extern "C" fn $name(
tmp: *mut i16,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
_bitdepth_max: i32,
_src: *const FFISafe<PicOffset>,
) {
let token = unsafe { Arm64::forge_token_dangerously() };
let w = w as usize;
let h = h as usize;
let mx = mx as usize;
let my = my as usize;
let src_stride_u16 = (src_stride / 2) as usize;
let src_base = (src_ptr as *const u16).offset(-3 * src_stride_u16 as isize - 3);
let src_len = (h + 7) * src_stride_u16 + w + 7;
let src = std::slice::from_raw_parts(src_base, src_len);
let tmp_len = h * w;
let tmp_slice = std::slice::from_raw_parts_mut(tmp, tmp_len);
let src_adjusted = &src[3 * src_stride_u16 + 3..];
prep_8tap_16bpc_inner(
token,
tmp_slice,
src_adjusted,
src_stride_u16,
w,
h,
mx,
my,
get_h_filter_type($filter),
get_v_filter_type($filter),
);
}
};
}
define_prep_8tap_16bpc!(prep_8tap_regular_16bpc_neon, Filter2d::Regular8Tap);
define_prep_8tap_16bpc!(
prep_8tap_regular_smooth_16bpc_neon,
Filter2d::RegularSmooth8Tap
);
define_prep_8tap_16bpc!(
prep_8tap_regular_sharp_16bpc_neon,
Filter2d::RegularSharp8Tap
);
define_prep_8tap_16bpc!(
prep_8tap_smooth_regular_16bpc_neon,
Filter2d::SmoothRegular8Tap
);
define_prep_8tap_16bpc!(prep_8tap_smooth_16bpc_neon, Filter2d::Smooth8Tap);
define_prep_8tap_16bpc!(prep_8tap_smooth_sharp_16bpc_neon, Filter2d::SmoothSharp8Tap);
define_prep_8tap_16bpc!(
prep_8tap_sharp_regular_16bpc_neon,
Filter2d::SharpRegular8Tap
);
define_prep_8tap_16bpc!(prep_8tap_sharp_smooth_16bpc_neon, Filter2d::SharpSmooth8Tap);
define_prep_8tap_16bpc!(prep_8tap_sharp_16bpc_neon, Filter2d::Sharp8Tap);
#[cfg(target_arch = "aarch64")]
#[cfg_attr(not(feature = "asm"), allow(unused_variables))]
pub fn avg_dispatch<BD: BitDepth>(
dst: PicOffset,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
bd: BD,
) -> bool {
use zerocopy::IntoBytes;
let pixel_size = std::mem::size_of::<BD::Pixel>();
let (mut dst_guard, dst_base) = dst.narrow_guard_mut::<BD>(w as usize, h as usize);
let dst_bytes = dst_guard.as_mut_bytes();
let dst_offset = dst_base * pixel_size;
let dst_stride = dst.stride();
avg_dispatch_inner::<BD>(dst_bytes, dst_offset, dst_stride, tmp1, tmp2, w, h, bd)
}
#[cfg(target_arch = "aarch64")]
#[cfg_attr(not(feature = "asm"), allow(unused_variables))]
pub(crate) fn avg_dispatch_inner<BD: BitDepth>(
dst_bytes: &mut [u8],
dst_offset: usize,
dst_stride: isize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
bd: BD,
) -> bool {
use crate::include::common::bitdepth::BPC;
#[cfg(feature = "asm")]
{
#[allow(unsafe_code)]
{
let dst_ptr = unsafe { (dst_bytes.as_mut_ptr() as *mut DynPixel).add(dst_offset) };
let bd_c = bd.into_c();
unsafe {
match BD::BPC {
BPC::BPC8 => avg_8bpc_neon(
dst_ptr,
dst_stride,
tmp1,
tmp2,
w,
h,
bd_c,
std::ptr::null(),
),
BPC::BPC16 => avg_16bpc_neon(
dst_ptr,
dst_stride,
tmp1,
tmp2,
w,
h,
bd_c,
std::ptr::null(),
),
}
}
}
}
#[cfg(not(feature = "asm"))]
{
let token = Arm64::summon().unwrap();
let w_u = w as usize;
let h_u = h as usize;
let dst_stride_u = dst_stride as usize;
match BD::BPC {
BPC::BPC8 => {
avg_8bpc_inner(
token,
&mut dst_bytes[dst_offset..],
dst_stride_u,
&tmp1[..],
&tmp2[..],
w_u,
h_u,
);
}
BPC::BPC16 => {
use zerocopy::FromBytes;
let stride_u16 = dst_stride_u / 2;
let start = dst_offset;
let byte_len = (h_u * stride_u16 + w_u) * 2;
let dst_u16: &mut [u16] =
FromBytes::mut_from_bytes(&mut dst_bytes[start..start + byte_len]).unwrap();
avg_16bpc_inner(
token,
dst_u16,
stride_u16,
&tmp1[..],
&tmp2[..],
w_u,
h_u,
bd.into_c(),
);
}
}
}
true
}
#[cfg(target_arch = "aarch64")]
#[cfg_attr(not(feature = "asm"), allow(unused_variables))]
pub fn w_avg_dispatch<BD: BitDepth>(
dst: PicOffset,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
weight: i32,
bd: BD,
) -> bool {
use zerocopy::IntoBytes;
let pixel_size = std::mem::size_of::<BD::Pixel>();
let (mut dst_guard, dst_base) = dst.narrow_guard_mut::<BD>(w as usize, h as usize);
let dst_bytes = dst_guard.as_mut_bytes();
let dst_offset = dst_base * pixel_size;
let dst_stride = dst.stride();
w_avg_dispatch_inner::<BD>(
dst_bytes, dst_offset, dst_stride, tmp1, tmp2, w, h, weight, bd,
)
}
#[cfg(target_arch = "aarch64")]
#[cfg_attr(not(feature = "asm"), allow(unused_variables))]
pub(crate) fn w_avg_dispatch_inner<BD: BitDepth>(
dst_bytes: &mut [u8],
dst_offset: usize,
dst_stride: isize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
weight: i32,
bd: BD,
) -> bool {
use crate::include::common::bitdepth::BPC;
#[cfg(feature = "asm")]
{
#[allow(unsafe_code)]
{
let dst_ptr = unsafe { (dst_bytes.as_mut_ptr() as *mut DynPixel).add(dst_offset) };
let bd_c = bd.into_c();
unsafe {
match BD::BPC {
BPC::BPC8 => w_avg_8bpc_neon(
dst_ptr,
dst_stride,
tmp1,
tmp2,
w,
h,
weight,
bd_c,
std::ptr::null(),
),
BPC::BPC16 => w_avg_16bpc_neon(
dst_ptr,
dst_stride,
tmp1,
tmp2,
w,
h,
weight,
bd_c,
std::ptr::null(),
),
}
}
}
}
#[cfg(not(feature = "asm"))]
{
let token = Arm64::summon().unwrap();
let w_u = w as usize;
let h_u = h as usize;
let dst_stride_u = dst_stride as usize;
match BD::BPC {
BPC::BPC8 => {
w_avg_8bpc_inner(
token,
&mut dst_bytes[dst_offset..],
dst_stride_u,
&tmp1[..],
&tmp2[..],
w_u,
h_u,
weight,
);
}
BPC::BPC16 => {
use zerocopy::FromBytes;
let stride_u16 = dst_stride_u / 2;
let start = dst_offset;
let byte_len = (h_u * stride_u16 + w_u) * 2;
let dst_u16: &mut [u16] =
FromBytes::mut_from_bytes(&mut dst_bytes[start..start + byte_len]).unwrap();
w_avg_16bpc_inner(
token,
dst_u16,
stride_u16,
&tmp1[..],
&tmp2[..],
w_u,
h_u,
weight,
bd.into_c(),
);
}
}
}
true
}
#[cfg(target_arch = "aarch64")]
#[cfg_attr(not(feature = "asm"), allow(unused_variables))]
pub fn mask_dispatch<BD: BitDepth>(
dst: PicOffset,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
mask: &[u8],
bd: BD,
) -> bool {
use zerocopy::IntoBytes;
let pixel_size = std::mem::size_of::<BD::Pixel>();
let (mut dst_guard, dst_base) = dst.narrow_guard_mut::<BD>(w as usize, h as usize);
let dst_bytes = dst_guard.as_mut_bytes();
let dst_offset = dst_base * pixel_size;
let dst_stride = dst.stride();
mask_dispatch_inner::<BD>(
dst_bytes, dst_offset, dst_stride, tmp1, tmp2, w, h, mask, bd,
)
}
#[cfg(target_arch = "aarch64")]
#[cfg_attr(not(feature = "asm"), allow(unused_variables))]
pub(crate) fn mask_dispatch_inner<BD: BitDepth>(
dst_bytes: &mut [u8],
dst_offset: usize,
dst_stride: isize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
mask: &[u8],
bd: BD,
) -> bool {
use crate::include::common::bitdepth::BPC;
#[cfg(feature = "asm")]
{
#[allow(unsafe_code)]
{
let dst_ptr = unsafe { (dst_bytes.as_mut_ptr() as *mut DynPixel).add(dst_offset) };
let mask_ptr = mask[..(w * h) as usize].as_ptr();
let bd_c = bd.into_c();
unsafe {
match BD::BPC {
BPC::BPC8 => mask_8bpc_neon(
dst_ptr,
dst_stride,
tmp1,
tmp2,
w,
h,
mask_ptr,
bd_c,
std::ptr::null(),
),
BPC::BPC16 => mask_16bpc_neon(
dst_ptr,
dst_stride,
tmp1,
tmp2,
w,
h,
mask_ptr,
bd_c,
std::ptr::null(),
),
}
}
}
}
#[cfg(not(feature = "asm"))]
{
let token = Arm64::summon().unwrap();
let w_u = w as usize;
let h_u = h as usize;
let mask_slice = &mask[..(w_u * h_u)];
let dst_stride_u = dst_stride as usize;
match BD::BPC {
BPC::BPC8 => {
mask_8bpc_inner(
token,
&mut dst_bytes[dst_offset..],
dst_stride_u,
&tmp1[..],
&tmp2[..],
w_u,
h_u,
mask_slice,
);
}
BPC::BPC16 => {
use zerocopy::FromBytes;
let stride_u16 = dst_stride_u / 2;
let start = dst_offset;
let byte_len = (h_u * stride_u16 + w_u) * 2;
let dst_u16: &mut [u16] =
FromBytes::mut_from_bytes(&mut dst_bytes[start..start + byte_len]).unwrap();
mask_16bpc_inner(
token,
dst_u16,
stride_u16,
&tmp1[..],
&tmp2[..],
w_u,
h_u,
mask_slice,
bd.into_c(),
);
}
}
}
true
}
#[cfg(target_arch = "aarch64")]
#[cfg_attr(not(feature = "asm"), allow(unused_variables))]
pub fn blend_dispatch<BD: BitDepth>(
dst: PicOffset,
tmp: &[BD::Pixel; SCRATCH_INTER_INTRA_BUF_LEN],
w: i32,
h: i32,
mask: &[u8],
) -> bool {
use zerocopy::IntoBytes;
let pixel_size = std::mem::size_of::<BD::Pixel>();
let (mut dst_guard, dst_base) = dst.narrow_guard_mut::<BD>(w as usize, h as usize);
let dst_bytes = dst_guard.as_mut_bytes();
let dst_offset = dst_base * pixel_size;
let dst_stride = dst.stride();
blend_dispatch_inner::<BD>(dst_bytes, dst_offset, dst_stride, tmp, w, h, mask)
}
#[cfg(target_arch = "aarch64")]
#[cfg_attr(not(feature = "asm"), allow(unused_variables))]
pub(crate) fn blend_dispatch_inner<BD: BitDepth>(
dst_bytes: &mut [u8],
dst_offset: usize,
dst_stride: isize,
tmp: &[BD::Pixel; SCRATCH_INTER_INTRA_BUF_LEN],
w: i32,
h: i32,
mask: &[u8],
) -> bool {
use crate::include::common::bitdepth::BPC;
#[cfg(feature = "asm")]
{
#[allow(unsafe_code)]
{
let dst_ptr = unsafe { (dst_bytes.as_mut_ptr() as *mut DynPixel).add(dst_offset) };
let tmp_ptr = std::ptr::from_ref(tmp).cast();
let mask_ptr = mask[..(w * h) as usize].as_ptr();
unsafe {
match BD::BPC {
BPC::BPC8 => blend_8bpc_neon(
dst_ptr,
dst_stride,
tmp_ptr,
w,
h,
mask_ptr,
std::ptr::null(),
),
BPC::BPC16 => blend_16bpc_neon(
dst_ptr,
dst_stride,
tmp_ptr,
w,
h,
mask_ptr,
std::ptr::null(),
),
}
}
}
}
#[cfg(not(feature = "asm"))]
{
let w_u = w as usize;
let h_u = h as usize;
let dst_stride_u = dst_stride as usize;
let mask_slice = &mask[..(w_u * h_u)];
match BD::BPC {
BPC::BPC8 => {
let dst_slice = &mut dst_bytes[dst_offset..];
let tmp_bytes: &[u8] = zerocopy::IntoBytes::as_bytes(tmp.as_slice());
for row in 0..h_u {
let dst_row = &mut dst_slice[row * dst_stride_u..][..w_u];
let tmp_row = &tmp_bytes[row * w_u..][..w_u];
let mask_row = &mask_slice[row * w_u..][..w_u];
for col in 0..w_u {
let d = dst_row[col] as u32;
let t = tmp_row[col] as u32;
let m = mask_row[col] as u32;
dst_row[col] = ((d * (64 - m) + t * m + 32) >> 6) as u8;
}
}
}
BPC::BPC16 => {
use zerocopy::{FromBytes, IntoBytes};
let stride_u16 = dst_stride_u / 2;
let start = dst_offset;
let dst_byte_len = (h_u * stride_u16 + w_u) * 2;
let dst_u16: &mut [u16] =
FromBytes::mut_from_bytes(&mut dst_bytes[start..start + dst_byte_len]).unwrap();
let tmp_bytes: &[u8] = zerocopy::IntoBytes::as_bytes(tmp.as_slice());
let tmp_byte_len = w_u * h_u * 2;
let tmp_u16: &[u16] =
FromBytes::ref_from_bytes(&tmp_bytes[..tmp_byte_len]).unwrap();
for row in 0..h_u {
let dst_row = &mut dst_u16[row * stride_u16..][..w_u];
let tmp_row = &tmp_u16[row * w_u..][..w_u];
let mask_row = &mask_slice[row * w_u..][..w_u];
for col in 0..w_u {
let d = dst_row[col] as u32;
let t = tmp_row[col] as u32;
let m = mask_row[col] as u32;
dst_row[col] = ((d * (64 - m) + t * m + 32) >> 6) as u16;
}
}
}
}
}
true
}
#[cfg(target_arch = "aarch64")]
#[cfg_attr(not(feature = "asm"), allow(unused_variables))]
pub fn blend_dir_dispatch<BD: BitDepth>(
is_h: bool,
dst: PicOffset,
tmp: &[BD::Pixel; SCRATCH_LAP_LEN],
w: i32,
h: i32,
) -> bool {
use zerocopy::IntoBytes;
let pixel_size = std::mem::size_of::<BD::Pixel>();
let (mut dst_guard, dst_base) = dst.narrow_guard_mut::<BD>(w as usize, h as usize);
let dst_bytes = dst_guard.as_mut_bytes();
let dst_offset = dst_base * pixel_size;
let dst_stride = dst.stride();
blend_dir_dispatch_inner::<BD>(is_h, dst_bytes, dst_offset, dst_stride, tmp, w, h)
}
#[cfg(target_arch = "aarch64")]
#[cfg_attr(not(feature = "asm"), allow(unused_variables))]
pub(crate) fn blend_dir_dispatch_inner<BD: BitDepth>(
is_h: bool,
dst_bytes: &mut [u8],
dst_offset: usize,
dst_stride: isize,
tmp: &[BD::Pixel; SCRATCH_LAP_LEN],
w: i32,
h: i32,
) -> bool {
use crate::include::common::bitdepth::BPC;
#[cfg(feature = "asm")]
{
#[allow(unsafe_code)]
{
let dst_ptr = unsafe { (dst_bytes.as_mut_ptr() as *mut DynPixel).add(dst_offset) };
let tmp_ptr = std::ptr::from_ref(tmp).cast();
unsafe {
match (BD::BPC, is_h) {
(BPC::BPC8, true) => {
blend_h_8bpc_neon(dst_ptr, dst_stride, tmp_ptr, w, h, std::ptr::null())
}
(BPC::BPC8, false) => {
blend_v_8bpc_neon(dst_ptr, dst_stride, tmp_ptr, w, h, std::ptr::null())
}
(BPC::BPC16, true) => {
blend_h_16bpc_neon(dst_ptr, dst_stride, tmp_ptr, w, h, std::ptr::null())
}
(BPC::BPC16, false) => {
blend_v_16bpc_neon(dst_ptr, dst_stride, tmp_ptr, w, h, std::ptr::null())
}
}
}
}
}
#[cfg(not(feature = "asm"))]
{
use crate::src::tables::dav1d_obmc_masks;
let w_u = w as usize;
let h_u = h as usize;
let dst_stride_u = dst_stride as usize;
match (BD::BPC, is_h) {
(BPC::BPC8, false) => {
let dst_slice = &mut dst_bytes[dst_offset..];
let tmp_bytes: &[u8] = zerocopy::IntoBytes::as_bytes(tmp.as_slice());
let mask = &dav1d_obmc_masks[w_u..];
let dst_w = w_u * 3 >> 2;
for row in 0..h_u {
let dst_row = &mut dst_slice[row * dst_stride_u..][..dst_w];
let tmp_row = &tmp_bytes[row * w_u..][..dst_w];
for col in 0..dst_w {
let d = dst_row[col] as u32;
let t = tmp_row[col] as u32;
let m = mask[col] as u32;
dst_row[col] = ((d * (64 - m) + t * m + 32) >> 6) as u8;
}
}
}
(BPC::BPC8, true) => {
let dst_slice = &mut dst_bytes[dst_offset..];
let tmp_bytes: &[u8] = zerocopy::IntoBytes::as_bytes(tmp.as_slice());
let mask = &dav1d_obmc_masks[h_u..];
let h_effective = h_u * 3 >> 2;
for row in 0..h_effective {
let dst_row = &mut dst_slice[row * dst_stride_u..][..w_u];
let tmp_row = &tmp_bytes[row * w_u..][..w_u];
let m = mask[row] as u32;
for col in 0..w_u {
let d = dst_row[col] as u32;
let t = tmp_row[col] as u32;
dst_row[col] = ((d * (64 - m) + t * m + 32) >> 6) as u8;
}
}
}
(BPC::BPC16, false) => {
use zerocopy::{FromBytes, IntoBytes};
let stride_u16 = dst_stride_u / 2;
let start = dst_offset;
let dst_byte_len = (h_u * stride_u16 + w_u) * 2;
let dst_u16: &mut [u16] =
FromBytes::mut_from_bytes(&mut dst_bytes[start..start + dst_byte_len]).unwrap();
let tmp_bytes: &[u8] = zerocopy::IntoBytes::as_bytes(tmp.as_slice());
let tmp_byte_len = w_u * h_u * 2;
let tmp_u16: &[u16] =
FromBytes::ref_from_bytes(&tmp_bytes[..tmp_byte_len]).unwrap();
let mask = &dav1d_obmc_masks[w_u..];
let dst_w = w_u * 3 >> 2;
for row in 0..h_u {
let dst_row = &mut dst_u16[row * stride_u16..][..dst_w];
let tmp_row = &tmp_u16[row * w_u..][..dst_w];
for col in 0..dst_w {
let d = dst_row[col] as u32;
let t = tmp_row[col] as u32;
let m = mask[col] as u32;
dst_row[col] = ((d * (64 - m) + t * m + 32) >> 6) as u16;
}
}
}
(BPC::BPC16, true) => {
use zerocopy::{FromBytes, IntoBytes};
let stride_u16 = dst_stride_u / 2;
let start = dst_offset;
let mask = &dav1d_obmc_masks[h_u..];
let h_effective = h_u * 3 >> 2;
let dst_byte_len = (h_effective * stride_u16 + w_u) * 2;
let dst_u16: &mut [u16] =
FromBytes::mut_from_bytes(&mut dst_bytes[start..start + dst_byte_len]).unwrap();
let tmp_bytes: &[u8] = zerocopy::IntoBytes::as_bytes(tmp.as_slice());
let tmp_byte_len = w_u * h_effective * 2;
let tmp_u16: &[u16] =
FromBytes::ref_from_bytes(&tmp_bytes[..tmp_byte_len]).unwrap();
for row in 0..h_effective {
let dst_row = &mut dst_u16[row * stride_u16..][..w_u];
let tmp_row = &tmp_u16[row * w_u..][..w_u];
let m = mask[row] as u32;
for col in 0..w_u {
let d = dst_row[col] as u32;
let t = tmp_row[col] as u32;
dst_row[col] = ((d * (64 - m) + t * m + 32) >> 6) as u16;
}
}
}
}
}
true
}
#[cfg(target_arch = "aarch64")]
#[cfg_attr(not(feature = "asm"), allow(unused_variables))]
pub fn w_mask_dispatch<BD: BitDepth>(
layout: Rav1dPixelLayoutSubSampled,
dst: PicOffset,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
mask: &mut [u8; SEG_MASK_LEN],
sign: i32,
bd: BD,
) -> bool {
use zerocopy::IntoBytes;
let pixel_size = std::mem::size_of::<BD::Pixel>();
let (mut dst_guard, dst_base) = dst.narrow_guard_mut::<BD>(w as usize, h as usize);
let dst_bytes = dst_guard.as_mut_bytes();
let dst_offset = dst_base * pixel_size;
let dst_stride = dst.stride();
w_mask_dispatch_inner::<BD>(
layout, dst_bytes, dst_offset, dst_stride, tmp1, tmp2, w, h, mask, sign, bd,
)
}
#[cfg(target_arch = "aarch64")]
#[cfg_attr(not(feature = "asm"), allow(unused_variables))]
pub(crate) fn w_mask_dispatch_inner<BD: BitDepth>(
layout: Rav1dPixelLayoutSubSampled,
dst_bytes: &mut [u8],
dst_offset: usize,
dst_stride: isize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
mask: &mut [u8; SEG_MASK_LEN],
sign: i32,
bd: BD,
) -> bool {
use crate::include::common::bitdepth::BPC;
#[cfg(feature = "asm")]
{
#[allow(unsafe_code)]
{
let dst_ptr = unsafe { (dst_bytes.as_mut_ptr() as *mut DynPixel).add(dst_offset) };
let bd_c = bd.into_c();
unsafe {
match (BD::BPC, layout) {
(BPC::BPC8, Rav1dPixelLayoutSubSampled::I420) => w_mask_420_8bpc_neon(
dst_ptr,
dst_stride,
tmp1,
tmp2,
w,
h,
mask,
sign,
bd_c,
std::ptr::null(),
),
(BPC::BPC8, Rav1dPixelLayoutSubSampled::I422) => w_mask_422_8bpc_neon(
dst_ptr,
dst_stride,
tmp1,
tmp2,
w,
h,
mask,
sign,
bd_c,
std::ptr::null(),
),
(BPC::BPC8, Rav1dPixelLayoutSubSampled::I444) => w_mask_444_8bpc_neon(
dst_ptr,
dst_stride,
tmp1,
tmp2,
w,
h,
mask,
sign,
bd_c,
std::ptr::null(),
),
(BPC::BPC16, Rav1dPixelLayoutSubSampled::I420) => w_mask_420_16bpc_neon(
dst_ptr,
dst_stride,
tmp1,
tmp2,
w,
h,
mask,
sign,
bd_c,
std::ptr::null(),
),
(BPC::BPC16, Rav1dPixelLayoutSubSampled::I422) => w_mask_422_16bpc_neon(
dst_ptr,
dst_stride,
tmp1,
tmp2,
w,
h,
mask,
sign,
bd_c,
std::ptr::null(),
),
(BPC::BPC16, Rav1dPixelLayoutSubSampled::I444) => w_mask_444_16bpc_neon(
dst_ptr,
dst_stride,
tmp1,
tmp2,
w,
h,
mask,
sign,
bd_c,
std::ptr::null(),
),
}
}
}
}
#[cfg(not(feature = "asm"))]
{
let token = Arm64::summon().unwrap();
let w_u = w as usize;
let h_u = h as usize;
let dst_stride_u = dst_stride as usize;
match BD::BPC {
BPC::BPC8 => {
let dst_slice = &mut dst_bytes[dst_offset..];
match layout {
Rav1dPixelLayoutSubSampled::I420 => w_mask_8bpc_inner(
token,
dst_slice,
dst_stride_u,
&tmp1[..],
&tmp2[..],
w_u,
h_u,
&mut mask[..],
sign as u8,
true,
true,
),
Rav1dPixelLayoutSubSampled::I422 => w_mask_8bpc_inner(
token,
dst_slice,
dst_stride_u,
&tmp1[..],
&tmp2[..],
w_u,
h_u,
&mut mask[..],
sign as u8,
true,
false,
),
Rav1dPixelLayoutSubSampled::I444 => w_mask_8bpc_inner(
token,
dst_slice,
dst_stride_u,
&tmp1[..],
&tmp2[..],
w_u,
h_u,
&mut mask[..],
sign as u8,
false,
false,
),
}
}
BPC::BPC16 => {
use zerocopy::FromBytes;
let stride_u16 = dst_stride_u / 2;
let start = dst_offset;
let byte_len = (h_u * stride_u16 + w_u) * 2;
let dst_u16: &mut [u16] =
FromBytes::mut_from_bytes(&mut dst_bytes[start..start + byte_len]).unwrap();
let bd_c = bd.into_c();
match layout {
Rav1dPixelLayoutSubSampled::I420 => w_mask_16bpc_inner(
dst_u16,
stride_u16,
&tmp1[..],
&tmp2[..],
w_u,
h_u,
&mut mask[..],
sign as u8,
bd_c,
true,
true,
),
Rav1dPixelLayoutSubSampled::I422 => w_mask_16bpc_inner(
dst_u16,
stride_u16,
&tmp1[..],
&tmp2[..],
w_u,
h_u,
&mut mask[..],
sign as u8,
bd_c,
true,
false,
),
Rav1dPixelLayoutSubSampled::I444 => w_mask_16bpc_inner(
dst_u16,
stride_u16,
&tmp1[..],
&tmp2[..],
w_u,
h_u,
&mut mask[..],
sign as u8,
bd_c,
false,
false,
),
}
}
}
}
true
}
#[cfg(target_arch = "aarch64")]
#[cfg_attr(not(feature = "asm"), allow(unused_variables))]
pub fn mc_put_dispatch<BD: BitDepth>(
filter: Filter2d,
dst: PicOffset,
src: PicOffset,
w: i32,
h: i32,
mx: i32,
my: i32,
bd: BD,
) -> bool {
if dst.data.ref_eq(src.data) {
return false;
}
use zerocopy::IntoBytes;
let pixel_size = std::mem::size_of::<BD::Pixel>();
let (mut dst_guard, dst_base) = dst.narrow_guard_mut::<BD>(w as usize, h as usize);
let dst_bytes = dst_guard.as_mut_bytes();
let dst_offset = dst_base * pixel_size;
let dst_stride = dst.stride();
mc_put_dispatch_inner::<BD>(
filter, dst_bytes, dst_offset, dst_stride, src, w, h, mx, my, bd,
)
}
#[cfg(target_arch = "aarch64")]
#[cfg_attr(not(feature = "asm"), allow(unused_variables))]
pub(crate) fn mc_put_dispatch_inner<BD: BitDepth>(
filter: Filter2d,
dst_bytes: &mut [u8],
dst_offset: usize,
dst_stride: isize,
src: PicOffset,
w: i32,
h: i32,
mx: i32,
my: i32,
bd: BD,
) -> bool {
use crate::include::common::bitdepth::BPC;
use Filter2d::*;
#[cfg(feature = "asm")]
{
use zerocopy::IntoBytes;
#[allow(unsafe_code)]
{
let dst_ptr = unsafe { (dst_bytes.as_mut_ptr() as *mut DynPixel).add(dst_offset) };
let (src_guard, _src_base) = src.full_guard::<BD>();
let src_ptr = src_guard.as_bytes().as_ptr() as *const DynPixel;
let src_ptr = unsafe { src_ptr.add(_src_base * std::mem::size_of::<BD::Pixel>()) };
let src_stride = src.stride();
let bd_c = bd.into_c();
unsafe {
match (BD::BPC, filter) {
(BPC::BPC8, Regular8Tap) => put_8tap_regular_8bpc_neon(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bd_c,
std::ptr::null(),
std::ptr::null(),
),
(BPC::BPC8, RegularSmooth8Tap) => put_8tap_regular_smooth_8bpc_neon(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bd_c,
std::ptr::null(),
std::ptr::null(),
),
(BPC::BPC8, RegularSharp8Tap) => put_8tap_regular_sharp_8bpc_neon(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bd_c,
std::ptr::null(),
std::ptr::null(),
),
(BPC::BPC8, SmoothRegular8Tap) => put_8tap_smooth_regular_8bpc_neon(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bd_c,
std::ptr::null(),
std::ptr::null(),
),
(BPC::BPC8, Smooth8Tap) => put_8tap_smooth_8bpc_neon(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bd_c,
std::ptr::null(),
std::ptr::null(),
),
(BPC::BPC8, SmoothSharp8Tap) => put_8tap_smooth_sharp_8bpc_neon(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bd_c,
std::ptr::null(),
std::ptr::null(),
),
(BPC::BPC8, SharpRegular8Tap) => put_8tap_sharp_regular_8bpc_neon(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bd_c,
std::ptr::null(),
std::ptr::null(),
),
(BPC::BPC8, SharpSmooth8Tap) => put_8tap_sharp_smooth_8bpc_neon(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bd_c,
std::ptr::null(),
std::ptr::null(),
),
(BPC::BPC8, Sharp8Tap) => put_8tap_sharp_8bpc_neon(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bd_c,
std::ptr::null(),
std::ptr::null(),
),
(BPC::BPC8, Bilinear) => put_bilin_8bpc_neon(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bd_c,
std::ptr::null(),
std::ptr::null(),
),
(BPC::BPC16, Regular8Tap) => put_8tap_regular_16bpc_neon(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bd_c,
std::ptr::null(),
std::ptr::null(),
),
(BPC::BPC16, RegularSmooth8Tap) => put_8tap_regular_smooth_16bpc_neon(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bd_c,
std::ptr::null(),
std::ptr::null(),
),
(BPC::BPC16, RegularSharp8Tap) => put_8tap_regular_sharp_16bpc_neon(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bd_c,
std::ptr::null(),
std::ptr::null(),
),
(BPC::BPC16, SmoothRegular8Tap) => put_8tap_smooth_regular_16bpc_neon(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bd_c,
std::ptr::null(),
std::ptr::null(),
),
(BPC::BPC16, Smooth8Tap) => put_8tap_smooth_16bpc_neon(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bd_c,
std::ptr::null(),
std::ptr::null(),
),
(BPC::BPC16, SmoothSharp8Tap) => put_8tap_smooth_sharp_16bpc_neon(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bd_c,
std::ptr::null(),
std::ptr::null(),
),
(BPC::BPC16, SharpRegular8Tap) => put_8tap_sharp_regular_16bpc_neon(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bd_c,
std::ptr::null(),
std::ptr::null(),
),
(BPC::BPC16, SharpSmooth8Tap) => put_8tap_sharp_smooth_16bpc_neon(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bd_c,
std::ptr::null(),
std::ptr::null(),
),
(BPC::BPC16, Sharp8Tap) => put_8tap_sharp_16bpc_neon(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bd_c,
std::ptr::null(),
std::ptr::null(),
),
(BPC::BPC16, Bilinear) => put_bilin_16bpc_neon(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bd_c,
std::ptr::null(),
std::ptr::null(),
),
}
}
}
}
#[cfg(not(feature = "asm"))]
{
let token = Arm64::summon().unwrap();
let w_u = w as usize;
let h_u = h as usize;
let mx_u = mx as usize;
let my_u = my as usize;
let (src_guard, src_base) = src.full_guard::<BD>();
let src_stride_raw = src.stride();
match BD::BPC {
BPC::BPC8 => {
use zerocopy::IntoBytes;
let dst_slice = &mut dst_bytes[dst_offset..];
let src_bytes = src_guard.as_bytes();
let dst_stride_u = dst_stride as usize;
let src_stride_u = src_stride_raw as usize;
if filter == Bilinear {
let src_slice = &src_bytes[src_base..];
put_bilin_8bpc_inner(
token,
dst_slice,
dst_stride_u,
src_slice,
src_stride_u,
w_u,
h_u,
mx,
my,
);
} else {
let src_start = src_base.wrapping_sub(3 * src_stride_u + 3);
let src_len = (h_u + 7) * src_stride_u + w_u + 7;
let src_full = &src_bytes[src_start..][..src_len];
let src_adj = &src_full[3 * src_stride_u + 3..];
put_8tap_8bpc_inner(
token,
dst_slice,
dst_stride_u,
src_adj,
src_stride_u,
w_u,
h_u,
mx_u,
my_u,
get_h_filter_type(filter),
get_v_filter_type(filter),
);
}
}
BPC::BPC16 => {
use zerocopy::{FromBytes, IntoBytes};
let src_bytes = src_guard.as_bytes();
let dst_stride_u16 = (dst_stride as usize) / 2;
let src_stride_u16 = (src_stride_raw as usize) / 2;
let dst_start = dst_offset;
let dst_byte_len = (h_u * dst_stride_u16 + w_u) * 2;
let dst_u16: &mut [u16] =
FromBytes::mut_from_bytes(&mut dst_bytes[dst_start..dst_start + dst_byte_len])
.unwrap();
if filter == Bilinear {
let src_start = src_base * 2;
let src_byte_len = ((h_u + 1) * src_stride_u16 + w_u + 1) * 2;
let src_u16: &[u16] =
FromBytes::ref_from_bytes(&src_bytes[src_start..src_start + src_byte_len])
.unwrap();
put_bilin_16bpc_inner(
token,
dst_u16,
dst_stride_u16,
src_u16,
src_stride_u16,
w_u,
h_u,
mx,
my,
bd.into_c(),
);
} else {
let src_start_u16 = src_base.wrapping_sub(3 * src_stride_u16 + 3);
let src_start = src_start_u16 * 2;
let src_len = (h_u + 7) * src_stride_u16 + w_u + 7;
let src_byte_len = src_len * 2;
let src_full: &[u16] =
FromBytes::ref_from_bytes(&src_bytes[src_start..src_start + src_byte_len])
.unwrap();
let src_adj = &src_full[3 * src_stride_u16 + 3..];
put_8tap_16bpc_inner(
token,
dst_u16,
dst_stride_u16,
src_adj,
src_stride_u16,
w_u,
h_u,
mx_u,
my_u,
get_h_filter_type(filter),
get_v_filter_type(filter),
bd.into_c() as u16,
);
}
}
}
}
true
}
#[cfg(target_arch = "aarch64")]
#[cfg_attr(not(feature = "asm"), allow(unused_variables))]
pub fn mct_prep_dispatch<BD: BitDepth>(
filter: Filter2d,
tmp: &mut [i16],
src: PicOffset,
w: i32,
h: i32,
mx: i32,
my: i32,
bd: BD,
) -> bool {
use crate::include::common::bitdepth::BPC;
use Filter2d::*;
#[cfg(feature = "asm")]
{
#[allow(unsafe_code)]
{
let tmp_ptr = tmp[..(w * h) as usize].as_mut_ptr();
use zerocopy::IntoBytes;
let (src_guard, _src_base) = src.full_guard::<BD>();
let src_ptr = src_guard.as_bytes().as_ptr() as *const DynPixel;
let src_ptr = unsafe { src_ptr.add(_src_base * std::mem::size_of::<BD::Pixel>()) };
let src_stride = src.stride();
let bd_c = bd.into_c();
let src_ffi = FFISafe::new(&src);
unsafe {
match (BD::BPC, filter) {
(BPC::BPC8, Regular8Tap) => prep_8tap_regular_8bpc_neon(
tmp_ptr, src_ptr, src_stride, w, h, mx, my, bd_c, src_ffi,
),
(BPC::BPC8, RegularSmooth8Tap) => prep_8tap_regular_smooth_8bpc_neon(
tmp_ptr, src_ptr, src_stride, w, h, mx, my, bd_c, src_ffi,
),
(BPC::BPC8, RegularSharp8Tap) => prep_8tap_regular_sharp_8bpc_neon(
tmp_ptr, src_ptr, src_stride, w, h, mx, my, bd_c, src_ffi,
),
(BPC::BPC8, SmoothRegular8Tap) => prep_8tap_smooth_regular_8bpc_neon(
tmp_ptr, src_ptr, src_stride, w, h, mx, my, bd_c, src_ffi,
),
(BPC::BPC8, Smooth8Tap) => prep_8tap_smooth_8bpc_neon(
tmp_ptr, src_ptr, src_stride, w, h, mx, my, bd_c, src_ffi,
),
(BPC::BPC8, SmoothSharp8Tap) => prep_8tap_smooth_sharp_8bpc_neon(
tmp_ptr, src_ptr, src_stride, w, h, mx, my, bd_c, src_ffi,
),
(BPC::BPC8, SharpRegular8Tap) => prep_8tap_sharp_regular_8bpc_neon(
tmp_ptr, src_ptr, src_stride, w, h, mx, my, bd_c, src_ffi,
),
(BPC::BPC8, SharpSmooth8Tap) => prep_8tap_sharp_smooth_8bpc_neon(
tmp_ptr, src_ptr, src_stride, w, h, mx, my, bd_c, src_ffi,
),
(BPC::BPC8, Sharp8Tap) => prep_8tap_sharp_8bpc_neon(
tmp_ptr, src_ptr, src_stride, w, h, mx, my, bd_c, src_ffi,
),
(BPC::BPC8, Bilinear) => prep_bilin_8bpc_neon(
tmp_ptr, src_ptr, src_stride, w, h, mx, my, bd_c, src_ffi,
),
(BPC::BPC16, Regular8Tap) => prep_8tap_regular_16bpc_neon(
tmp_ptr, src_ptr, src_stride, w, h, mx, my, bd_c, src_ffi,
),
(BPC::BPC16, RegularSmooth8Tap) => prep_8tap_regular_smooth_16bpc_neon(
tmp_ptr, src_ptr, src_stride, w, h, mx, my, bd_c, src_ffi,
),
(BPC::BPC16, RegularSharp8Tap) => prep_8tap_regular_sharp_16bpc_neon(
tmp_ptr, src_ptr, src_stride, w, h, mx, my, bd_c, src_ffi,
),
(BPC::BPC16, SmoothRegular8Tap) => prep_8tap_smooth_regular_16bpc_neon(
tmp_ptr, src_ptr, src_stride, w, h, mx, my, bd_c, src_ffi,
),
(BPC::BPC16, Smooth8Tap) => prep_8tap_smooth_16bpc_neon(
tmp_ptr, src_ptr, src_stride, w, h, mx, my, bd_c, src_ffi,
),
(BPC::BPC16, SmoothSharp8Tap) => prep_8tap_smooth_sharp_16bpc_neon(
tmp_ptr, src_ptr, src_stride, w, h, mx, my, bd_c, src_ffi,
),
(BPC::BPC16, SharpRegular8Tap) => prep_8tap_sharp_regular_16bpc_neon(
tmp_ptr, src_ptr, src_stride, w, h, mx, my, bd_c, src_ffi,
),
(BPC::BPC16, SharpSmooth8Tap) => prep_8tap_sharp_smooth_16bpc_neon(
tmp_ptr, src_ptr, src_stride, w, h, mx, my, bd_c, src_ffi,
),
(BPC::BPC16, Sharp8Tap) => prep_8tap_sharp_16bpc_neon(
tmp_ptr, src_ptr, src_stride, w, h, mx, my, bd_c, src_ffi,
),
(BPC::BPC16, Bilinear) => prep_bilin_16bpc_neon(
tmp_ptr, src_ptr, src_stride, w, h, mx, my, bd_c, src_ffi,
),
}
}
}
}
#[cfg(not(feature = "asm"))]
{
let token = Arm64::summon().unwrap();
let w_u = w as usize;
let h_u = h as usize;
let mx_u = mx as usize;
let my_u = my as usize;
let tmp_slice = &mut tmp[..(w_u * h_u)];
let (src_guard, src_base) = src.full_guard::<BD>();
let src_stride_raw = src.stride();
match BD::BPC {
BPC::BPC8 => {
use zerocopy::IntoBytes;
let src_bytes = src_guard.as_bytes();
let src_stride_u = src_stride_raw as usize;
if filter == Bilinear {
let src_slice = &src_bytes[src_base..];
prep_bilin_8bpc_inner(
token,
tmp_slice,
src_slice,
src_stride_u,
w_u,
h_u,
mx,
my,
);
} else {
let src_start = src_base.wrapping_sub(3 * src_stride_u + 3);
let src_len = (h_u + 7) * src_stride_u + w_u + 7;
let src_full = &src_bytes[src_start..][..src_len];
let src_adj = &src_full[3 * src_stride_u + 3..];
prep_8tap_8bpc_inner(
token,
tmp_slice,
src_adj,
src_stride_u,
w_u,
h_u,
mx_u,
my_u,
get_h_filter_type(filter),
get_v_filter_type(filter),
);
}
}
BPC::BPC16 => {
use zerocopy::{FromBytes, IntoBytes};
let src_bytes = src_guard.as_bytes();
let src_stride_u16 = (src_stride_raw as usize) / 2;
if filter == Bilinear {
let src_start = src_base * 2;
let src_byte_len = ((h_u + 1) * src_stride_u16 + w_u + 1) * 2;
let src_u16: &[u16] =
FromBytes::ref_from_bytes(&src_bytes[src_start..src_start + src_byte_len])
.unwrap();
prep_bilin_16bpc_inner(
token,
tmp_slice,
src_u16,
src_stride_u16,
w_u,
h_u,
mx,
my,
);
} else {
let src_start_u16 = src_base.wrapping_sub(3 * src_stride_u16 + 3);
let src_start = src_start_u16 * 2;
let src_len = (h_u + 7) * src_stride_u16 + w_u + 7;
let src_byte_len = src_len * 2;
let src_full: &[u16] =
FromBytes::ref_from_bytes(&src_bytes[src_start..src_start + src_byte_len])
.unwrap();
let src_adj = &src_full[3 * src_stride_u16 + 3..];
prep_8tap_16bpc_inner(
token,
tmp_slice,
src_adj,
src_stride_u16,
w_u,
h_u,
mx_u,
my_u,
get_h_filter_type(filter),
get_v_filter_type(filter),
);
}
}
}
}
true
}
#[cfg(target_arch = "aarch64")]
pub fn mc_scaled_dispatch<BD: BitDepth>(
_filter: Filter2d,
_dst: PicOffset,
_src: PicOffset,
_w: i32,
_h: i32,
_mx: i32,
_my: i32,
_dx: i32,
_dy: i32,
_bd: BD,
) -> bool {
false
}
#[cfg(target_arch = "aarch64")]
pub fn mct_scaled_dispatch<BD: BitDepth>(
_filter: Filter2d,
_tmp: &mut [i16],
_src: PicOffset,
_w: i32,
_h: i32,
_mx: i32,
_my: i32,
_dx: i32,
_dy: i32,
_bd: BD,
) -> bool {
false
}
#[cfg(target_arch = "aarch64")]
pub fn warp8x8_dispatch<BD: BitDepth>(
_dst: PicOffset,
_src: PicOffset,
_abcd: &[i16; 4],
_mx: i32,
_my: i32,
_bd: BD,
) -> bool {
false
}
#[cfg(target_arch = "aarch64")]
pub fn warp8x8t_dispatch<BD: BitDepth>(
_tmp: &mut [i16],
_tmp_stride: usize,
_src: PicOffset,
_abcd: &[i16; 4],
_mx: i32,
_my: i32,
_bd: BD,
) -> bool {
false
}
#[cfg(target_arch = "aarch64")]
pub fn emu_edge_dispatch<BD: BitDepth>(
_bw: isize,
_bh: isize,
_iw: isize,
_ih: isize,
_x: isize,
_y: isize,
_dst: &mut [BD::Pixel; crate::src::internal::EMU_EDGE_LEN],
_dst_pxstride: usize,
_src: &crate::include::dav1d::picture::Rav1dPictureDataComponent,
) -> bool {
false
}
#[cfg(target_arch = "aarch64")]
pub fn resize_dispatch<BD: BitDepth>(
_dst: crate::src::with_offset::WithOffset<
crate::src::pic_or_buf::PicOrBuf<crate::src::align::AlignedVec64<u8>>,
>,
_src: PicOffset,
_dst_w: usize,
_h: usize,
_src_w: usize,
_dx: i32,
_mx: i32,
_bd: BD,
) -> bool {
false
}