#![allow(deprecated)] #![allow(dead_code)]
#![cfg_attr(not(feature = "unchecked"), forbid(unsafe_code))]
#![cfg_attr(feature = "unchecked", deny(unsafe_code))]
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
#[cfg(target_arch = "x86_64")]
use crate::src::safe_simd::pixel_access::Flex;
#[cfg(target_arch = "x86_64")]
use crate::src::safe_simd::pixel_access::{
loadi64, loadu_128, loadu_256, loadu_512, storeu_128, storeu_256, storeu_512,
};
#[cfg(target_arch = "x86_64")]
use archmage::{Desktop64, Server64, arcane, rite};
#[cfg(target_arch = "x86_64")]
use crate::include::common::bitdepth::BitDepth;
use crate::include::dav1d::headers::Rav1dFilterMode;
#[cfg(target_arch = "x86_64")]
use crate::include::dav1d::headers::Rav1dPixelLayoutSubSampled;
#[cfg(target_arch = "x86_64")]
use crate::include::dav1d::picture::PicOffset;
#[cfg(target_arch = "x86_64")]
use crate::src::internal::COMPINTER_LEN;
#[cfg(target_arch = "x86_64")]
use crate::src::levels::Filter2d;
#[cfg(target_arch = "x86_64")]
use crate::src::strided::Strided as _;
use std::cell::Cell;
// Boxed two-dimensional scratch arrays: 135 or 130 rows of `MID_STRIDE` elements,
// in `i16` and `i32` flavours.
// NOTE(review): the consumers are not visible in this part of the file; presumably these
// hold intermediate rows for two-pass filters — confirm against the callers.
type Mid16x135 = Box<[[i16; MID_STRIDE]; 135]>;
type Mid32x135 = Box<[[i32; MID_STRIDE]; 135]>;
type Mid16x130 = Box<[[i16; MID_STRIDE]; 130]>;
type Mid32x130 = Box<[[i32; MID_STRIDE]; 130]>;
// One per-thread slot for each buffer shape. Every slot starts out as `None`; the
// `take_mid_*` functions empty a slot (allocating when it holds nothing) and the
// `put_mid_*` functions refill it, so an allocation can be reused across calls.
thread_local! {
static MID_I16_135: Cell<Option<Mid16x135>> = const { Cell::new(None) };
static MID_I32_135: Cell<Option<Mid32x135>> = const { Cell::new(None) };
static MID_I16_130: Cell<Option<Mid16x130>> = const { Cell::new(None) };
static MID_I32_130: Cell<Option<Mid32x130>> = const { Cell::new(None) };
}
#[inline(always)]
fn take_mid_i16_135() -> Mid16x135 {
MID_I16_135
.with(|c| c.take())
.unwrap_or_else(|| Box::new([[0; MID_STRIDE]; 135]))
}
/// Parks `mid` in this thread's 135-row `i16` slot so that the next
/// [`take_mid_i16_135`] call can reuse the allocation.
///
/// A buffer that was already sitting in the slot is dropped.
#[inline(always)]
fn put_mid_i16_135(mid: Mid16x135) {
    MID_I16_135.with(|slot| {
        let evicted = slot.replace(Some(mid));
        drop(evicted);
    });
}
#[inline(always)]
fn take_mid_i32_135() -> Mid32x135 {
MID_I32_135
.with(|c| c.take())
.unwrap_or_else(|| Box::new([[0; MID_STRIDE]; 135]))
}
/// Parks `mid` in this thread's 135-row `i32` slot so that the next
/// [`take_mid_i32_135`] call can reuse the allocation.
///
/// A buffer that was already sitting in the slot is dropped.
#[inline(always)]
fn put_mid_i32_135(mid: Mid32x135) {
    MID_I32_135.with(|slot| {
        let evicted = slot.replace(Some(mid));
        drop(evicted);
    });
}
#[inline(always)]
fn take_mid_i16_130() -> Mid16x130 {
MID_I16_130
.with(|c| c.take())
.unwrap_or_else(|| Box::new([[0; MID_STRIDE]; 130]))
}
/// Parks `mid` in this thread's 130-row `i16` slot so that the next
/// [`take_mid_i16_130`] call can reuse the allocation.
///
/// A buffer that was already sitting in the slot is dropped.
#[inline(always)]
fn put_mid_i16_130(mid: Mid16x130) {
    MID_I16_130.with(|slot| {
        let evicted = slot.replace(Some(mid));
        drop(evicted);
    });
}
#[inline(always)]
fn take_mid_i32_130() -> Mid32x130 {
MID_I32_130
.with(|c| c.take())
.unwrap_or_else(|| Box::new([[0; MID_STRIDE]; 130]))
}
/// Parks `mid` in this thread's 130-row `i32` slot so that the next
/// [`take_mid_i32_130`] call can reuse the allocation.
///
/// A buffer that was already sitting in the slot is dropped.
#[inline(always)]
fn put_mid_i32_130(mid: Mid32x130) {
    MID_I32_130.with(|slot| {
        let evicted = slot.replace(Some(mid));
        drop(evicted);
    });
}
/// Multiplier handed to the `mulhrs` intrinsics in the 8 bpc `avg` kernels.
/// The scalar tails spell out the same step as `(sum * 1024 + 16384) >> 15`.
const PW_1024: i16 = 1024;
/// Averages two 16-bit intermediate buffers into 8-bit pixels with AVX2.
///
/// Each output pixel is `((tmp1 + tmp2) * 1024 + 16384) >> 15`, clamped to `0..=255`;
/// the addition wraps in 16 bits, both in the vector code and in the scalar tail.
///
/// * `dst`, `dst_stride` – output bytes; row `r` starts at byte `r * dst_stride`.
/// * `tmp1`, `tmp2` – inputs, read as `h` rows of `w` tightly packed values.
/// * `w`, `h` – block width and height in pixels.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn avg_8bpc_avx2_safe(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
) {
    let mut dst = dst.flex_mut();
    let width = w as usize;
    let height = h as usize;
    let pw_1024 = _mm256_set1_epi16(PW_1024);
    for y in 0..height {
        let src_a = &tmp1[y * width..][..width];
        let src_b = &tmp2[y * width..][..width];
        let out = &mut dst[y * dst_stride..][..width];
        let mut x = 0;
        // 32 pixels per iteration.
        while x + 32 <= width {
            let a0 = loadu_256!(&src_a[x..x + 16], [i16; 16]);
            let a1 = loadu_256!(&src_a[x + 16..x + 32], [i16; 16]);
            let b0 = loadu_256!(&src_b[x..x + 16], [i16; 16]);
            let b1 = loadu_256!(&src_b[x + 16..x + 32], [i16; 16]);
            let r0 = _mm256_mulhrs_epi16(_mm256_add_epi16(a0, b0), pw_1024);
            let r1 = _mm256_mulhrs_epi16(_mm256_add_epi16(a1, b1), pw_1024);
            // The pack works per 128-bit lane, so the 64-bit quarters come out as
            // r0[0..8], r1[0..8], r0[8..16], r1[8..16]; the permute swaps the middle two.
            let interleaved = _mm256_packus_epi16(r0, r1);
            let bytes = _mm256_permute4x64_epi64(interleaved, 0b11_01_10_00);
            storeu_256!(&mut out[x..x + 32], [u8; 32], bytes);
            x += 32;
        }
        // 16 pixels per iteration.
        while x + 16 <= width {
            let a = loadu_256!(&src_a[x..x + 16], [i16; 16]);
            let b = loadu_256!(&src_b[x..x + 16], [i16; 16]);
            let r = _mm256_mulhrs_epi16(_mm256_add_epi16(a, b), pw_1024);
            // Packing the register with itself puts pixels 0..8 in the low half of lane 0
            // and pixels 8..16 in the low half of lane 1; join those two halves.
            let doubled = _mm256_packus_epi16(r, r);
            let lane0 = _mm256_castsi256_si128(doubled);
            let lane1 = _mm256_extracti128_si256(doubled, 1);
            let bytes = _mm_unpacklo_epi64(lane0, lane1);
            storeu_128!(&mut out[x..x + 16], [u8; 16], bytes);
            x += 16;
        }
        // Scalar tail for whatever is left of the row.
        for i in x..width {
            let wrapped = src_a[i].wrapping_add(src_b[i]);
            out[i] = ((wrapped as i32 * 1024 + 16384) >> 15).clamp(0, 255) as u8;
        }
    }
}
/// C-ABI entry point for the 8 bpc AVX2 `avg` kernel.
///
/// Wraps the raw destination pointer in a byte slice of `h * dst_stride` bytes and
/// forwards to `avg_8bpc_avx2_safe`. `_bitdepth_max` and `_dst` are not used.
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of `h * dst_stride` bytes, and nothing
///   else may access that memory during the call.
/// * The CPU must support AVX2.
/// * `dst_stride` is cast to `usize`, so a negative stride would produce an enormous
///   slice length. NOTE(review): confirm callers never pass one.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn avg_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    _bitdepth_max: i32,
    _dst: *const FFISafe<PicOffset>,
) {
    let byte_len = h as usize * dst_stride as usize;
    // SAFETY: the caller guarantees that `dst_ptr` addresses `byte_len` writable bytes
    // that are not aliased for the duration of this call.
    let dst = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, byte_len) };
    let token = Desktop64::forge_token_dangerously();
    avg_8bpc_avx2_safe(token, dst, dst_stride as usize, tmp1, tmp2, w, h)
}
#[cfg(target_arch = "x86_64")]
#[arcane]
fn avg_8bpc_avx512_safe(
_token: Server64,
dst: &mut [u8],
dst_stride: usize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
) {
let mut dst = dst.flex_mut();
let w = w as usize;
let h = h as usize;
let round = _mm512_set1_epi16(PW_1024);
let zero = _mm512_setzero_si512();
for row in 0..h {
let tmp1_row = &tmp1[row * w..][..w];
let tmp2_row = &tmp2[row * w..][..w];
let dst_row = &mut dst[row * dst_stride..][..w];
let mut col = 0;
while col + 64 <= w {
let t1_lo = loadu_512!(&tmp1_row[col..col + 32], [i16; 32]);
let t2_lo = loadu_512!(&tmp2_row[col..col + 32], [i16; 32]);
let sum_lo = _mm512_add_epi16(t1_lo, t2_lo);
let avg_lo = _mm512_mulhrs_epi16(sum_lo, round);
let avg_lo = _mm512_max_epi16(avg_lo, zero); let result_lo: __m256i = _mm512_cvtusepi16_epi8(avg_lo);
let t1_hi = loadu_512!(&tmp1_row[col + 32..col + 64], [i16; 32]);
let t2_hi = loadu_512!(&tmp2_row[col + 32..col + 64], [i16; 32]);
let sum_hi = _mm512_add_epi16(t1_hi, t2_hi);
let avg_hi = _mm512_mulhrs_epi16(sum_hi, round);
let avg_hi = _mm512_max_epi16(avg_hi, zero);
let result_hi: __m256i = _mm512_cvtusepi16_epi8(avg_hi);
let combined = _mm512_inserti64x4::<1>(_mm512_castsi256_si512(result_lo), result_hi);
storeu_512!(&mut dst_row[col..col + 64], [u8; 64], combined);
col += 64;
}
while col + 32 <= w {
let t1 = loadu_512!(&tmp1_row[col..col + 32], [i16; 32]);
let t2 = loadu_512!(&tmp2_row[col..col + 32], [i16; 32]);
let sum = _mm512_add_epi16(t1, t2);
let avg = _mm512_mulhrs_epi16(sum, round);
let avg = _mm512_max_epi16(avg, zero);
let result: __m256i = _mm512_cvtusepi16_epi8(avg);
storeu_256!(&mut dst_row[col..col + 32], [u8; 32], result);
col += 32;
}
let round_256 = _mm256_set1_epi16(PW_1024);
while col + 16 <= w {
let t1 = loadu_256!(&tmp1_row[col..col + 16], [i16; 16]);
let t2 = loadu_256!(&tmp2_row[col..col + 16], [i16; 16]);
let sum = _mm256_add_epi16(t1, t2);
let avg = _mm256_mulhrs_epi16(sum, round_256);
let packed = _mm256_packus_epi16(avg, avg);
let lo = _mm256_castsi256_si128(packed);
let hi = _mm256_extracti128_si256(packed, 1);
let result = _mm_unpacklo_epi64(lo, hi);
storeu_128!(&mut dst_row[col..col + 16], [u8; 16], result);
col += 16;
}
while col < w {
let sum = tmp1_row[col].wrapping_add(tmp2_row[col]);
let avg = ((sum as i32 * 1024 + 16384) >> 15).clamp(0, 255) as u8;
dst_row[col] = avg;
col += 1;
}
}
}
/// Averages two 16-bit intermediate buffers into 16-bit pixels with AVX2.
///
/// Each output pixel is `(tmp1 + tmp2 + rnd) >> sh`, clamped to `0..=bitdepth_max`.
/// `intermediate_bits` is 2 when `bitdepth_max >> 11` is non-zero and 4 otherwise;
/// from it, `sh = intermediate_bits + 1` and `rnd = (1 << intermediate_bits) + 8192 * 2`.
///
/// * `dst`, `dst_stride` – output bytes; row `r` starts at byte `r * dst_stride` and its
///   first `2 * w` bytes are reinterpreted as `u16` pixels.
/// * `tmp1`, `tmp2` – inputs, read as `h` rows of `w` tightly packed values.
/// * `w`, `h` – block width and height in pixels.
///
/// # Panics
///
/// Panics if the byte-to-`u16` reinterpretation of a destination row fails.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn avg_16bpc_avx2_safe(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let width = w as usize;
    let height = h as usize;
    let intermediate_bits: i32 = if bitdepth_max >> 11 == 0 { 4 } else { 2 };
    let sh = intermediate_bits + 1;
    let rnd = (1 << intermediate_bits) + 8192 * 2;
    let max = bitdepth_max;
    let v_rnd = _mm256_set1_epi32(rnd);
    let v_zero = _mm256_setzero_si256();
    let v_max = _mm256_set1_epi32(max);
    for y in 0..height {
        let src_a = &tmp1[y * width..][..width];
        let src_b = &tmp2[y * width..][..width];
        let out_bytes = &mut dst[y * dst_stride..][..width * 2];
        let out: &mut [u16] = zerocopy::FromBytes::mut_from_bytes(out_bytes).unwrap();
        let mut x = 0usize;
        // 16 pixels per iteration, widened to two vectors of eight 32-bit lanes.
        while x + 16 <= width {
            let a = loadu_256!(&src_a[x..x + 16], [i16; 16]);
            let b = loadu_256!(&src_b[x..x + 16], [i16; 16]);
            let a_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(a));
            let b_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(b));
            let a_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(a, 1));
            let b_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b, 1));
            let acc_lo = _mm256_add_epi32(_mm256_add_epi32(a_lo, b_lo), v_rnd);
            let acc_hi = _mm256_add_epi32(_mm256_add_epi32(a_hi, b_hi), v_rnd);
            // The shift count must be an immediate, hence one branch per possible `sh`.
            let (shifted_lo, shifted_hi) = if sh != 3 {
                (_mm256_srai_epi32(acc_lo, 5), _mm256_srai_epi32(acc_hi, 5))
            } else {
                (_mm256_srai_epi32(acc_lo, 3), _mm256_srai_epi32(acc_hi, 3))
            };
            let pix_lo = _mm256_min_epi32(_mm256_max_epi32(shifted_lo, v_zero), v_max);
            let pix_hi = _mm256_min_epi32(_mm256_max_epi32(shifted_hi, v_zero), v_max);
            // The pack interleaves per 128-bit lane; the permute restores pixel order.
            let interleaved = _mm256_packus_epi32(pix_lo, pix_hi);
            let ordered = _mm256_permute4x64_epi64(interleaved, 0b11011000);
            storeu_256!(&mut out[x..x + 16], [u16; 16], ordered);
            x += 16;
        }
        // Scalar tail for whatever is left of the row.
        for i in x..width {
            let sum = src_a[i] as i32 + src_b[i] as i32;
            out[i] = ((sum + rnd) >> sh).clamp(0, max) as u16;
        }
    }
}
/// C-ABI entry point for the 16 bpc AVX2 `avg` kernel.
///
/// Wraps the raw destination pointer in a byte slice of `h * dst_stride` bytes and
/// forwards to `avg_16bpc_avx2_safe`. `_dst` is not used.
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of `h * dst_stride` bytes, and nothing
///   else may access that memory during the call.
/// * The CPU must support AVX2.
/// * `dst_stride` is cast to `usize`, so a negative stride would produce an enormous
///   slice length. NOTE(review): confirm callers never pass one.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn avg_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    bitdepth_max: i32,
    _dst: *const FFISafe<PicOffset>,
) {
    let byte_len = h as usize * dst_stride as usize;
    // SAFETY: the caller guarantees that `dst_ptr` addresses `byte_len` writable bytes
    // that are not aliased for the duration of this call.
    let dst = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, byte_len) };
    let token = Desktop64::forge_token_dangerously();
    avg_16bpc_avx2_safe(token, dst, dst_stride as usize, tmp1, tmp2, w, h, bitdepth_max)
}
/// Averages two 16-bit intermediate buffers into 16-bit pixels with AVX-512.
///
/// Each output pixel is `(tmp1 + tmp2 + rnd) >> sh`, clamped to `0..=bitdepth_max`.
/// `intermediate_bits` is 2 when `bitdepth_max >> 11` is non-zero and 4 otherwise;
/// from it, `sh = intermediate_bits + 1` and `rnd = (1 << intermediate_bits) + 8192 * 2`.
/// Rows are processed in chunks of 32 and 16 pixels, followed by a scalar tail.
///
/// * `dst`, `dst_stride` – output bytes; row `r` starts at byte `r * dst_stride` and its
///   first `2 * w` bytes are reinterpreted as `u16` pixels.
/// * `tmp1`, `tmp2` – inputs, read as `h` rows of `w` tightly packed values.
/// * `w`, `h` – block width and height in pixels.
///
/// # Panics
///
/// Panics if the byte-to-`u16` reinterpretation of a destination row fails.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn avg_16bpc_avx512_safe(
    _token: Server64,
    dst: &mut [u8],
    dst_stride: usize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let w = w as usize;
    let h = h as usize;
    let intermediate_bits = if (bitdepth_max >> 11) != 0 {
        2i32
    } else {
        4i32
    };
    let sh = intermediate_bits + 1;
    let rnd = (1 << intermediate_bits) + 8192 * 2;
    let max = bitdepth_max;
    // Loop-invariant constants for the 512-bit path...
    let rnd_vec = _mm512_set1_epi32(rnd);
    let zero = _mm512_setzero_si512();
    let max_vec = _mm512_set1_epi32(max);
    // ...and for the 256-bit path, built once instead of once per iteration.
    let rnd_256 = _mm256_set1_epi32(rnd);
    let zero_256 = _mm256_setzero_si256();
    let max_256 = _mm256_set1_epi32(max);
    for row in 0..h {
        let tmp1_row = &tmp1[row * w..][..w];
        let tmp2_row = &tmp2[row * w..][..w];
        let dst_row_bytes = &mut dst[row * dst_stride..][..w * 2];
        let dst_row: &mut [u16] = zerocopy::FromBytes::mut_from_bytes(dst_row_bytes).unwrap();
        let mut col = 0usize;
        // 32 pixels per iteration: each half is widened to sixteen 32-bit lanes.
        while col + 32 <= w {
            let t1_full = loadu_512!(&tmp1_row[col..col + 32], [i16; 32]);
            let t2_full = loadu_512!(&tmp2_row[col..col + 32], [i16; 32]);
            let t1_lo = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(t1_full));
            let t2_lo = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(t2_full));
            let sum_lo = _mm512_add_epi32(_mm512_add_epi32(t1_lo, t2_lo), rnd_vec);
            // The shift count must be an immediate, hence one branch per possible `sh`.
            let result_lo = if sh == 3 {
                _mm512_srai_epi32::<3>(sum_lo)
            } else {
                _mm512_srai_epi32::<5>(sum_lo)
            };
            let clamped_lo = _mm512_min_epi32(_mm512_max_epi32(result_lo, zero), max_vec);
            let packed_lo: __m256i = _mm512_cvtusepi32_epi16(clamped_lo);
            let t1_hi = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(t1_full));
            let t2_hi = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(t2_full));
            let sum_hi = _mm512_add_epi32(_mm512_add_epi32(t1_hi, t2_hi), rnd_vec);
            let result_hi = if sh == 3 {
                _mm512_srai_epi32::<3>(sum_hi)
            } else {
                _mm512_srai_epi32::<5>(sum_hi)
            };
            let clamped_hi = _mm512_min_epi32(_mm512_max_epi32(result_hi, zero), max_vec);
            let packed_hi: __m256i = _mm512_cvtusepi32_epi16(clamped_hi);
            let combined = _mm512_inserti64x4::<1>(_mm512_castsi256_si512(packed_lo), packed_hi);
            storeu_512!(&mut dst_row[col..col + 32], [u16; 32], combined);
            col += 32;
        }
        // 16 pixels per iteration, using 256-bit operations.
        while col + 16 <= w {
            let t1 = loadu_256!(&tmp1_row[col..col + 16], [i16; 16]);
            let t2 = loadu_256!(&tmp2_row[col..col + 16], [i16; 16]);
            let t1_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(t1));
            let t2_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(t2));
            let t1_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(t1, 1));
            let t2_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(t2, 1));
            let sum_lo = _mm256_add_epi32(_mm256_add_epi32(t1_lo, t2_lo), rnd_256);
            let sum_hi = _mm256_add_epi32(_mm256_add_epi32(t1_hi, t2_hi), rnd_256);
            let (result_lo, result_hi) = if sh == 3 {
                (_mm256_srai_epi32(sum_lo, 3), _mm256_srai_epi32(sum_hi, 3))
            } else {
                (_mm256_srai_epi32(sum_lo, 5), _mm256_srai_epi32(sum_hi, 5))
            };
            let clamped_lo = _mm256_min_epi32(_mm256_max_epi32(result_lo, zero_256), max_256);
            let clamped_hi = _mm256_min_epi32(_mm256_max_epi32(result_hi, zero_256), max_256);
            // The pack interleaves per 128-bit lane; the permute restores pixel order.
            let packed = _mm256_packus_epi32(clamped_lo, clamped_hi);
            let packed = _mm256_permute4x64_epi64(packed, 0b11011000);
            storeu_256!(&mut dst_row[col..col + 16], [u16; 16], packed);
            col += 16;
        }
        // Scalar tail for whatever is left of the row.
        while col < w {
            let sum = tmp1_row[col] as i32 + tmp2_row[col] as i32;
            let val = ((sum + rnd) >> sh).clamp(0, max) as u16;
            dst_row[col] = val;
            col += 1;
        }
    }
}
/// C-ABI `avg` entry point for 8 bpc, compiled with SSE4.1 enabled.
///
/// The body contains no SSE4.1-specific code: every argument is forwarded unchanged
/// to [`avg_scalar`].
///
/// # Safety
///
/// The requirements are those of [`avg_scalar`], which dereferences `dst_ptr`; in
/// addition the CPU must support SSE4.1.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "sse4.1")]
pub unsafe extern "C" fn avg_8bpc_sse4(
dst_ptr: *mut DynPixel,
dst_stride: isize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
) {
// SAFETY: the caller's obligations are passed on unchanged to `avg_scalar`.
unsafe { avg_scalar(dst_ptr, dst_stride, tmp1, tmp2, w, h, bitdepth_max, _dst) }
}
/// Scalar C-ABI `avg` implementation that writes 8-bit pixels.
///
/// Each output pixel is `((tmp1 + tmp2) * 1024 + 16384) >> 15`, clamped to `0..=255`;
/// the addition wraps in 16 bits. Row `r` is written at `dst_ptr + r * dst_stride`,
/// computed with signed pointer arithmetic. `_bitdepth_max` and `_dst` are not used.
///
/// # Safety
///
/// For every row `r` in `0..h`, `dst_ptr + r * dst_stride` must be valid for writes of
/// `w` bytes, and nothing else may access that memory during the call.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
pub unsafe extern "C" fn avg_scalar(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    _bitdepth_max: i32,
    _dst: *const FFISafe<PicOffset>,
) {
    let width = w as usize;
    let height = h as usize;
    let base = dst_ptr as *mut u8;
    for y in 0..height {
        let src_a = &tmp1[y * width..][..width];
        let src_b = &tmp2[y * width..][..width];
        // SAFETY: the caller guarantees that each destination row holds `width`
        // writable, unaliased bytes at `base + y * dst_stride`.
        let out = unsafe {
            std::slice::from_raw_parts_mut(base.offset(y as isize * dst_stride), width)
        };
        for ((px, &a), &b) in out.iter_mut().zip(src_a).zip(src_b) {
            let wrapped = a.wrapping_add(b);
            *px = ((wrapped as i32 * 1024 + 16384) >> 15).clamp(0, 255) as u8;
        }
    }
}
/// Multiplier handed to the `mulhrs` intrinsics in the 8 bpc `w_avg` kernels.
/// The scalar tails spell out the same step as `(sum + 8) >> 4`.
const PW_2048: i16 = 2048;
/// Weighted average of two 16-bit intermediate buffers into 8-bit pixels with AVX2.
///
/// The inputs are ordered by `weight`: for `weight > 7` the lead buffer is `tmp1` with
/// factor `(weight - 16) << 12`, otherwise the lead buffer is `tmp2` with factor
/// `-weight << 12` (both truncated to `i16`). Each output pixel is then
/// `(lead + (((lead - other) * factor) >> 16) + 8) >> 4`, clamped to `0..=255`.
///
/// Rows are processed 32 pixels at a time; the remainder is handled by a scalar tail
/// that does the same computation in 32-bit arithmetic.
///
/// * `dst`, `dst_stride` – output bytes; row `r` starts at byte `r * dst_stride`.
/// * `tmp1`, `tmp2` – inputs, read as `h` rows of `w` tightly packed values.
/// * `w`, `h` – block width and height in pixels.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn w_avg_8bpc_avx2_safe(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    weight: i32,
) {
    let mut dst = dst.flex_mut();
    let width = w as usize;
    let height = h as usize;
    let (lead, other, factor) = if weight > 7 {
        (tmp1, tmp2, ((weight - 16) << 12) as i16)
    } else {
        (tmp2, tmp1, ((-weight) << 12) as i16)
    };
    let v_factor = _mm256_set1_epi16(factor);
    let v_round = _mm256_set1_epi16(PW_2048);
    for y in 0..height {
        let lead_row = &lead[y * width..][..width];
        let other_row = &other[y * width..][..width];
        let out = &mut dst[y * dst_stride..][..width];
        let mut x = 0;
        // 32 pixels per iteration.
        while x + 32 <= width {
            let a0 = loadu_256!(&lead_row[x..x + 16], [i16; 16]);
            let a1 = loadu_256!(&lead_row[x + 16..x + 32], [i16; 16]);
            let b0 = loadu_256!(&other_row[x..x + 16], [i16; 16]);
            let b1 = loadu_256!(&other_row[x + 16..x + 32], [i16; 16]);
            // High 16 bits of (lead - other) * factor, added back onto lead.
            let adj0 = _mm256_mulhi_epi16(_mm256_sub_epi16(a0, b0), v_factor);
            let adj1 = _mm256_mulhi_epi16(_mm256_sub_epi16(a1, b1), v_factor);
            let r0 = _mm256_mulhrs_epi16(_mm256_add_epi16(a0, adj0), v_round);
            let r1 = _mm256_mulhrs_epi16(_mm256_add_epi16(a1, adj1), v_round);
            // The pack interleaves per 128-bit lane; the permute restores pixel order.
            let interleaved = _mm256_packus_epi16(r0, r1);
            let bytes = _mm256_permute4x64_epi64(interleaved, 0b11_01_10_00);
            storeu_256!(&mut out[x..x + 32], [u8; 32], bytes);
            x += 32;
        }
        // Scalar tail for whatever is left of the row.
        for i in x..width {
            let lead_px = lead_row[i] as i32;
            let delta = lead_px - other_row[i] as i32;
            let blended = lead_px + ((delta * (factor as i32)) >> 16);
            out[i] = ((blended + 8) >> 4).clamp(0, 255) as u8;
        }
    }
}
/// C-ABI entry point for the 8 bpc AVX2 `w_avg` kernel.
///
/// Wraps the raw destination pointer in a byte slice of `h * dst_stride` bytes and
/// forwards to `w_avg_8bpc_avx2_safe`. `_bitdepth_max` and `_dst` are not used.
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of `h * dst_stride` bytes, and nothing
///   else may access that memory during the call.
/// * The CPU must support AVX2.
/// * `dst_stride` is cast to `usize`, so a negative stride would produce an enormous
///   slice length. NOTE(review): confirm callers never pass one.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn w_avg_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    weight: i32,
    _bitdepth_max: i32,
    _dst: *const FFISafe<PicOffset>,
) {
    let byte_len = h as usize * dst_stride as usize;
    // SAFETY: the caller guarantees that `dst_ptr` addresses `byte_len` writable bytes
    // that are not aliased for the duration of this call.
    let dst = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, byte_len) };
    let token = Desktop64::forge_token_dangerously();
    w_avg_8bpc_avx2_safe(token, dst, dst_stride as usize, tmp1, tmp2, w, h, weight)
}
/// Weighted average of two 16-bit intermediate buffers into 8-bit pixels with AVX-512.
///
/// The inputs are ordered by `weight`: for `weight > 7` the lead buffer is `tmp1` with
/// factor `(weight - 16) << 12`, otherwise the lead buffer is `tmp2` with factor
/// `-weight << 12` (both truncated to `i16`). Each output pixel is then
/// `(lead + (((lead - other) * factor) >> 16) + 8) >> 4`, clamped to `0..=255`.
///
/// Rows are processed in chunks of 64 and 32 pixels; the remainder is handled by a
/// scalar tail that does the same computation in 32-bit arithmetic.
///
/// * `dst`, `dst_stride` – output bytes; row `r` starts at byte `r * dst_stride`.
/// * `tmp1`, `tmp2` – inputs, read as `h` rows of `w` tightly packed values.
/// * `w`, `h` – block width and height in pixels.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn w_avg_8bpc_avx512_safe(
    _token: Server64,
    dst: &mut [u8],
    dst_stride: usize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    weight: i32,
) {
    let mut dst = dst.flex_mut();
    let width = w as usize;
    let height = h as usize;
    let (lead, other, factor) = if weight > 7 {
        (tmp1, tmp2, ((weight - 16) << 12) as i16)
    } else {
        (tmp2, tmp1, ((-weight) << 12) as i16)
    };
    let v_factor = _mm512_set1_epi16(factor);
    let v_round = _mm512_set1_epi16(PW_2048);
    let v_zero = _mm512_setzero_si512();
    for y in 0..height {
        let lead_row = &lead[y * width..][..width];
        let other_row = &other[y * width..][..width];
        let out = &mut dst[y * dst_stride..][..width];
        let mut x = 0;
        // 64 pixels per iteration: two 32-lane halves, narrowed and glued together.
        while x + 64 <= width {
            let a0 = loadu_512!(&lead_row[x..x + 32], [i16; 32]);
            let b0 = loadu_512!(&other_row[x..x + 32], [i16; 32]);
            let adj0 = _mm512_mulhi_epi16(_mm512_sub_epi16(a0, b0), v_factor);
            let r0 = _mm512_mulhrs_epi16(_mm512_add_epi16(a0, adj0), v_round);
            // The narrowing conversion saturates as unsigned, so negative results are
            // clamped to zero first.
            let r0 = _mm512_max_epi16(r0, v_zero);
            let half_lo: __m256i = _mm512_cvtusepi16_epi8(r0);
            let a1 = loadu_512!(&lead_row[x + 32..x + 64], [i16; 32]);
            let b1 = loadu_512!(&other_row[x + 32..x + 64], [i16; 32]);
            let adj1 = _mm512_mulhi_epi16(_mm512_sub_epi16(a1, b1), v_factor);
            let r1 = _mm512_mulhrs_epi16(_mm512_add_epi16(a1, adj1), v_round);
            let r1 = _mm512_max_epi16(r1, v_zero);
            let half_hi: __m256i = _mm512_cvtusepi16_epi8(r1);
            let bytes = _mm512_inserti64x4::<1>(_mm512_castsi256_si512(half_lo), half_hi);
            storeu_512!(&mut out[x..x + 64], [u8; 64], bytes);
            x += 64;
        }
        // 32 pixels per iteration.
        while x + 32 <= width {
            let a = loadu_512!(&lead_row[x..x + 32], [i16; 32]);
            let b = loadu_512!(&other_row[x..x + 32], [i16; 32]);
            let adj = _mm512_mulhi_epi16(_mm512_sub_epi16(a, b), v_factor);
            let r = _mm512_mulhrs_epi16(_mm512_add_epi16(a, adj), v_round);
            let r = _mm512_max_epi16(r, v_zero);
            let bytes: __m256i = _mm512_cvtusepi16_epi8(r);
            storeu_256!(&mut out[x..x + 32], [u8; 32], bytes);
            x += 32;
        }
        // Scalar tail for whatever is left of the row.
        for i in x..width {
            let lead_px = lead_row[i] as i32;
            let delta = lead_px - other_row[i] as i32;
            let blended = lead_px + ((delta * (factor as i32)) >> 16);
            out[i] = ((blended + 8) >> 4).clamp(0, 255) as u8;
        }
    }
}
/// Weighted average of two 16-bit intermediate buffers into 16-bit pixels with AVX2.
///
/// Each output pixel is `(tmp1 * weight + tmp2 * (16 - weight) + rnd) >> sh`, clamped
/// to `0..=bitdepth_max`. `intermediate_bits` is 2 when `bitdepth_max >> 11` is non-zero
/// and 4 otherwise; from it, `sh = intermediate_bits + 4` and
/// `rnd = (8 << intermediate_bits) + 8192 * 16`.
///
/// * `dst`, `dst_stride` – output bytes; row `r` starts at byte `r * dst_stride` and its
///   first `2 * w` bytes are reinterpreted as `u16` pixels.
/// * `tmp1`, `tmp2` – inputs, read as `h` rows of `w` tightly packed values.
/// * `w`, `h` – block width and height in pixels.
///
/// # Panics
///
/// Panics if the byte-to-`u16` reinterpretation of a destination row fails.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn w_avg_16bpc_avx2_safe(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    weight: i32,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let width = w as usize;
    let height = h as usize;
    let intermediate_bits: i32 = if bitdepth_max >> 11 == 0 { 4 } else { 2 };
    let sh = intermediate_bits + 4;
    let rnd = (8 << intermediate_bits) + 8192 * 16;
    let max = bitdepth_max;
    let inv_weight = 16 - weight;
    let v_rnd = _mm256_set1_epi32(rnd);
    let v_zero = _mm256_setzero_si256();
    let v_max = _mm256_set1_epi32(max);
    let v_weight = _mm256_set1_epi32(weight);
    let v_inv_weight = _mm256_set1_epi32(inv_weight);
    for y in 0..height {
        let src_a = &tmp1[y * width..][..width];
        let src_b = &tmp2[y * width..][..width];
        let out_bytes = &mut dst[y * dst_stride..][..width * 2];
        let out: &mut [u16] = zerocopy::FromBytes::mut_from_bytes(out_bytes).unwrap();
        let mut x = 0usize;
        // 8 pixels per iteration, widened to 32-bit lanes.
        while x + 8 <= width {
            let a16 = loadu_128!(&src_a[x..x + 8], [i16; 8]);
            let b16 = loadu_128!(&src_b[x..x + 8], [i16; 8]);
            let a = _mm256_cvtepi16_epi32(a16);
            let b = _mm256_cvtepi16_epi32(b16);
            let weighted_a = _mm256_mullo_epi32(a, v_weight);
            let weighted_b = _mm256_mullo_epi32(b, v_inv_weight);
            let acc = _mm256_add_epi32(_mm256_add_epi32(weighted_a, weighted_b), v_rnd);
            // The shift count must be an immediate, hence one branch per possible `sh`.
            let shifted = if sh != 6 {
                _mm256_srai_epi32(acc, 8)
            } else {
                _mm256_srai_epi32(acc, 6)
            };
            let pix = _mm256_min_epi32(_mm256_max_epi32(shifted, v_zero), v_max);
            // Packing the register with itself puts pixels 0..4 in the low half of lane 0
            // and pixels 4..8 in the low half of lane 1; join those two halves.
            let doubled = _mm256_packus_epi32(pix, pix);
            let lane0 = _mm256_castsi256_si128(doubled);
            let lane1 = _mm256_extracti128_si256(doubled, 1);
            let words = _mm_unpacklo_epi64(lane0, lane1);
            storeu_128!(&mut out[x..x + 8], [u16; 8], words);
            x += 8;
        }
        // Scalar tail for whatever is left of the row.
        for i in x..width {
            let a = src_a[i] as i32;
            let b = src_b[i] as i32;
            let blended = (a * weight + b * inv_weight + rnd) >> sh;
            out[i] = blended.clamp(0, max) as u16;
        }
    }
}
/// C-ABI entry point for the 16 bpc AVX2 `w_avg` kernel.
///
/// Wraps the raw destination pointer in a byte slice of `h * dst_stride` bytes and
/// forwards to `w_avg_16bpc_avx2_safe`. `_dst` is not used.
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of `h * dst_stride` bytes, and nothing
///   else may access that memory during the call.
/// * The CPU must support AVX2.
/// * `dst_stride` is cast to `usize`, so a negative stride would produce an enormous
///   slice length. NOTE(review): confirm callers never pass one.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn w_avg_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    weight: i32,
    bitdepth_max: i32,
    _dst: *const FFISafe<PicOffset>,
) {
    let byte_len = h as usize * dst_stride as usize;
    // SAFETY: the caller guarantees that `dst_ptr` addresses `byte_len` writable bytes
    // that are not aliased for the duration of this call.
    let dst = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, byte_len) };
    let token = Desktop64::forge_token_dangerously();
    w_avg_16bpc_avx2_safe(
        token,
        dst,
        dst_stride as usize,
        tmp1,
        tmp2,
        w,
        h,
        weight,
        bitdepth_max,
    )
}
/// Weighted average of two 16-bit intermediate buffers into 16-bit pixels with AVX-512.
///
/// Each output pixel is `(tmp1 * weight + tmp2 * (16 - weight) + rnd) >> sh`, clamped
/// to `0..=bitdepth_max`. `intermediate_bits` is 2 when `bitdepth_max >> 11` is non-zero
/// and 4 otherwise; from it, `sh = intermediate_bits + 4` and
/// `rnd = (8 << intermediate_bits) + 8192 * 16`.
///
/// * `dst`, `dst_stride` – output bytes; row `r` starts at byte `r * dst_stride` and its
///   first `2 * w` bytes are reinterpreted as `u16` pixels.
/// * `tmp1`, `tmp2` – inputs, read as `h` rows of `w` tightly packed values.
/// * `w`, `h` – block width and height in pixels.
///
/// # Panics
///
/// Panics if the byte-to-`u16` reinterpretation of a destination row fails.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn w_avg_16bpc_avx512_safe(
    _token: Server64,
    dst: &mut [u8],
    dst_stride: usize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    weight: i32,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let width = w as usize;
    let height = h as usize;
    let intermediate_bits: i32 = if bitdepth_max >> 11 == 0 { 4 } else { 2 };
    let sh = intermediate_bits + 4;
    let rnd = (8 << intermediate_bits) + 8192 * 16;
    let max = bitdepth_max;
    let inv_weight = 16 - weight;
    let v_rnd = _mm512_set1_epi32(rnd);
    let v_zero = _mm512_setzero_si512();
    let v_max = _mm512_set1_epi32(max);
    let v_weight = _mm512_set1_epi32(weight);
    let v_inv_weight = _mm512_set1_epi32(inv_weight);
    for y in 0..height {
        let src_a = &tmp1[y * width..][..width];
        let src_b = &tmp2[y * width..][..width];
        let out_bytes = &mut dst[y * dst_stride..][..width * 2];
        let out: &mut [u16] = zerocopy::FromBytes::mut_from_bytes(out_bytes).unwrap();
        let mut x = 0usize;
        // 16 pixels per iteration, widened to 32-bit lanes.
        while x + 16 <= width {
            let a = _mm512_cvtepi16_epi32(loadu_256!(&src_a[x..x + 16], [i16; 16]));
            let b = _mm512_cvtepi16_epi32(loadu_256!(&src_b[x..x + 16], [i16; 16]));
            let weighted_a = _mm512_mullo_epi32(a, v_weight);
            let weighted_b = _mm512_mullo_epi32(b, v_inv_weight);
            let acc = _mm512_add_epi32(_mm512_add_epi32(weighted_a, weighted_b), v_rnd);
            // The shift count is a const generic, hence one branch per possible `sh`.
            let shifted = if sh != 6 {
                _mm512_srai_epi32::<8>(acc)
            } else {
                _mm512_srai_epi32::<6>(acc)
            };
            let pix = _mm512_min_epi32(_mm512_max_epi32(shifted, v_zero), v_max);
            let words: __m256i = _mm512_cvtusepi32_epi16(pix);
            storeu_256!(&mut out[x..x + 16], [u16; 16], words);
            x += 16;
        }
        // Scalar tail for whatever is left of the row.
        for i in x..width {
            let a = src_a[i] as i32;
            let b = src_b[i] as i32;
            let blended = (a * weight + b * inv_weight + rnd) >> sh;
            out[i] = blended.clamp(0, max) as u16;
        }
    }
}
/// Scalar C-ABI `w_avg` implementation that writes 8-bit pixels.
///
/// Each output pixel is `(tmp1 * weight + tmp2 * (16 - weight) + 128) >> 8`, clamped to
/// `0..=255`. Row `r` is written at `dst_ptr + r * dst_stride`, computed with signed
/// pointer arithmetic. `_bitdepth_max` and `_dst` are not used.
///
/// # Safety
///
/// For every row `r` in `0..h`, `dst_ptr + r * dst_stride` must be valid for writes of
/// `w` bytes, and nothing else may access that memory during the call.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
pub unsafe extern "C" fn w_avg_scalar(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    weight: i32,
    _bitdepth_max: i32,
    _dst: *const FFISafe<PicOffset>,
) {
    // Fixed constants of this 8-bit path. The 16 bpc `w_avg` kernels in this file add
    // `8192 * 16` to the rounding term; here that extra term is zero, so it is omitted
    // rather than written as `0 * 16`.
    const INTERMEDIATE_BITS: i32 = 4;
    const SH: i32 = INTERMEDIATE_BITS + 4;
    const RND: i32 = 8 << INTERMEDIATE_BITS;

    let w = w as usize;
    let h = h as usize;
    let dst = dst_ptr as *mut u8;
    let inv_weight = 16 - weight;
    for row in 0..h {
        let tmp1_row = &tmp1[row * w..][..w];
        let tmp2_row = &tmp2[row * w..][..w];
        // SAFETY: the caller guarantees that each destination row holds `w` writable,
        // unaliased bytes at `dst + row * dst_stride`.
        let dst_row =
            unsafe { std::slice::from_raw_parts_mut(dst.offset(row as isize * dst_stride), w) };
        for ((px, &a), &b) in dst_row.iter_mut().zip(tmp1_row).zip(tmp2_row) {
            let val = (a as i32 * weight + b as i32 * inv_weight + RND) >> SH;
            *px = val.clamp(0, 255) as u8;
        }
    }
}
/// Blends two 16-bit intermediate buffers into 8-bit pixels using a per-pixel mask (AVX2).
///
/// The scalar tail computes `(tmp1 * m + tmp2 * (64 - m) + 512) >> 10`, clamped to
/// `0..=255`. The vector loop evaluates the rearranged form
/// `((tmp1 - tmp2) * m + (tmp2 << 6) + 512) >> 10`, where the subtraction is done in
/// 16 bits, so the two agree as long as `tmp1 - tmp2` fits in an `i16`.
///
/// * `dst`, `dst_stride` – output bytes; row `r` starts at byte `r * dst_stride`.
/// * `tmp1`, `tmp2`, `mask` – inputs, each read as `h` rows of `w` tightly packed values.
/// * `w`, `h` – block width and height in pixels.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn mask_8bpc_avx2_safe(
_token: Desktop64,
dst: &mut [u8],
dst_stride: usize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
mask: &[u8],
) {
let mut dst = dst.flex_mut();
let mask = mask.flex();
let w = w as usize;
let h = h as usize;
let rnd = _mm256_set1_epi32(512);
for row in 0..h {
let tmp1_row = &tmp1[row * w..][..w];
let tmp2_row = &tmp2[row * w..][..w];
let mask_row = &mask[row * w..][..w];
let dst_row = &mut dst[row * dst_stride..][..w];
let mut col = 0usize;
// 16 pixels per iteration.
while col + 16 <= w {
let t1_lo = loadu_256!(&tmp1_row[col..col + 16], [i16; 16]);
let t2_lo = loadu_256!(&tmp2_row[col..col + 16], [i16; 16]);
let mask_bytes = loadu_128!(&mask_row[col..col + 16], [u8; 16]);
// Zero-extend the 16 mask bytes to 16-bit lanes.
let mask_lo = _mm256_cvtepu8_epi16(mask_bytes);
let diff = _mm256_sub_epi16(t1_lo, t2_lo);
// Pixels 0..8, widened to 32-bit lanes.
let diff_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(diff));
let mask_lo_32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(mask_lo));
let t2_lo_32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(t2_lo));
let prod_lo = _mm256_mullo_epi32(diff_lo, mask_lo_32);
// tmp2 * 64
let t2_64_lo = _mm256_slli_epi32(t2_lo_32, 6);
let sum_lo = _mm256_add_epi32(_mm256_add_epi32(prod_lo, t2_64_lo), rnd);
// Pixels 8..16, same computation.
let diff_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(diff, 1));
let mask_hi_32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(mask_lo, 1));
let t2_hi_32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(t2_lo, 1));
let prod_hi = _mm256_mullo_epi32(diff_hi, mask_hi_32);
let t2_64_hi = _mm256_slli_epi32(t2_hi_32, 6);
let sum_hi = _mm256_add_epi32(_mm256_add_epi32(prod_hi, t2_64_hi), rnd);
let result_lo = _mm256_srai_epi32(sum_lo, 10);
let result_hi = _mm256_srai_epi32(sum_hi, 10);
// 32 -> 16 bit with signed saturation. The pack interleaves per 128-bit lane,
// so the permute puts the 16 pixels back in order.
let result_16 = _mm256_packs_epi32(result_lo, result_hi);
let result_16 = _mm256_permute4x64_epi64(result_16, 0b11011000);
// 16 -> 8 bit with unsigned saturation (the clamp to 0..=255). Packing the
// register with itself duplicates each lane's bytes; the permute gathers the
// 16 distinct pixels into the low 128 bits, which is all that gets stored.
let result_8 = _mm256_packus_epi16(result_16, result_16);
let result_8 = _mm256_permute4x64_epi64(result_8, 0b11011000);
storeu_128!(
&mut dst_row[col..col + 16],
[u8; 16],
_mm256_castsi256_si128(result_8)
);
col += 16;
}
// Scalar tail for whatever is left of the row.
while col < w {
let a = tmp1_row[col] as i32;
let b = tmp2_row[col] as i32;
let m = mask_row[col] as i32;
let val = (a * m + b * (64 - m) + 512) >> 10;
dst_row[col] = val.clamp(0, 255) as u8;
col += 1;
}
}
}
/// C-ABI entry point for the 8 bpc AVX2 `mask` kernel.
///
/// Wraps the raw destination pointer in a byte slice of `h * dst_stride` bytes and the
/// mask pointer in a slice of `w * h` bytes, then forwards to `mask_8bpc_avx2_safe`.
/// `_bitdepth_max` and `_dst` are not used.
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of `h * dst_stride` bytes, and nothing
///   else may access that memory during the call.
/// * `mask_ptr` must be valid for reads of `w * h` bytes.
/// * The CPU must support AVX2.
/// * `dst_stride` is cast to `usize`, so a negative stride would produce an enormous
///   slice length. NOTE(review): confirm callers never pass one.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn mask_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    mask_ptr: *const u8,
    _bitdepth_max: i32,
    _dst: *const FFISafe<PicOffset>,
) {
    let byte_len = h as usize * dst_stride as usize;
    // SAFETY: the caller guarantees that `dst_ptr` addresses `byte_len` writable bytes
    // that are not aliased for the duration of this call.
    let dst = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, byte_len) };
    let mask_len = (w * h) as usize;
    // SAFETY: the caller guarantees that `mask_ptr` addresses `mask_len` readable bytes.
    let mask = unsafe { std::slice::from_raw_parts(mask_ptr, mask_len) };
    let token = Desktop64::forge_token_dangerously();
    mask_8bpc_avx2_safe(token, dst, dst_stride as usize, tmp1, tmp2, w, h, mask)
}
/// Blends two 16-bit intermediate buffers into 8-bit pixels using a per-pixel mask
/// (AVX-512).
///
/// The scalar tail computes `(tmp1 * m + tmp2 * (64 - m) + 512) >> 10`, clamped to
/// `0..=255`. The vector loops evaluate the rearranged form
/// `((tmp1 - tmp2) * m + (tmp2 << 6) + 512) >> 10`, where the subtraction is done in
/// 16 bits, so the two agree as long as `tmp1 - tmp2` fits in an `i16`.
/// Rows are processed in chunks of 32 and 16 pixels, followed by the scalar tail.
///
/// * `dst`, `dst_stride` – output bytes; row `r` starts at byte `r * dst_stride`.
/// * `tmp1`, `tmp2`, `mask` – inputs, each read as `h` rows of `w` tightly packed values.
/// * `w`, `h` – block width and height in pixels.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn mask_8bpc_avx512_safe(
    _token: Server64,
    dst: &mut [u8],
    dst_stride: usize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    mask: &[u8],
) {
    let mut dst = dst.flex_mut();
    let mask = mask.flex();
    let w = w as usize;
    let h = h as usize;
    // Loop-invariant constants, built once instead of once per iteration.
    let rnd = _mm512_set1_epi32(512);
    let rnd_256 = _mm256_set1_epi32(512);
    let zero = _mm512_setzero_si512();
    // Undoes the per-128-bit-lane interleaving of `_mm512_packs_epi32`: the even 64-bit
    // quarters (from the first operand) first, then the odd ones (from the second).
    let perm_idx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
    for row in 0..h {
        let tmp1_row = &tmp1[row * w..][..w];
        let tmp2_row = &tmp2[row * w..][..w];
        let mask_row = &mask[row * w..][..w];
        let dst_row = &mut dst[row * dst_stride..][..w];
        let mut col = 0usize;
        // 32 pixels per iteration.
        while col + 32 <= w {
            let t1_full = loadu_512!(&tmp1_row[col..col + 32], [i16; 32]);
            let t2_full = loadu_512!(&tmp2_row[col..col + 32], [i16; 32]);
            let mask_bytes = loadu_256!(&mask_row[col..col + 32], [u8; 32]);
            let mask_16 = _mm512_cvtepu8_epi16(mask_bytes);
            let diff = _mm512_sub_epi16(t1_full, t2_full);
            // Pixels 0..16, widened to 32-bit lanes.
            let diff_lo = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(diff));
            let mask_lo = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(mask_16));
            let t2_lo = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(t2_full));
            let prod_lo = _mm512_mullo_epi32(diff_lo, mask_lo);
            let t2_64_lo = _mm512_slli_epi32(t2_lo, 6);
            let sum_lo = _mm512_add_epi32(_mm512_add_epi32(prod_lo, t2_64_lo), rnd);
            let result_lo = _mm512_srai_epi32::<10>(sum_lo);
            // Pixels 16..32, same computation.
            let diff_hi = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(diff));
            let mask_hi = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(mask_16));
            let t2_hi = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(t2_full));
            let prod_hi = _mm512_mullo_epi32(diff_hi, mask_hi);
            let t2_64_hi = _mm512_slli_epi32(t2_hi, 6);
            let sum_hi = _mm512_add_epi32(_mm512_add_epi32(prod_hi, t2_64_hi), rnd);
            let result_hi = _mm512_srai_epi32::<10>(sum_hi);
            let result_16 = _mm512_packs_epi32(result_lo, result_hi);
            let result_16 = _mm512_permutexvar_epi64(perm_idx, result_16);
            // The narrowing conversion saturates as unsigned, so negative results are
            // clamped to zero first.
            let result_8: __m256i = _mm512_cvtusepi16_epi8(_mm512_max_epi16(result_16, zero));
            storeu_256!(&mut dst_row[col..col + 32], [u8; 32], result_8);
            col += 32;
        }
        // 16 pixels per iteration, using 256-bit operations.
        while col + 16 <= w {
            let t1_lo = loadu_256!(&tmp1_row[col..col + 16], [i16; 16]);
            let t2_lo = loadu_256!(&tmp2_row[col..col + 16], [i16; 16]);
            let mask_bytes = loadu_128!(&mask_row[col..col + 16], [u8; 16]);
            let mask_lo = _mm256_cvtepu8_epi16(mask_bytes);
            let diff = _mm256_sub_epi16(t1_lo, t2_lo);
            let diff_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(diff));
            let mask_lo_32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(mask_lo));
            let t2_lo_32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(t2_lo));
            let prod_lo = _mm256_mullo_epi32(diff_lo, mask_lo_32);
            let t2_64_lo = _mm256_slli_epi32(t2_lo_32, 6);
            let sum_lo = _mm256_add_epi32(_mm256_add_epi32(prod_lo, t2_64_lo), rnd_256);
            let diff_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(diff, 1));
            let mask_hi_32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(mask_lo, 1));
            let t2_hi_32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(t2_lo, 1));
            let prod_hi = _mm256_mullo_epi32(diff_hi, mask_hi_32);
            let t2_64_hi = _mm256_slli_epi32(t2_hi_32, 6);
            let sum_hi = _mm256_add_epi32(_mm256_add_epi32(prod_hi, t2_64_hi), rnd_256);
            let result_lo = _mm256_srai_epi32(sum_lo, 10);
            let result_hi = _mm256_srai_epi32(sum_hi, 10);
            // 32 -> 16 bit; the pack interleaves per 128-bit lane, the permute reorders.
            let result_16 = _mm256_packs_epi32(result_lo, result_hi);
            let result_16 = _mm256_permute4x64_epi64(result_16, 0b11011000);
            // 16 -> 8 bit with unsigned saturation; the permute gathers the 16 distinct
            // pixels into the low 128 bits, which is all that gets stored.
            let result_8 = _mm256_packus_epi16(result_16, result_16);
            let result_8 = _mm256_permute4x64_epi64(result_8, 0b11011000);
            storeu_128!(
                &mut dst_row[col..col + 16],
                [u8; 16],
                _mm256_castsi256_si128(result_8)
            );
            col += 16;
        }
        // Scalar tail for whatever is left of the row.
        while col < w {
            let a = tmp1_row[col] as i32;
            let b = tmp2_row[col] as i32;
            let m = mask_row[col] as i32;
            let val = (a * m + b * (64 - m) + 512) >> 10;
            dst_row[col] = val.clamp(0, 255) as u8;
            col += 1;
        }
    }
}
#[cfg(target_arch = "x86_64")]
#[arcane]
fn mask_16bpc_avx512_safe(
_token: Server64,
dst: &mut [u8],
dst_stride: usize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
mask: &[u8],
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let mask = mask.flex();
let w = w as usize;
let h = h as usize;
let max = bitdepth_max as i32;
let intermediate_bits = if (bitdepth_max >> 11) != 0 {
2i32
} else {
4i32
};
let sh = intermediate_bits + 6;
let rnd = (32 << intermediate_bits) + 8192 * 64;
let rnd_vec = _mm512_set1_epi32(rnd);
let zero = _mm512_setzero_si512();
let max_vec = _mm512_set1_epi32(max);
let sixty_four = _mm512_set1_epi32(64);
for row in 0..h {
let tmp1_row = &tmp1[row * w..][..w];
let tmp2_row = &tmp2[row * w..][..w];
let mask_row = &mask[row * w..][..w];
let dst_row_bytes = &mut dst[row * dst_stride..][..w * 2];
let dst_row: &mut [u16] = zerocopy::FromBytes::mut_from_bytes(dst_row_bytes).unwrap();
let mut col = 0usize;
while col + 16 <= w {
let t1_16 = loadu_256!(&tmp1_row[col..col + 16], [i16; 16]);
let t2_16 = loadu_256!(&tmp2_row[col..col + 16], [i16; 16]);
let t1 = _mm512_cvtepi16_epi32(t1_16);
let t2 = _mm512_cvtepi16_epi32(t2_16);
let mask_128 = loadu_128!(&mask_row[col..col + 16], [u8; 16]);
let mask_32 = _mm512_cvtepu8_epi32(mask_128);
let inv_mask = _mm512_sub_epi32(sixty_four, mask_32);
let term1 = _mm512_mullo_epi32(t1, mask_32);
let term2 = _mm512_mullo_epi32(t2, inv_mask);
let sum = _mm512_add_epi32(_mm512_add_epi32(term1, term2), rnd_vec);
let result = if sh == 8 {
_mm512_srai_epi32::<8>(sum)
} else {
_mm512_srai_epi32::<10>(sum)
};
let clamped = _mm512_min_epi32(_mm512_max_epi32(result, zero), max_vec);
let packed = _mm512_cvtusepi32_epi16(clamped);
storeu_256!(&mut dst_row[col..col + 16], [u16; 16], packed);
col += 16;
}
while col + 8 <= w {
let t1_16 = loadu_128!(&tmp1_row[col..col + 8], [i16; 8]);
let t2_16 = loadu_128!(&tmp2_row[col..col + 8], [i16; 8]);
let t1 = _mm256_cvtepi16_epi32(t1_16);
let t2 = _mm256_cvtepi16_epi32(t2_16);
let mut mask_pad = [0u8; 16];
mask_pad[..8].copy_from_slice(&mask_row[col..col + 8]);
let mask_bytes = loadu_128!(&mask_pad);
let mask_32 = _mm256_cvtepu8_epi32(mask_bytes);
let inv_mask_256 = _mm256_sub_epi32(_mm256_set1_epi32(64), mask_32);
let term1_256 = _mm256_mullo_epi32(t1, mask_32);
let term2_256 = _mm256_mullo_epi32(t2, inv_mask_256);
let sum_256 = _mm256_add_epi32(
_mm256_add_epi32(term1_256, term2_256),
_mm256_set1_epi32(rnd),
);
let result_256 = if sh == 8 {
_mm256_srai_epi32(sum_256, 8)
} else {
_mm256_srai_epi32(sum_256, 10)
};
let clamped_256 = _mm256_min_epi32(
_mm256_max_epi32(result_256, _mm256_setzero_si256()),
_mm256_set1_epi32(max),
);
let packed = _mm256_packus_epi32(clamped_256, clamped_256);
let lo128 = _mm256_castsi256_si128(packed);
let hi128 = _mm256_extracti128_si256(packed, 1);
let result_128 = _mm_unpacklo_epi64(lo128, hi128);
storeu_128!(&mut dst_row[col..col + 8], [u16; 8], result_128);
col += 8;
}
while col < w {
let a = tmp1_row[col] as i32;
let b = tmp2_row[col] as i32;
let m = mask_row[col] as i32;
let val = (a * m + b * (64 - m) + rnd) >> sh;
dst_row[col] = val.clamp(0, max) as u16;
col += 1;
}
}
}
/// Masked blend of two 16 bpc compound intermediates into `dst` (AVX2, 8 pixels per step,
/// scalar remainder).
///
/// Each output pixel is
/// `clamp((tmp1 * m + tmp2 * (64 - m) + rnd) >> sh, 0, bitdepth_max)` with
/// `sh = intermediate_bits + 6` and `rnd = (32 << intermediate_bits) + 8192 * 64`;
/// `intermediate_bits` is 2 when `bitdepth_max >> 11` is non-zero and 4 otherwise.
///
/// `dst_stride` is in bytes; `tmp1`, `tmp2` and `mask` are read as `h` tightly packed rows
/// of `w` entries.
///
/// # Panics
///
/// Panics if the zerocopy conversion of a destination row to `[u16]` fails.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn mask_16bpc_avx2_safe(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    mask: &[u8],
    bitdepth_max: i32,
) {
    let mut out = dst.flex_mut();
    let weights = mask.flex();
    let width = w as usize;
    let height = h as usize;
    let pixel_max = bitdepth_max as i32;
    let intermediate_bits = match bitdepth_max >> 11 {
        0 => 4i32,
        _ => 2i32,
    };
    let shift = intermediate_bits + 6;
    let round = (32 << intermediate_bits) + 8192 * 64;

    let v_round = _mm256_set1_epi32(round);
    let v_zero = _mm256_setzero_si256();
    let v_max = _mm256_set1_epi32(pixel_max);
    let v_64 = _mm256_set1_epi32(64);

    for y in 0..height {
        let a_row = &tmp1[y * width..][..width];
        let b_row = &tmp2[y * width..][..width];
        let m_row = &weights[y * width..][..width];
        let out_bytes = &mut out[y * dst_stride..][..width * 2];
        let out_row: &mut [u16] = zerocopy::FromBytes::mut_from_bytes(out_bytes).unwrap();

        let mut x = 0usize;
        while x + 8 <= width {
            let a = _mm256_cvtepi16_epi32(loadu_128!(&a_row[x..x + 8], [i16; 8]));
            let b = _mm256_cvtepi16_epi32(loadu_128!(&b_row[x..x + 8], [i16; 8]));
            // Stage the 8 mask bytes in a zeroed 16-byte buffer for a full 128-bit load.
            let mut padded = [0u8; 16];
            padded[..8].copy_from_slice(&m_row[x..x + 8]);
            let m = _mm256_cvtepu8_epi32(loadu_128!(&padded));

            let weighted_a = _mm256_mullo_epi32(a, m);
            let weighted_b = _mm256_mullo_epi32(b, _mm256_sub_epi32(v_64, m));
            let biased = _mm256_add_epi32(_mm256_add_epi32(weighted_a, weighted_b), v_round);
            // The shift count has to be an immediate, so pick between the two possible values.
            let scaled = match shift {
                8 => _mm256_srai_epi32(biased, 8),
                _ => _mm256_srai_epi32(biased, 10),
            };
            let bounded = _mm256_min_epi32(_mm256_max_epi32(scaled, v_zero), v_max);

            // packus is lane-local: join the low 64 bits of both lanes to restore pixel order.
            let narrowed = _mm256_packus_epi32(bounded, bounded);
            let pixels = _mm_unpacklo_epi64(
                _mm256_castsi256_si128(narrowed),
                _mm256_extracti128_si256(narrowed, 1),
            );
            storeu_128!(&mut out_row[x..x + 8], [u16; 8], pixels);
            x += 8;
        }

        // Scalar remainder for widths that are not a multiple of 8.
        for i in x..width {
            let a = a_row[i] as i32;
            let b = b_row[i] as i32;
            let m = m_row[i] as i32;
            let blended = (a * m + b * (64 - m) + round) >> shift;
            out_row[i] = blended.clamp(0, pixel_max) as u16;
        }
    }
}
/// C-ABI entry point for the 16 bpc masked blend; wraps the raw pointers in slices and
/// forwards to [`mask_16bpc_avx2_safe`].
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of `h * dst_stride` bytes.
/// * `mask_ptr` must be valid for reads of `w * h` bytes.
/// * The CPU must support whatever the forged `Desktop64` token stands for.
///
/// NOTE(review): a negative `dst_stride` would wrap when cast to `usize` — confirm that
/// callers never pass one.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn mask_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    mask_ptr: *const u8,
    bitdepth_max: i32,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_bytes = dst_stride as usize;
    let dst_len = h as usize * stride_bytes;
    // SAFETY: the caller guarantees `dst_ptr` covers `h * dst_stride` bytes.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    let mask_len = (w * h) as usize;
    // SAFETY: the caller guarantees `mask_ptr` covers `w * h` bytes.
    let mask_bytes = unsafe { std::slice::from_raw_parts(mask_ptr, mask_len) };
    let token = Desktop64::forge_token_dangerously();
    mask_16bpc_avx2_safe(
        token,
        dst_bytes,
        stride_bytes,
        tmp1,
        tmp2,
        w,
        h,
        mask_bytes,
        bitdepth_max,
    )
}
/// Scalar masked blend producing 8-bit pixels.
///
/// Each output byte is `clamp((tmp1 * m + tmp2 * (64 - m) + 512) >> 10, 0, 255)`.
/// `tmp1`, `tmp2` and the mask are read as `h` tightly packed rows of `w` entries;
/// destination rows are `dst_stride` bytes apart. `_bitdepth_max` is ignored.
///
/// # Safety
///
/// * `mask_ptr` must be valid for reads of `w * h` bytes.
/// * For every row `r < h`, `dst_ptr + r * dst_stride` must be valid for writes of `w` bytes.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
pub unsafe extern "C" fn mask_scalar(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    mask_ptr: *const u8,
    _bitdepth_max: i32,
    _dst: *const FFISafe<PicOffset>,
) {
    const INTERMEDIATE_BITS: i32 = 4;
    const SHIFT: i32 = INTERMEDIATE_BITS + 6;
    // The bias term of the rounding constant is zero here, leaving only the half-step.
    const ROUND: i32 = 32 << INTERMEDIATE_BITS;

    let width = w as usize;
    let height = h as usize;
    let dst_base = dst_ptr as *mut u8;

    for y in 0..height {
        let a_row = &tmp1[y * width..][..width];
        let b_row = &tmp2[y * width..][..width];
        // SAFETY: the caller guarantees `w * h` readable mask bytes.
        let m_row = unsafe { std::slice::from_raw_parts(mask_ptr.add(y * width), width) };
        // SAFETY: the caller guarantees `w` writable bytes at the start of every row.
        let out_row = unsafe {
            std::slice::from_raw_parts_mut(dst_base.offset(y as isize * dst_stride), width)
        };

        for (((px, &a), &b), &m) in out_row.iter_mut().zip(a_row).zip(b_row).zip(m_row) {
            let (a, b, m) = (a as i32, b as i32, m as i32);
            let blended = (a * m + b * (64 - m) + ROUND) >> SHIFT;
            *px = blended.clamp(0, 255) as u8;
        }
    }
}
#[cfg(target_arch = "x86_64")]
use crate::src::internal::{SCRATCH_INTER_INTRA_BUF_LEN, SCRATCH_LAP_LEN};
/// Blends `tmp` into `dst` for 8-bit pixels with a per-pixel mask (AVX2, 16 pixels per
/// step, scalar remainder).
///
/// Each pixel becomes `(dst * (64 - m) + tmp * m + 32) >> 6`. `dst_stride` is in bytes;
/// `tmp` and `mask` are read as `h` tightly packed rows of `w` bytes.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn blend_8bpc_avx2_safe(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    tmp: &[u8],
    w: i32,
    h: i32,
    mask: &[u8],
) {
    let mut out = dst.flex_mut();
    let overlay = tmp.flex();
    let weights = mask.flex();
    let width = w as usize;
    let height = h as usize;
    let v_64 = _mm256_set1_epi16(64);
    let v_round = _mm256_set1_epi16(32);

    for y in 0..height {
        let out_row = &mut out[y * dst_stride..][..width];
        let ovl_row = &overlay[y * width..][..width];
        let m_row = &weights[y * width..][..width];

        let mut x = 0usize;
        while x + 16 <= width {
            // Widen all three inputs from bytes to 16-bit lanes.
            let base = _mm256_cvtepu8_epi16(loadu_128!(&out_row[x..x + 16], [u8; 16]));
            let over = _mm256_cvtepu8_epi16(loadu_128!(&ovl_row[x..x + 16], [u8; 16]));
            let m = _mm256_cvtepu8_epi16(loadu_128!(&m_row[x..x + 16], [u8; 16]));

            let kept = _mm256_mullo_epi16(base, _mm256_sub_epi16(v_64, m));
            let added = _mm256_mullo_epi16(over, m);
            let total = _mm256_add_epi16(_mm256_add_epi16(kept, added), v_round);
            let blended = _mm256_srli_epi16(total, 6);

            // packus is lane-local; the 64-bit permute gathers both halves into the low
            // 128 bits in pixel order.
            let bytes = _mm256_permute4x64_epi64(_mm256_packus_epi16(blended, blended), 0b11011000);
            storeu_128!(
                &mut out_row[x..x + 16],
                [u8; 16],
                _mm256_castsi256_si128(bytes)
            );
            x += 16;
        }

        // Scalar remainder.
        for i in x..width {
            let base = out_row[i] as u32;
            let over = ovl_row[i] as u32;
            let m = m_row[i] as u32;
            out_row[i] = ((base * (64 - m) + over * m + 32) >> 6) as u8;
        }
    }
}
/// C-ABI entry point for the 8 bpc masked blend; wraps the raw pointers in slices and
/// forwards to [`blend_8bpc_avx2_safe`].
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of `h * dst_stride` bytes.
/// * `tmp` and `mask_ptr` must each be valid for reads of `w * h` bytes.
/// * The CPU must support whatever the forged `Desktop64` token stands for.
///
/// NOTE(review): a negative `dst_stride` would wrap when cast to `usize` — confirm that
/// callers never pass one.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn blend_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    tmp: *const [DynPixel; SCRATCH_INTER_INTRA_BUF_LEN],
    w: i32,
    h: i32,
    mask_ptr: *const u8,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_bytes = dst_stride as usize;
    let dst_len = h as usize * stride_bytes;
    // SAFETY: the caller guarantees `dst_ptr` covers `h * dst_stride` bytes.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    // SAFETY: the caller guarantees `tmp` covers `w * h` bytes.
    let tmp_bytes = unsafe { std::slice::from_raw_parts(tmp as *const u8, (w * h) as usize) };
    // SAFETY: the caller guarantees `mask_ptr` covers `w * h` bytes.
    let mask_bytes = unsafe { std::slice::from_raw_parts(mask_ptr, (w * h) as usize) };
    let token = Desktop64::forge_token_dangerously();
    blend_8bpc_avx2_safe(token, dst_bytes, stride_bytes, tmp_bytes, w, h, mask_bytes)
}
#[cfg(target_arch = "x86_64")]
#[arcane]
fn blend_16bpc_avx2_safe(
_token: Desktop64,
dst: &mut [u8],
dst_stride: usize,
tmp: &[u8],
w: i32,
h: i32,
mask: &[u8],
) {
let mut dst = dst.flex_mut();
let tmp = tmp.flex();
let mask = mask.flex();
let w = w as usize;
let h = h as usize;
let rnd = _mm256_set1_epi32(32);
let sixty_four = _mm256_set1_epi32(64);
for row in 0..h {
let dst_row_bytes = &mut dst[row * dst_stride..][..w * 2];
let dst_row: &mut [u16] = zerocopy::FromBytes::mut_from_bytes(dst_row_bytes).unwrap();
let tmp_row_bytes = &tmp[row * w * 2..][..w * 2];
let tmp_row: &[u16] =
<[u16] as zerocopy::FromBytes>::ref_from_bytes(tmp_row_bytes).unwrap();
let mask_row = &mask[row * w..][..w];
let mut col = 0usize;
while col + 8 <= w {
let dst_16 = loadu_128!(&dst_row[col..col + 8], [u16; 8]);
let tmp_16 = loadu_128!(&tmp_row[col..col + 8], [u16; 8]);
let dst_32 = _mm256_cvtepu16_epi32(dst_16);
let tmp_32 = _mm256_cvtepu16_epi32(tmp_16);
let mut mask_pad = [0u8; 16];
mask_pad[..8].copy_from_slice(&mask_row[col..col + 8]);
let mask_bytes = loadu_128!(&mask_pad);
let mask_32 = _mm256_cvtepu8_epi32(mask_bytes);
let inv_mask = _mm256_sub_epi32(sixty_four, mask_32);
let term1 = _mm256_mullo_epi32(dst_32, inv_mask);
let term2 = _mm256_mullo_epi32(tmp_32, mask_32);
let sum = _mm256_add_epi32(_mm256_add_epi32(term1, term2), rnd);
let result = _mm256_srli_epi32(sum, 6);
let packed = _mm256_packus_epi32(result, result);
let lo128 = _mm256_castsi256_si128(packed);
let hi128 = _mm256_extracti128_si256(packed, 1);
let result_128 = _mm_unpacklo_epi64(lo128, hi128);
storeu_128!(&mut dst_row[col..col + 8], [u16; 8], result_128);
col += 8;
}
while col < w {
let a = dst_row[col] as u32;
let b = tmp_row[col] as u32;
let m = mask_row[col] as u32;
let val = (a * (64 - m) + b * m + 32) >> 6;
dst_row[col] = val as u16;
col += 1;
}
}
}
/// C-ABI entry point for the 16 bpc masked blend; wraps the raw pointers in byte slices
/// and forwards to [`blend_16bpc_avx2_safe`].
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of `h * dst_stride` bytes.
/// * `tmp` must be valid for reads of `w * h * 2` bytes.
/// * `mask_ptr` must be valid for reads of `w * h` bytes.
/// * The CPU must support whatever the forged `Desktop64` token stands for.
///
/// NOTE(review): a negative `dst_stride` would wrap when cast to `usize` — confirm that
/// callers never pass one.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn blend_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    tmp: *const [DynPixel; SCRATCH_INTER_INTRA_BUF_LEN],
    w: i32,
    h: i32,
    mask_ptr: *const u8,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_bytes = dst_stride as usize;
    let dst_len = h as usize * stride_bytes;
    // SAFETY: the caller guarantees `dst_ptr` covers `h * dst_stride` bytes.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    // SAFETY: the caller guarantees `tmp` covers `w * h` 16-bit pixels.
    let tmp_bytes =
        unsafe { std::slice::from_raw_parts(tmp as *const u8, (w * h) as usize * 2) };
    // SAFETY: the caller guarantees `mask_ptr` covers `w * h` bytes.
    let mask_bytes = unsafe { std::slice::from_raw_parts(mask_ptr, (w * h) as usize) };
    let token = Desktop64::forge_token_dangerously();
    blend_16bpc_avx2_safe(token, dst_bytes, stride_bytes, tmp_bytes, w, h, mask_bytes)
}
use crate::src::tables::dav1d_mc_subpel_filters;
#[cfg(target_arch = "x86_64")]
use crate::src::tables::dav1d_obmc_masks;
/// Column-weighted blend of `tmp` into `dst` for 8-bit pixels (AVX2, 16 pixels per step,
/// scalar remainder).
///
/// The weight of column `c` is `dav1d_obmc_masks[w + c]`, and only the first
/// `w * 3 >> 2` columns of every row are touched. Each of those pixels becomes
/// `(dst * (64 - m) + tmp * m + 32) >> 6`. `dst_stride` is in bytes; `tmp` rows are `w`
/// bytes apart.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn blend_v_8bpc_avx2_safe(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    tmp: &[u8],
    w: i32,
    h: i32,
) {
    let mut out = dst.flex_mut();
    let overlay = tmp.flex();
    let width = w as usize;
    let height = h as usize;
    let col_weights = &dav1d_obmc_masks[width..];
    let blend_cols = (width * 3) >> 2;
    let v_round = _mm256_set1_epi16(32);
    let v_64 = _mm256_set1_epi16(64);

    for y in 0..height {
        let out_row = &mut out[y * dst_stride..][..blend_cols];
        let ovl_row = &overlay[y * width..][..blend_cols];

        let mut x = 0usize;
        while x + 16 <= blend_cols {
            let base = _mm256_cvtepu8_epi16(loadu_128!(&out_row[x..x + 16], [u8; 16]));
            let over = _mm256_cvtepu8_epi16(loadu_128!(&ovl_row[x..x + 16], [u8; 16]));
            let m = _mm256_cvtepu8_epi16(loadu_128!(&col_weights[x..x + 16], [u8; 16]));

            let kept = _mm256_mullo_epi16(base, _mm256_sub_epi16(v_64, m));
            let added = _mm256_mullo_epi16(over, m);
            let total = _mm256_add_epi16(_mm256_add_epi16(kept, added), v_round);
            let blended = _mm256_srli_epi16(total, 6);

            // packus is lane-local; the 64-bit permute gathers both halves into the low
            // 128 bits in pixel order.
            let bytes = _mm256_permute4x64_epi64(_mm256_packus_epi16(blended, blended), 0b11011000);
            storeu_128!(
                &mut out_row[x..x + 16],
                [u8; 16],
                _mm256_castsi256_si128(bytes)
            );
            x += 16;
        }

        // Scalar remainder.
        for i in x..blend_cols {
            let base = out_row[i] as u32;
            let over = ovl_row[i] as u32;
            let m = col_weights[i] as u32;
            out_row[i] = ((base * (64 - m) + over * m + 32) >> 6) as u8;
        }
    }
}
/// C-ABI entry point for the 8 bpc column-weighted blend; wraps the raw pointers in
/// slices and forwards to [`blend_v_8bpc_avx2_safe`].
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of `h * dst_stride` bytes.
/// * `tmp` must be valid for reads of `w * h` bytes.
/// * The CPU must support whatever the forged `Desktop64` token stands for.
///
/// NOTE(review): a negative `dst_stride` would wrap when cast to `usize` — confirm that
/// callers never pass one.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn blend_v_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    tmp: *const [DynPixel; SCRATCH_LAP_LEN],
    w: i32,
    h: i32,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_bytes = dst_stride as usize;
    let dst_len = h as usize * stride_bytes;
    // SAFETY: the caller guarantees `dst_ptr` covers `h * dst_stride` bytes.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    // SAFETY: the caller guarantees `tmp` covers `w * h` bytes.
    let tmp_bytes = unsafe { std::slice::from_raw_parts(tmp as *const u8, (w * h) as usize) };
    let token = Desktop64::forge_token_dangerously();
    blend_v_8bpc_avx2_safe(token, dst_bytes, stride_bytes, tmp_bytes, w, h)
}
/// Row-weighted blend of `tmp` into `dst` for 8-bit pixels (AVX2, 16 pixels per step,
/// scalar remainder).
///
/// The weight of row `r` is `dav1d_obmc_masks[h + r]`, and only the first `h * 3 >> 2`
/// rows are touched. Each of those pixels becomes `(dst * (64 - m) + tmp * m + 32) >> 6`.
/// `dst_stride` is in bytes; `tmp` rows are `w` bytes apart.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn blend_h_8bpc_avx2_safe(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    tmp: &[u8],
    w: i32,
    h: i32,
) {
    let mut out = dst.flex_mut();
    let overlay = tmp.flex();
    let width = w as usize;
    let height = h as usize;
    let row_weights = &dav1d_obmc_masks[height..];
    let blend_rows = (height * 3) >> 2;
    let v_round = _mm256_set1_epi16(32);
    let v_64 = _mm256_set1_epi16(64);

    for y in 0..blend_rows {
        let out_row = &mut out[y * dst_stride..][..width];
        let ovl_row = &overlay[y * width..][..width];
        // One weight per row, broadcast across all lanes.
        let weight = row_weights[y];
        let v_m = _mm256_set1_epi16(weight as i16);
        let v_inv_m = _mm256_sub_epi16(v_64, v_m);

        let mut x = 0usize;
        while x + 16 <= width {
            let base = _mm256_cvtepu8_epi16(loadu_128!(&out_row[x..x + 16], [u8; 16]));
            let over = _mm256_cvtepu8_epi16(loadu_128!(&ovl_row[x..x + 16], [u8; 16]));

            let kept = _mm256_mullo_epi16(base, v_inv_m);
            let added = _mm256_mullo_epi16(over, v_m);
            let total = _mm256_add_epi16(_mm256_add_epi16(kept, added), v_round);
            let blended = _mm256_srli_epi16(total, 6);

            // packus is lane-local; the 64-bit permute gathers both halves into the low
            // 128 bits in pixel order.
            let bytes = _mm256_permute4x64_epi64(_mm256_packus_epi16(blended, blended), 0b11011000);
            storeu_128!(
                &mut out_row[x..x + 16],
                [u8; 16],
                _mm256_castsi256_si128(bytes)
            );
            x += 16;
        }

        // Scalar remainder.
        for i in x..width {
            let base = out_row[i] as u32;
            let over = ovl_row[i] as u32;
            let blended = (base * (64 - weight as u32) + over * weight as u32 + 32) >> 6;
            out_row[i] = blended as u8;
        }
    }
}
/// C-ABI entry point for the 8 bpc row-weighted blend; wraps the raw pointers in slices
/// and forwards to [`blend_h_8bpc_avx2_safe`].
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of `h * dst_stride` bytes.
/// * `tmp` must be valid for reads of `w * h` bytes.
/// * The CPU must support whatever the forged `Desktop64` token stands for.
///
/// NOTE(review): a negative `dst_stride` would wrap when cast to `usize` — confirm that
/// callers never pass one.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn blend_h_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    tmp: *const [DynPixel; SCRATCH_LAP_LEN],
    w: i32,
    h: i32,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_bytes = dst_stride as usize;
    let dst_len = h as usize * stride_bytes;
    // SAFETY: the caller guarantees `dst_ptr` covers `h * dst_stride` bytes.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    // SAFETY: the caller guarantees `tmp` covers `w * h` bytes.
    let tmp_bytes = unsafe { std::slice::from_raw_parts(tmp as *const u8, (w * h) as usize) };
    let token = Desktop64::forge_token_dangerously();
    blend_h_8bpc_avx2_safe(token, dst_bytes, stride_bytes, tmp_bytes, w, h)
}
/// Column-weighted blend of `tmp` into `dst` for 16-bit pixels (AVX2, 8 pixels per step,
/// scalar remainder).
///
/// The weight of column `c` is `dav1d_obmc_masks[w + c]`, and only the first
/// `w * 3 >> 2` columns of every row are touched. Each of those pixels becomes
/// `(dst * (64 - m) + tmp * m + 32) >> 6`. Both byte slices are reinterpreted as `u16`
/// rows: `dst` rows are `dst_stride` bytes apart, `tmp` rows are `w * 2` bytes apart.
///
/// # Panics
///
/// Panics if the zerocopy conversion of a `dst` or `tmp` row to `[u16]` fails.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn blend_v_16bpc_avx2_safe(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    tmp: &[u8],
    w: i32,
    h: i32,
) {
    let mut out = dst.flex_mut();
    let overlay = tmp.flex();
    let width = w as usize;
    let height = h as usize;
    let col_weights = &dav1d_obmc_masks[width..];
    let blend_cols = (width * 3) >> 2;
    let v_round = _mm256_set1_epi32(32);
    let v_64 = _mm256_set1_epi32(64);

    for y in 0..height {
        let out_bytes = &mut out[y * dst_stride..][..blend_cols * 2];
        let out_row: &mut [u16] = zerocopy::FromBytes::mut_from_bytes(out_bytes).unwrap();
        let ovl_bytes = &overlay[y * width * 2..][..blend_cols * 2];
        let ovl_row: &[u16] = <[u16] as zerocopy::FromBytes>::ref_from_bytes(ovl_bytes).unwrap();

        let mut x = 0usize;
        while x + 8 <= blend_cols {
            let base = _mm256_cvtepu16_epi32(loadu_128!(&out_row[x..x + 8], [u16; 8]));
            let over = _mm256_cvtepu16_epi32(loadu_128!(&ovl_row[x..x + 8], [u16; 8]));
            // Stage the 8 weight bytes in a zeroed 16-byte buffer for a full 128-bit load.
            let mut padded = [0u8; 16];
            padded[..8].copy_from_slice(&col_weights[x..x + 8]);
            let m = _mm256_cvtepu8_epi32(loadu_128!(&padded));

            let kept = _mm256_mullo_epi32(base, _mm256_sub_epi32(v_64, m));
            let added = _mm256_mullo_epi32(over, m);
            let total = _mm256_add_epi32(_mm256_add_epi32(kept, added), v_round);
            let blended = _mm256_srli_epi32(total, 6);

            // packus is lane-local: join the low 64 bits of both lanes to restore pixel order.
            let narrowed = _mm256_packus_epi32(blended, blended);
            let pixels = _mm_unpacklo_epi64(
                _mm256_castsi256_si128(narrowed),
                _mm256_extracti128_si256(narrowed, 1),
            );
            storeu_128!(&mut out_row[x..x + 8], [u16; 8], pixels);
            x += 8;
        }

        // Scalar remainder.
        for i in x..blend_cols {
            let base = out_row[i] as u32;
            let over = ovl_row[i] as u32;
            let m = col_weights[i] as u32;
            out_row[i] = ((base * (64 - m) + over * m + 32) >> 6) as u16;
        }
    }
}
/// C-ABI entry point for the 16 bpc column-weighted blend; wraps the raw pointers in
/// byte slices and forwards to [`blend_v_16bpc_avx2_safe`].
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of `h * dst_stride` bytes.
/// * `tmp` must be valid for reads of `w * h * 2` bytes.
/// * The CPU must support whatever the forged `Desktop64` token stands for.
///
/// NOTE(review): a negative `dst_stride` would wrap when cast to `usize` — confirm that
/// callers never pass one.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn blend_v_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    tmp: *const [DynPixel; SCRATCH_LAP_LEN],
    w: i32,
    h: i32,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_bytes = dst_stride as usize;
    let dst_len = h as usize * stride_bytes;
    // SAFETY: the caller guarantees `dst_ptr` covers `h * dst_stride` bytes.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    // SAFETY: the caller guarantees `tmp` covers `w * h` 16-bit pixels.
    let tmp_bytes =
        unsafe { std::slice::from_raw_parts(tmp as *const u8, (w * h) as usize * 2) };
    let token = Desktop64::forge_token_dangerously();
    blend_v_16bpc_avx2_safe(token, dst_bytes, stride_bytes, tmp_bytes, w, h)
}
/// Row-weighted blend of `tmp` into `dst` for 16-bit pixels (AVX2, 8 pixels per step,
/// scalar remainder).
///
/// The weight of row `r` is `dav1d_obmc_masks[h + r]`, and only the first `h * 3 >> 2`
/// rows are touched. Each of those pixels becomes `(dst * (64 - m) + tmp * m + 32) >> 6`.
/// Both byte slices are reinterpreted as `u16` rows: `dst` rows are `dst_stride` bytes
/// apart, `tmp` rows are `w * 2` bytes apart.
///
/// # Panics
///
/// Panics if the zerocopy conversion of a `dst` or `tmp` row to `[u16]` fails.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn blend_h_16bpc_avx2_safe(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    tmp: &[u8],
    w: i32,
    h: i32,
) {
    let mut out = dst.flex_mut();
    let overlay = tmp.flex();
    let width = w as usize;
    let height = h as usize;
    let row_weights = &dav1d_obmc_masks[height..];
    let blend_rows = (height * 3) >> 2;
    let v_round = _mm256_set1_epi32(32);
    let v_64 = _mm256_set1_epi32(64);

    for y in 0..blend_rows {
        let out_bytes = &mut out[y * dst_stride..][..width * 2];
        let out_row: &mut [u16] = zerocopy::FromBytes::mut_from_bytes(out_bytes).unwrap();
        let ovl_bytes = &overlay[y * width * 2..][..width * 2];
        let ovl_row: &[u16] = <[u16] as zerocopy::FromBytes>::ref_from_bytes(ovl_bytes).unwrap();
        // One weight per row, broadcast across all lanes.
        let weight = row_weights[y] as u32;
        let v_m = _mm256_set1_epi32(weight as i32);
        let v_inv_m = _mm256_sub_epi32(v_64, v_m);

        let mut x = 0usize;
        while x + 8 <= width {
            let base = _mm256_cvtepu16_epi32(loadu_128!(&out_row[x..x + 8], [u16; 8]));
            let over = _mm256_cvtepu16_epi32(loadu_128!(&ovl_row[x..x + 8], [u16; 8]));

            let kept = _mm256_mullo_epi32(base, v_inv_m);
            let added = _mm256_mullo_epi32(over, v_m);
            let total = _mm256_add_epi32(_mm256_add_epi32(kept, added), v_round);
            let blended = _mm256_srli_epi32(total, 6);

            // packus is lane-local: join the low 64 bits of both lanes to restore pixel order.
            let narrowed = _mm256_packus_epi32(blended, blended);
            let pixels = _mm_unpacklo_epi64(
                _mm256_castsi256_si128(narrowed),
                _mm256_extracti128_si256(narrowed, 1),
            );
            storeu_128!(&mut out_row[x..x + 8], [u16; 8], pixels);
            x += 8;
        }

        // Scalar remainder.
        for i in x..width {
            let base = out_row[i] as u32;
            let over = ovl_row[i] as u32;
            out_row[i] = ((base * (64 - weight) + over * weight + 32) >> 6) as u16;
        }
    }
}
/// C-ABI entry point for the 16 bpc row-weighted blend; wraps the raw pointers in byte
/// slices and forwards to [`blend_h_16bpc_avx2_safe`].
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of `h * dst_stride` bytes.
/// * `tmp` must be valid for reads of `w * h * 2` bytes.
/// * The CPU must support whatever the forged `Desktop64` token stands for.
///
/// NOTE(review): a negative `dst_stride` would wrap when cast to `usize` — confirm that
/// callers never pass one.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn blend_h_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    tmp: *const [DynPixel; SCRATCH_LAP_LEN],
    w: i32,
    h: i32,
    _dst: *const FFISafe<PicOffset>,
) {
    let stride_bytes = dst_stride as usize;
    let dst_len = h as usize * stride_bytes;
    // SAFETY: the caller guarantees `dst_ptr` covers `h * dst_stride` bytes.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    // SAFETY: the caller guarantees `tmp` covers `w * h` 16-bit pixels.
    let tmp_bytes =
        unsafe { std::slice::from_raw_parts(tmp as *const u8, (w * h) as usize * 2) };
    let token = Desktop64::forge_token_dangerously();
    blend_h_16bpc_avx2_safe(token, dst_bytes, stride_bytes, tmp_bytes, w, h)
}
/// Number of elements in one row of the `mid` scratch buffers
/// (`[[i16; MID_STRIDE]; N]` and the `i32` equivalents).
const MID_STRIDE: usize = 128;
/// Looks up the 8-tap sub-pixel filter for fractional position `m`.
///
/// Returns `None` when `m` is 0. Otherwise row `m - 1` is taken from
/// `dav1d_mc_subpel_filters[filter_idx]` when `d > 4`, or from table
/// `3 + (filter_idx & 1)` when `d <= 4`.
#[inline]
fn get_filter(m: usize, d: usize, filter_idx: usize) -> Option<&'static [i8; 8]> {
    match m {
        0 => None,
        _ => {
            let table = match d {
                0..=4 => 3 + (filter_idx & 1),
                _ => filter_idx,
            };
            Some(&dav1d_mc_subpel_filters[table][m - 1])
        }
    }
}
#[cfg(target_arch = "x86_64")]
#[rite]
fn h_filter_8tap_8bpc_avx2_inner(
_token: Desktop64,
dst: &mut [i16],
src: &[u8],
w: usize,
filter: &[i8; 8],
sh: u8,
) {
let mut dst = dst.flex_mut();
let src = src.flex();
let coeff_01 = _mm256_set1_epi16(((filter[1] as u8 as i16) << 8) | (filter[0] as u8 as i16));
let coeff_23 = _mm256_set1_epi16(((filter[3] as u8 as i16) << 8) | (filter[2] as u8 as i16));
let coeff_45 = _mm256_set1_epi16(((filter[5] as u8 as i16) << 8) | (filter[4] as u8 as i16));
let coeff_67 = _mm256_set1_epi16(((filter[7] as u8 as i16) << 8) | (filter[6] as u8 as i16));
let rnd = _mm256_set1_epi16((1i16 << sh) >> 1);
let mut col = 0usize;
while col + 16 <= w {
let src_0_15 = loadu_128!(<&[u8; 16]>::try_from(&src[col..col + 16]).unwrap());
let src_1_16 = loadu_128!(<&[u8; 16]>::try_from(&src[col + 1..col + 17]).unwrap());
let src_2_17 = loadu_128!(<&[u8; 16]>::try_from(&src[col + 2..col + 18]).unwrap());
let src_3_18 = loadu_128!(<&[u8; 16]>::try_from(&src[col + 3..col + 19]).unwrap());
let src_4_19 = loadu_128!(<&[u8; 16]>::try_from(&src[col + 4..col + 20]).unwrap());
let src_5_20 = loadu_128!(<&[u8; 16]>::try_from(&src[col + 5..col + 21]).unwrap());
let src_6_21 = loadu_128!(<&[u8; 16]>::try_from(&src[col + 6..col + 22]).unwrap());
let src_7_22 = loadu_128!(<&[u8; 16]>::try_from(&src[col + 7..col + 23]).unwrap());
let p01_lo = _mm_unpacklo_epi8(src_0_15, src_1_16);
let p01_hi = _mm_unpackhi_epi8(src_0_15, src_1_16);
let p01 = _mm256_set_m128i(p01_hi, p01_lo);
let p23_lo = _mm_unpacklo_epi8(src_2_17, src_3_18);
let p23_hi = _mm_unpackhi_epi8(src_2_17, src_3_18);
let p23 = _mm256_set_m128i(p23_hi, p23_lo);
let p45_lo = _mm_unpacklo_epi8(src_4_19, src_5_20);
let p45_hi = _mm_unpackhi_epi8(src_4_19, src_5_20);
let p45 = _mm256_set_m128i(p45_hi, p45_lo);
let p67_lo = _mm_unpacklo_epi8(src_6_21, src_7_22);
let p67_hi = _mm_unpackhi_epi8(src_6_21, src_7_22);
let p67 = _mm256_set_m128i(p67_hi, p67_lo);
let ma01 = _mm256_maddubs_epi16(p01, coeff_01);
let ma23 = _mm256_maddubs_epi16(p23, coeff_23);
let ma45 = _mm256_maddubs_epi16(p45, coeff_45);
let ma67 = _mm256_maddubs_epi16(p67, coeff_67);
let mut sum = _mm256_add_epi16(ma01, ma23);
sum = _mm256_add_epi16(sum, ma45);
sum = _mm256_add_epi16(sum, ma67);
let shift_count = _mm_cvtsi32_si128(sh as i32);
let result = _mm256_sra_epi16(_mm256_add_epi16(sum, rnd), shift_count);
storeu_256!(
<&mut [i16; 16]>::try_from(&mut dst[col..col + 16]).unwrap(),
result
);
col += 16;
}
while col < w {
let mut sum = 0i32;
for i in 0..8 {
sum += filter[i] as i32 * src[col + i] as i32;
}
dst[col] = ((sum + ((1 << sh) >> 1)) >> sh) as i16;
col += 1;
}
}
/// Horizontal 8-tap filter over 8-bit pixels producing 16-bit intermediates (AVX-512,
/// 32 outputs per step, scalar remainder).
///
/// Output `c` is `(sum(filter[i] * src[c + i] for i in 0..8) + ((1 << sh) >> 1)) >> sh`,
/// so `src` is read up to index `w + 6`. The vector path accumulates in 16-bit lanes via
/// `maddubs`; the scalar remainder accumulates in `i32` and truncates to `i16`.
#[cfg(target_arch = "x86_64")]
#[rite]
fn h_filter_8tap_8bpc_avx512_inner(
_token: Server64,
dst: &mut [i16],
src: &[u8],
w: usize,
filter: &[i8; 8],
sh: u8,
) {
let mut dst = dst.flex_mut();
let src = src.flex();
// Each 16-bit lane carries two adjacent taps as bytes: even tap low, odd tap high.
let coeff_01 = _mm512_set1_epi16(((filter[1] as u8 as i16) << 8) | (filter[0] as u8 as i16));
let coeff_23 = _mm512_set1_epi16(((filter[3] as u8 as i16) << 8) | (filter[2] as u8 as i16));
let coeff_45 = _mm512_set1_epi16(((filter[5] as u8 as i16) << 8) | (filter[4] as u8 as i16));
let coeff_67 = _mm512_set1_epi16(((filter[7] as u8 as i16) << 8) | (filter[6] as u8 as i16));
// Rounding term: half of the divisor implied by the final shift.
let rnd = _mm512_set1_epi16((1i16 << sh) >> 1);
let mut col = 0usize;
while col + 32 <= w {
// Eight 32-byte windows, each shifted by one more pixel.
let s0 = loadu_256!(<&[u8; 32]>::try_from(&src[col..col + 32]).unwrap());
let s1 = loadu_256!(<&[u8; 32]>::try_from(&src[col + 1..col + 33]).unwrap());
let s2 = loadu_256!(<&[u8; 32]>::try_from(&src[col + 2..col + 34]).unwrap());
let s3 = loadu_256!(<&[u8; 32]>::try_from(&src[col + 3..col + 35]).unwrap());
let s4 = loadu_256!(<&[u8; 32]>::try_from(&src[col + 4..col + 36]).unwrap());
let s5 = loadu_256!(<&[u8; 32]>::try_from(&src[col + 5..col + 37]).unwrap());
let s6 = loadu_256!(<&[u8; 32]>::try_from(&src[col + 6..col + 38]).unwrap());
let s7 = loadu_256!(<&[u8; 32]>::try_from(&src[col + 7..col + 39]).unwrap());
// The 256-bit unpacks work per 128-bit lane, so `lo` holds the byte pairs for outputs
// 0..8 and 16..24 while `hi` holds 8..16 and 24..32. After the insert the four 128-bit
// lanes are therefore ordered [0..8, 16..24, 8..16, 24..32].
let lo01 = _mm256_unpacklo_epi8(s0, s1);
let hi01 = _mm256_unpackhi_epi8(s0, s1);
let p01 = _mm512_inserti64x4::<1>(_mm512_castsi256_si512(lo01), hi01);
let lo23 = _mm256_unpacklo_epi8(s2, s3);
let hi23 = _mm256_unpackhi_epi8(s2, s3);
let p23 = _mm512_inserti64x4::<1>(_mm512_castsi256_si512(lo23), hi23);
let lo45 = _mm256_unpacklo_epi8(s4, s5);
let hi45 = _mm256_unpackhi_epi8(s4, s5);
let p45 = _mm512_inserti64x4::<1>(_mm512_castsi256_si512(lo45), hi45);
let lo67 = _mm256_unpacklo_epi8(s6, s7);
let hi67 = _mm256_unpackhi_epi8(s6, s7);
let p67 = _mm512_inserti64x4::<1>(_mm512_castsi256_si512(lo67), hi67);
// unsigned pixel * signed tap, summed pairwise into 16-bit lanes.
let ma01 = _mm512_maddubs_epi16(p01, coeff_01);
let ma23 = _mm512_maddubs_epi16(p23, coeff_23);
let ma45 = _mm512_maddubs_epi16(p45, coeff_45);
let ma67 = _mm512_maddubs_epi16(p67, coeff_67);
let sum = _mm512_add_epi16(_mm512_add_epi16(ma01, ma23), _mm512_add_epi16(ma45, ma67));
let shift_count = _mm_cvtsi32_si128(sh as i32);
let result = _mm512_sra_epi16(_mm512_add_epi16(sum, rnd), shift_count);
// Swap the two middle 128-bit lanes to put the outputs back in ascending order.
let result = _mm512_shuffle_i64x2::<0b11_01_10_00>(result, result);
storeu_512!(
<&mut [i16; 32]>::try_from(&mut dst[col..col + 32]).unwrap(),
result
);
col += 32;
}
// Scalar remainder for the columns the 32-wide loop did not cover.
while col < w {
let mut sum = 0i32;
for i in 0..8 {
sum += filter[i] as i32 * src[col + i] as i32;
}
dst[col] = ((sum + ((1 << sh) >> 1)) >> sh) as i16;
col += 1;
}
}
/// Raw-pointer shim around [`h_filter_8tap_8bpc_avx2_inner`].
///
/// The inner kernel takes slices, so the pointers are wrapped before the call: `w`
/// outputs are written and, because every output reads 8 consecutive source pixels,
/// `w + 7` source bytes are read.
///
/// # Safety
///
/// * `dst` must be valid for writes of `w` `i16` values.
/// * `src` must be valid for reads of `w + 7` bytes when `w > 0`.
/// * The CPU must support whatever the forged `Desktop64` token stands for.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn h_filter_8tap_8bpc_avx2(
    dst: *mut i16,
    src: *const u8,
    w: usize,
    filter: &[i8; 8],
    sh: u8,
) {
    // Nothing is read when no output is requested.
    let src_len = if w == 0 { 0 } else { w + 7 };
    // SAFETY: the caller guarantees `dst` covers `w` elements.
    let dst = unsafe { std::slice::from_raw_parts_mut(dst, w) };
    // SAFETY: the caller guarantees `src` covers `w + 7` bytes.
    let src = unsafe { std::slice::from_raw_parts(src, src_len) };
    // SAFETY: the caller guarantees the required CPU features are present.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    unsafe { h_filter_8tap_8bpc_avx2_inner(token, dst, src, w, filter, sh) }
}
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_filter_8tap_8bpc_avx2_inner(
_token: Desktop64,
dst: &mut [u8],
mid: &[[i16; MID_STRIDE]],
w: usize,
filter: &[i8; 8],
sh: u8,
max: i32,
) {
let mut dst = dst.flex_mut();
let rnd = _mm256_set1_epi32((1i32 << sh) >> 1);
let zero = _mm256_setzero_si256();
let _max_vec = _mm256_set1_epi16(max as i16);
let c0 = _mm256_set1_epi32(filter[0] as i32);
let c1 = _mm256_set1_epi32(filter[1] as i32);
let c2 = _mm256_set1_epi32(filter[2] as i32);
let c3 = _mm256_set1_epi32(filter[3] as i32);
let c4 = _mm256_set1_epi32(filter[4] as i32);
let c5 = _mm256_set1_epi32(filter[5] as i32);
let c6 = _mm256_set1_epi32(filter[6] as i32);
let c7 = _mm256_set1_epi32(filter[7] as i32);
let mut col = 0usize;
while col + 8 <= w {
let m0 = _mm256_cvtepi16_epi32(loadu_128!(
<&[i16; 8]>::try_from(&mid[0][col..col + 8]).unwrap()
));
let m1 = _mm256_cvtepi16_epi32(loadu_128!(
<&[i16; 8]>::try_from(&mid[1][col..col + 8]).unwrap()
));
let m2 = _mm256_cvtepi16_epi32(loadu_128!(
<&[i16; 8]>::try_from(&mid[2][col..col + 8]).unwrap()
));
let m3 = _mm256_cvtepi16_epi32(loadu_128!(
<&[i16; 8]>::try_from(&mid[3][col..col + 8]).unwrap()
));
let m4 = _mm256_cvtepi16_epi32(loadu_128!(
<&[i16; 8]>::try_from(&mid[4][col..col + 8]).unwrap()
));
let m5 = _mm256_cvtepi16_epi32(loadu_128!(
<&[i16; 8]>::try_from(&mid[5][col..col + 8]).unwrap()
));
let m6 = _mm256_cvtepi16_epi32(loadu_128!(
<&[i16; 8]>::try_from(&mid[6][col..col + 8]).unwrap()
));
let m7 = _mm256_cvtepi16_epi32(loadu_128!(
<&[i16; 8]>::try_from(&mid[7][col..col + 8]).unwrap()
));
let mut sum = _mm256_mullo_epi32(m0, c0);
sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(m1, c1));
sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(m2, c2));
sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(m3, c3));
sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(m4, c4));
sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(m5, c5));
sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(m6, c6));
sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(m7, c7));
let shift_count = _mm_cvtsi32_si128(sh as i32);
let shifted = _mm256_sra_epi32(_mm256_add_epi32(sum, rnd), shift_count);
let clamped = _mm256_min_epi32(_mm256_max_epi32(shifted, zero), _mm256_set1_epi32(max));
let packed16 = _mm256_packs_epi32(clamped, clamped);
let packed16 = _mm256_permute4x64_epi64(packed16, 0b11011000);
let packed8 = _mm256_packus_epi16(packed16, packed16);
let result_64 = _mm256_extract_epi64(packed8, 0);
dst[col..col + 8].copy_from_slice(&result_64.to_ne_bytes());
col += 8;
}
while col < w {
let mut sum = 0i32;
for i in 0..8 {
sum += filter[i] as i32 * mid[i][col] as i32;
}
let val = ((sum + ((1 << sh) >> 1)) >> sh).clamp(0, max);
dst[col] = val as u8;
col += 1;
}
}
/// Raw-pointer shim around [`v_filter_8tap_8bpc_avx2_inner`].
///
/// The inner kernel takes a destination slice, so `dst` is wrapped as `w` bytes (one
/// per output pixel) before the call.
///
/// # Safety
///
/// * `dst` must be valid for writes of `w` bytes.
/// * The CPU must support whatever the forged `Desktop64` token stands for.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn v_filter_8tap_8bpc_avx2(
    dst: *mut u8,
    mid: &[[i16; MID_STRIDE]],
    w: usize,
    filter: &[i8; 8],
    sh: u8,
    max: i32,
) {
    // SAFETY: the caller guarantees `dst` covers `w` bytes.
    let dst = unsafe { std::slice::from_raw_parts_mut(dst, w) };
    // SAFETY: the caller guarantees the required CPU features are present.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    unsafe { v_filter_8tap_8bpc_avx2_inner(token, dst, mid, w, filter, sh, max) }
}
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_filter_8tap_8bpc_avx512_inner(
_token: Server64,
dst: &mut [u8],
mid: &[[i16; MID_STRIDE]],
w: usize,
filter: &[i8; 8],
sh: u8,
max: i32,
) {
let mut dst = dst.flex_mut();
let rnd = _mm512_set1_epi32((1i32 << sh) >> 1);
let zero = _mm512_setzero_si512();
let max_v = _mm512_set1_epi32(max);
let c0 = _mm512_set1_epi32(filter[0] as i32);
let c1 = _mm512_set1_epi32(filter[1] as i32);
let c2 = _mm512_set1_epi32(filter[2] as i32);
let c3 = _mm512_set1_epi32(filter[3] as i32);
let c4 = _mm512_set1_epi32(filter[4] as i32);
let c5 = _mm512_set1_epi32(filter[5] as i32);
let c6 = _mm512_set1_epi32(filter[6] as i32);
let c7 = _mm512_set1_epi32(filter[7] as i32);
let mut col = 0usize;
while col + 16 <= w {
let m0 = _mm512_cvtepi16_epi32(loadu_256!(
<&[i16; 16]>::try_from(&mid[0][col..col + 16]).unwrap()
));
let m1 = _mm512_cvtepi16_epi32(loadu_256!(
<&[i16; 16]>::try_from(&mid[1][col..col + 16]).unwrap()
));
let m2 = _mm512_cvtepi16_epi32(loadu_256!(
<&[i16; 16]>::try_from(&mid[2][col..col + 16]).unwrap()
));
let m3 = _mm512_cvtepi16_epi32(loadu_256!(
<&[i16; 16]>::try_from(&mid[3][col..col + 16]).unwrap()
));
let m4 = _mm512_cvtepi16_epi32(loadu_256!(
<&[i16; 16]>::try_from(&mid[4][col..col + 16]).unwrap()
));
let m5 = _mm512_cvtepi16_epi32(loadu_256!(
<&[i16; 16]>::try_from(&mid[5][col..col + 16]).unwrap()
));
let m6 = _mm512_cvtepi16_epi32(loadu_256!(
<&[i16; 16]>::try_from(&mid[6][col..col + 16]).unwrap()
));
let m7 = _mm512_cvtepi16_epi32(loadu_256!(
<&[i16; 16]>::try_from(&mid[7][col..col + 16]).unwrap()
));
let mut sum = _mm512_mullo_epi32(m0, c0);
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(m1, c1));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(m2, c2));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(m3, c3));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(m4, c4));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(m5, c5));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(m6, c6));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(m7, c7));
let shift_count = _mm_cvtsi32_si128(sh as i32);
let shifted = _mm512_sra_epi32(_mm512_add_epi32(sum, rnd), shift_count);
let clamped = _mm512_min_epi32(_mm512_max_epi32(shifted, zero), max_v);
let packed = _mm512_cvtusepi32_epi8(clamped);
storeu_128!(
<&mut [u8; 16]>::try_from(&mut dst[col..col + 16]).unwrap(),
packed
);
col += 16;
}
while col < w {
let mut sum = 0i32;
for i in 0..8 {
sum += filter[i] as i32 * mid[i][col] as i32;
}
let val = ((sum + ((1 << sh) >> 1)) >> sh).clamp(0, max);
dst[col] = val as u8;
col += 1;
}
}
/// Looks up the 8-tap sub-pixel filter for fractional position `m` and a filter mode.
///
/// Returns `None` when `m` is 0. Otherwise row `m - 1` is taken from the table indexed
/// by `filter_type as u8` when `d > 4`, or from table `3 + (filter_type as u8 & 1)` when
/// `d <= 4`.
#[inline]
fn get_filter_coeff(m: usize, d: usize, filter_type: Rav1dFilterMode) -> Option<&'static [i8; 8]> {
    if m == 0 {
        return None;
    }
    let mode = filter_type as u8;
    let table = if d <= 4 { 3 + (mode & 1) } else { mode };
    Some(&dav1d_mc_subpel_filters[table as usize][m - 1])
}
/// Horizontal 8-tap filter over one row of 8-bit pixels, producing final `u8` output (AVX2).
///
/// For each column `x` in `0..w` the result is
/// `clamp((sum(filter[i] * src[x + i] for i in 0..8) + 34) >> 6, 0, 255)`, as spelled out by
/// the scalar tail loop. The vector loop handles 16 columns per iteration.
///
/// `src` must provide at least `w + 7` pixels (the widest vector load ends at `col + 23`
/// with `col + 16 <= w`, and the tail reads `src[col + 7]`); `dst` must provide at least `w`.
///
/// NOTE(review): the vector path accumulates in 16-bit lanes (`maddubs` saturates each pair,
/// the following adds wrap) while the scalar tail accumulates in `i32`. Presumably the
/// coefficient tables keep every partial sum inside the `i16` range - confirm.
#[cfg(target_arch = "x86_64")]
#[rite]
fn h_filter_8tap_8bpc_put_avx2_inner(
_token: Desktop64,
dst: &mut [u8],
src: &[u8], w: usize,
filter: &[i8; 8],
) {
let mut dst = dst.flex_mut();
let src = src.flex();
// Pack two taps per 16-bit lane (even tap in the low byte, odd tap in the high byte) so
// they line up with the byte-interleaved pixel pairs fed to maddubs below.
let coeff_01 = _mm256_set1_epi16(((filter[1] as u8 as i16) << 8) | (filter[0] as u8 as i16));
let coeff_23 = _mm256_set1_epi16(((filter[3] as u8 as i16) << 8) | (filter[2] as u8 as i16));
let coeff_45 = _mm256_set1_epi16(((filter[5] as u8 as i16) << 8) | (filter[4] as u8 as i16));
let coeff_67 = _mm256_set1_epi16(((filter[7] as u8 as i16) << 8) | (filter[6] as u8 as i16));
// Bias added before the `>> 6` below; matches the `+ 34` of the scalar tail.
let rnd = _mm256_set1_epi16(34);
let zero = _mm256_setzero_si256();
let mut col = 0usize;
while col + 16 <= w {
// Eight overlapping 16-byte windows, one per tap offset.
let src_0_15 = loadu_128!(<&[u8; 16]>::try_from(&src[col..col + 16]).unwrap());
let src_1_16 = loadu_128!(<&[u8; 16]>::try_from(&src[col + 1..col + 17]).unwrap());
let src_2_17 = loadu_128!(<&[u8; 16]>::try_from(&src[col + 2..col + 18]).unwrap());
let src_3_18 = loadu_128!(<&[u8; 16]>::try_from(&src[col + 3..col + 19]).unwrap());
let src_4_19 = loadu_128!(<&[u8; 16]>::try_from(&src[col + 4..col + 20]).unwrap());
let src_5_20 = loadu_128!(<&[u8; 16]>::try_from(&src[col + 5..col + 21]).unwrap());
let src_6_21 = loadu_128!(<&[u8; 16]>::try_from(&src[col + 6..col + 22]).unwrap());
let src_7_22 = loadu_128!(<&[u8; 16]>::try_from(&src[col + 7..col + 23]).unwrap());
// Interleave adjacent windows byte-wise: the low 128 bits carry output columns 0..8,
// the high 128 bits carry columns 8..16.
let p01_lo = _mm_unpacklo_epi8(src_0_15, src_1_16);
let p01_hi = _mm_unpackhi_epi8(src_0_15, src_1_16);
let p01 = _mm256_set_m128i(p01_hi, p01_lo);
let p23_lo = _mm_unpacklo_epi8(src_2_17, src_3_18);
let p23_hi = _mm_unpackhi_epi8(src_2_17, src_3_18);
let p23 = _mm256_set_m128i(p23_hi, p23_lo);
let p45_lo = _mm_unpacklo_epi8(src_4_19, src_5_20);
let p45_hi = _mm_unpackhi_epi8(src_4_19, src_5_20);
let p45 = _mm256_set_m128i(p45_hi, p45_lo);
let p67_lo = _mm_unpacklo_epi8(src_6_21, src_7_22);
let p67_hi = _mm_unpackhi_epi8(src_6_21, src_7_22);
let p67 = _mm256_set_m128i(p67_hi, p67_lo);
// Unsigned pixel * signed tap, with each adjacent pair of products summed into i16.
let ma01 = _mm256_maddubs_epi16(p01, coeff_01);
let ma23 = _mm256_maddubs_epi16(p23, coeff_23);
let ma45 = _mm256_maddubs_epi16(p45, coeff_45);
let ma67 = _mm256_maddubs_epi16(p67, coeff_67);
let mut sum = _mm256_add_epi16(ma01, ma23);
sum = _mm256_add_epi16(sum, ma45);
sum = _mm256_add_epi16(sum, ma67);
let shift_count = _mm_cvtsi32_si128(6);
let shifted = _mm256_sra_epi16(_mm256_add_epi16(sum, rnd), shift_count);
let clamped = _mm256_max_epi16(_mm256_min_epi16(shifted, _mm256_set1_epi16(255)), zero);
// packus operates per 128-bit lane; the permute gathers the two useful 64-bit halves
// (columns 0..8 and 8..16) into the low 128 bits, which is what gets stored.
let packed = _mm256_packus_epi16(clamped, clamped);
let packed = _mm256_permute4x64_epi64(packed, 0b11011000);
storeu_128!(
<&mut [u8; 16]>::try_from(&mut dst[col..col + 16]).unwrap(),
_mm256_castsi256_si128(packed)
);
col += 16;
}
// Scalar tail for the remaining (< 16) columns.
while col < w {
let mut sum = 0i32;
for i in 0..8 {
sum += filter[i] as i32 * src[col + i] as i32;
}
let val = ((sum + 34) >> 6).clamp(0, 255);
dst[col] = val as u8;
col += 1;
}
}
/// Horizontal 8-tap filter over one row of 8-bit pixels, producing final `u8` output (AVX-512).
///
/// For each column `x` in `0..w` the result is
/// `clamp((sum(filter[i] * src[x + i] for i in 0..8) + 34) >> 6, 0, 255)`, as spelled out by
/// the scalar tail loop. The vector loop handles 32 columns per iteration.
///
/// `src` must provide at least `w + 7` pixels (the widest vector load ends at `col + 39`
/// with `col + 32 <= w`); `dst` must provide at least `w`.
///
/// NOTE(review): the vector path accumulates in 16-bit lanes (`maddubs` saturates each pair,
/// the following adds wrap) while the scalar tail accumulates in `i32`. Presumably the
/// coefficient tables keep every partial sum inside the `i16` range - confirm.
#[cfg(target_arch = "x86_64")]
#[rite]
fn h_filter_8tap_8bpc_put_avx512_inner(
_token: Server64,
dst: &mut [u8],
src: &[u8],
w: usize,
filter: &[i8; 8],
) {
let mut dst = dst.flex_mut();
let src = src.flex();
// Pack two taps per 16-bit lane (even tap in the low byte, odd tap in the high byte) so
// they line up with the byte-interleaved pixel pairs fed to maddubs below.
let coeff_01 = _mm512_set1_epi16(((filter[1] as u8 as i16) << 8) | (filter[0] as u8 as i16));
let coeff_23 = _mm512_set1_epi16(((filter[3] as u8 as i16) << 8) | (filter[2] as u8 as i16));
let coeff_45 = _mm512_set1_epi16(((filter[5] as u8 as i16) << 8) | (filter[4] as u8 as i16));
let coeff_67 = _mm512_set1_epi16(((filter[7] as u8 as i16) << 8) | (filter[6] as u8 as i16));
// Bias added before the `>> 6` below; matches the `+ 34` of the scalar tail.
let rnd = _mm512_set1_epi16(34);
let zero = _mm512_setzero_si512();
let max_v = _mm512_set1_epi16(255);
let mut col = 0usize;
while col + 32 <= w {
// Eight overlapping 32-byte windows, one per tap offset.
let s0 = loadu_256!(<&[u8; 32]>::try_from(&src[col..col + 32]).unwrap());
let s1 = loadu_256!(<&[u8; 32]>::try_from(&src[col + 1..col + 33]).unwrap());
let s2 = loadu_256!(<&[u8; 32]>::try_from(&src[col + 2..col + 34]).unwrap());
let s3 = loadu_256!(<&[u8; 32]>::try_from(&src[col + 3..col + 35]).unwrap());
let s4 = loadu_256!(<&[u8; 32]>::try_from(&src[col + 4..col + 36]).unwrap());
let s5 = loadu_256!(<&[u8; 32]>::try_from(&src[col + 5..col + 37]).unwrap());
let s6 = loadu_256!(<&[u8; 32]>::try_from(&src[col + 6..col + 38]).unwrap());
let s7 = loadu_256!(<&[u8; 32]>::try_from(&src[col + 7..col + 39]).unwrap());
// The 256-bit unpacks work per 128-bit lane, so after concatenating lo/hi the four
// 128-bit lanes hold output columns [0..8, 16..24, 8..16, 24..32]. The lane shuffle
// further down restores natural order.
let lo01 = _mm256_unpacklo_epi8(s0, s1);
let hi01 = _mm256_unpackhi_epi8(s0, s1);
let p01 = _mm512_inserti64x4::<1>(_mm512_castsi256_si512(lo01), hi01);
let lo23 = _mm256_unpacklo_epi8(s2, s3);
let hi23 = _mm256_unpackhi_epi8(s2, s3);
let p23 = _mm512_inserti64x4::<1>(_mm512_castsi256_si512(lo23), hi23);
let lo45 = _mm256_unpacklo_epi8(s4, s5);
let hi45 = _mm256_unpackhi_epi8(s4, s5);
let p45 = _mm512_inserti64x4::<1>(_mm512_castsi256_si512(lo45), hi45);
let lo67 = _mm256_unpacklo_epi8(s6, s7);
let hi67 = _mm256_unpackhi_epi8(s6, s7);
let p67 = _mm512_inserti64x4::<1>(_mm512_castsi256_si512(lo67), hi67);
// Unsigned pixel * signed tap, with each adjacent pair of products summed into i16.
let ma01 = _mm512_maddubs_epi16(p01, coeff_01);
let ma23 = _mm512_maddubs_epi16(p23, coeff_23);
let ma45 = _mm512_maddubs_epi16(p45, coeff_45);
let ma67 = _mm512_maddubs_epi16(p67, coeff_67);
let sum = _mm512_add_epi16(_mm512_add_epi16(ma01, ma23), _mm512_add_epi16(ma45, ma67));
let shift_count = _mm_cvtsi32_si128(6);
let shifted = _mm512_sra_epi16(_mm512_add_epi16(sum, rnd), shift_count);
let clamped = _mm512_max_epi16(_mm512_min_epi16(shifted, max_v), zero);
// Swap the two middle 128-bit lanes so the columns are back in order 0..32.
let clamped = _mm512_shuffle_i64x2::<0b11_01_10_00>(clamped, clamped);
// Values are already within 0..=255, so the saturating narrow is a plain truncation.
let packed = _mm512_cvtusepi16_epi8(clamped);
storeu_256!(
<&mut [u8; 32]>::try_from(&mut dst[col..col + 32]).unwrap(),
packed
);
col += 32;
}
// Scalar tail for the remaining (< 32) columns.
while col < w {
let mut sum = 0i32;
for i in 0..8 {
sum += filter[i] as i32 * src[col + i] as i32;
}
let val = ((sum + 34) >> 6).clamp(0, 255);
dst[col] = val as u8;
col += 1;
}
}
/// Raw-pointer wrapper (only built with the `asm` feature) around
/// `h_filter_8tap_8bpc_put_avx2_inner`.
///
/// Forges a `Desktop64` token and forwards `dst`, `src`, `w` and `filter` unchanged.
///
/// # Safety
/// Declared with `#[target_feature(enable = "avx2")]`, so the caller must ensure AVX2 is
/// available.
///
/// NOTE(review): `h_filter_8tap_8bpc_put_avx2_inner` is declared with `&mut [u8]` / `&[u8]`
/// parameters, while this wrapper passes `*mut u8` / `*const u8`. Confirm this still
/// compiles when the `asm` feature is enabled.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn h_filter_8tap_8bpc_put_avx2(
dst: *mut u8,
src: *const u8, w: usize,
filter: &[i8; 8],
) {
// The token is created without going through a detection API; presumably the
// `target_feature` contract above is what justifies it - confirm.
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe {
h_filter_8tap_8bpc_put_avx2_inner(
token, dst, src, w, filter,
)
}
}
/// Vertical 8-tap filter reading 8-bit pixels directly from `src` and writing final `u8`
/// output (AVX2).
///
/// For each column `x` in `0..w` the result is
/// `clamp((sum(filter[i] * src[i * src_stride + x] for i in 0..8) + 32) >> 6, 0, 255)`, as
/// spelled out by the scalar tail loop. `src` therefore starts at the row of the first tap.
/// The vector loop handles 8 columns per iteration, widening pixels to 32-bit lanes.
///
/// NOTE(review): `src_stride` is cast to `usize` for indexing, so a negative stride cannot
/// address rows here - presumably callers only pass non-negative strides; confirm.
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_filter_8tap_8bpc_direct_avx2_inner(
_token: Desktop64,
dst: &mut [u8],
src: &[u8], src_stride: isize,
w: usize,
filter: &[i8; 8],
) {
let mut dst = dst.flex_mut();
let src = src.flex();
// One broadcast 32-bit coefficient per tap.
let c0 = _mm256_set1_epi32(filter[0] as i32);
let c1 = _mm256_set1_epi32(filter[1] as i32);
let c2 = _mm256_set1_epi32(filter[2] as i32);
let c3 = _mm256_set1_epi32(filter[3] as i32);
let c4 = _mm256_set1_epi32(filter[4] as i32);
let c5 = _mm256_set1_epi32(filter[5] as i32);
let c6 = _mm256_set1_epi32(filter[6] as i32);
let c7 = _mm256_set1_epi32(filter[7] as i32);
// Rounding bias for the `>> 6` below.
let rnd = _mm256_set1_epi32(32);
let zero = _mm256_setzero_si256();
let max = _mm256_set1_epi32(255);
let mut col = 0usize;
while col + 8 <= w {
// Load 8 pixels from each of the 8 tap rows and zero-extend them to i32.
// `as` binds tighter than `*`, so `k * src_stride as usize` is `k * (src_stride as usize)`.
let p0 = _mm256_cvtepu8_epi32(loadi64!(&src[col..col + 8]));
let p1 = _mm256_cvtepu8_epi32(loadi64!(
&src[src_stride as usize + col..src_stride as usize + col + 8]
));
let p2 = _mm256_cvtepu8_epi32(loadi64!(
&src[2 * src_stride as usize + col..2 * src_stride as usize + col + 8]
));
let p3 = _mm256_cvtepu8_epi32(loadi64!(
&src[3 * src_stride as usize + col..3 * src_stride as usize + col + 8]
));
let p4 = _mm256_cvtepu8_epi32(loadi64!(
&src[4 * src_stride as usize + col..4 * src_stride as usize + col + 8]
));
let p5 = _mm256_cvtepu8_epi32(loadi64!(
&src[5 * src_stride as usize + col..5 * src_stride as usize + col + 8]
));
let p6 = _mm256_cvtepu8_epi32(loadi64!(
&src[6 * src_stride as usize + col..6 * src_stride as usize + col + 8]
));
let p7 = _mm256_cvtepu8_epi32(loadi64!(
&src[7 * src_stride as usize + col..7 * src_stride as usize + col + 8]
));
let mut sum = _mm256_mullo_epi32(p0, c0);
sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(p1, c1));
sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(p2, c2));
sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(p3, c3));
sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(p4, c4));
sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(p5, c5));
sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(p6, c6));
sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(p7, c7));
let shift_count = _mm_cvtsi32_si128(6);
let shifted = _mm256_sra_epi32(_mm256_add_epi32(sum, rnd), shift_count);
let clamped = _mm256_min_epi32(_mm256_max_epi32(shifted, zero), max);
// Narrow i32 -> i16 -> u8. The packs work per 128-bit lane, so the permute in between
// brings columns 0..4 and 4..8 next to each other; the low 64 bits of the final
// vector then hold the 8 output bytes in order.
let packed16 = _mm256_packs_epi32(clamped, clamped);
let packed16 = _mm256_permute4x64_epi64(packed16, 0b11011000);
let packed8 = _mm256_packus_epi16(packed16, packed16);
let result_64 = _mm256_extract_epi64(packed8, 0);
dst[col..col + 8].copy_from_slice(&result_64.to_ne_bytes());
col += 8;
}
// Scalar tail for the remaining (< 8) columns.
while col < w {
let mut sum = 0i32;
for i in 0..8 {
let px = src[(i as isize * src_stride) as usize + col] as i32;
sum += filter[i] as i32 * px;
}
let val = ((sum + 32) >> 6).clamp(0, 255);
dst[col] = val as u8;
col += 1;
}
}
/// Vertical 8-tap filter reading 8-bit pixels directly from `src` and writing final `u8`
/// output (AVX-512).
///
/// For each column `x` in `0..w` the result is
/// `clamp((sum(filter[i] * src[i * src_stride + x] for i in 0..8) + 32) >> 6, 0, 255)`, as
/// spelled out by the scalar tail loop. `src` therefore starts at the row of the first tap.
/// The vector loop handles 16 columns per iteration, widening pixels to 32-bit lanes.
///
/// NOTE(review): `src_stride` is cast to `usize` for indexing, so a negative stride cannot
/// address rows here - presumably callers only pass non-negative strides; confirm.
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_filter_8tap_8bpc_direct_avx512_inner(
_token: Server64,
dst: &mut [u8],
src: &[u8],
src_stride: isize,
w: usize,
filter: &[i8; 8],
) {
let mut dst = dst.flex_mut();
let src = src.flex();
// One broadcast 32-bit coefficient per tap.
let c0 = _mm512_set1_epi32(filter[0] as i32);
let c1 = _mm512_set1_epi32(filter[1] as i32);
let c2 = _mm512_set1_epi32(filter[2] as i32);
let c3 = _mm512_set1_epi32(filter[3] as i32);
let c4 = _mm512_set1_epi32(filter[4] as i32);
let c5 = _mm512_set1_epi32(filter[5] as i32);
let c6 = _mm512_set1_epi32(filter[6] as i32);
let c7 = _mm512_set1_epi32(filter[7] as i32);
// Rounding bias for the `>> 6` below.
let rnd = _mm512_set1_epi32(32);
let zero = _mm512_setzero_si512();
let max = _mm512_set1_epi32(255);
// Row pitch used by the vector loads; the scalar tail recomputes it from `src_stride`.
let stride = src_stride as usize;
let mut col = 0usize;
while col + 16 <= w {
// Load 16 pixels from each of the 8 tap rows and zero-extend them to i32.
let p0 = _mm512_cvtepu8_epi32(loadu_128!(
<&[u8; 16]>::try_from(&src[col..col + 16]).unwrap()
));
let p1 = _mm512_cvtepu8_epi32(loadu_128!(
<&[u8; 16]>::try_from(&src[stride + col..stride + col + 16]).unwrap()
));
let p2 = _mm512_cvtepu8_epi32(loadu_128!(
<&[u8; 16]>::try_from(&src[2 * stride + col..2 * stride + col + 16]).unwrap()
));
let p3 = _mm512_cvtepu8_epi32(loadu_128!(
<&[u8; 16]>::try_from(&src[3 * stride + col..3 * stride + col + 16]).unwrap()
));
let p4 = _mm512_cvtepu8_epi32(loadu_128!(
<&[u8; 16]>::try_from(&src[4 * stride + col..4 * stride + col + 16]).unwrap()
));
let p5 = _mm512_cvtepu8_epi32(loadu_128!(
<&[u8; 16]>::try_from(&src[5 * stride + col..5 * stride + col + 16]).unwrap()
));
let p6 = _mm512_cvtepu8_epi32(loadu_128!(
<&[u8; 16]>::try_from(&src[6 * stride + col..6 * stride + col + 16]).unwrap()
));
let p7 = _mm512_cvtepu8_epi32(loadu_128!(
<&[u8; 16]>::try_from(&src[7 * stride + col..7 * stride + col + 16]).unwrap()
));
let mut sum = _mm512_mullo_epi32(p0, c0);
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p1, c1));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p2, c2));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p3, c3));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p4, c4));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p5, c5));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p6, c6));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p7, c7));
let shift_count = _mm_cvtsi32_si128(6);
let shifted = _mm512_sra_epi32(_mm512_add_epi32(sum, rnd), shift_count);
let clamped = _mm512_min_epi32(_mm512_max_epi32(shifted, zero), max);
// Values are already within 0..=255, so the saturating narrow is a plain truncation.
let packed = _mm512_cvtusepi32_epi8(clamped);
storeu_128!(
<&mut [u8; 16]>::try_from(&mut dst[col..col + 16]).unwrap(),
packed
);
col += 16;
}
// Scalar tail for the remaining (< 16) columns.
while col < w {
let mut sum = 0i32;
for i in 0..8 {
let px = src[(i as isize * src_stride) as usize + col] as i32;
sum += filter[i] as i32 * px;
}
let val = ((sum + 32) >> 6).clamp(0, 255);
dst[col] = val as u8;
col += 1;
}
}
/// Raw-pointer wrapper (only built with the `asm` feature) around
/// `v_filter_8tap_8bpc_direct_avx2_inner`.
///
/// Forges a `Desktop64` token and forwards `dst`, `src`, `src_stride`, `w` and `filter`
/// unchanged.
///
/// # Safety
/// Declared with `#[target_feature(enable = "avx2")]`, so the caller must ensure AVX2 is
/// available.
///
/// NOTE(review): `v_filter_8tap_8bpc_direct_avx2_inner` is declared with `&mut [u8]` /
/// `&[u8]` parameters, while this wrapper passes `*mut u8` / `*const u8`. Confirm this
/// still compiles when the `asm` feature is enabled.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn v_filter_8tap_8bpc_direct_avx2(
dst: *mut u8,
src: *const u8, src_stride: isize,
w: usize,
filter: &[i8; 8],
) {
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe {
v_filter_8tap_8bpc_direct_avx2_inner(
token, dst, src, src_stride, w, filter,
)
}
}
/// 8 bpc `put` with 8-tap subpel filtering (AVX2), dispatching on which axes need a filter.
///
/// * `dst` / `dst_stride` - output rows; row `y` starts at `y * dst_stride`.
/// * `src` / `src_base` / `src_stride` - source pixels; block row `y` starts at
///   `src_base + y * src_stride`. Filtered paths read 3 pixels to the left of and/or
///   3 rows above that position.
/// * `w`, `h` - block size in pixels.
/// * `mx`, `my` - fractional positions; `0` disables filtering on that axis
///   (see `get_filter_coeff`).
/// * `h_filter_type`, `v_filter_type` - filter modes for each axis.
///
/// The four cases are: plain copy, horizontal only, vertical only, and horizontal followed
/// by vertical through a thread-local `i16` intermediate buffer of `h + 7` rows.
#[cfg(target_arch = "x86_64")]
#[rite]
fn put_8tap_8bpc_avx2_impl_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: isize,
    src: &[u8],
    src_base: usize,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    h_filter_type: Rav1dFilterMode,
    v_filter_type: Rav1dFilterMode,
) {
    let mut dst = dst.flex_mut();
    let src = src.flex();
    let (cols, rows) = (w as usize, h as usize);
    let origin = src_base as isize;
    // Extra precision bits kept between the horizontal and vertical passes.
    let intermediate_bits = 4u8;
    // Start offset of a source row, relative to the block's first row.
    let src_at = |row: isize| (origin + row * src_stride) as usize;
    // Start offset of an output row.
    let dst_at = |row: usize| (row as isize * dst_stride) as usize;
    let horiz = get_filter_coeff(mx as usize, cols, h_filter_type);
    let vert = get_filter_coeff(my as usize, rows, v_filter_type);
    match (horiz, vert) {
        // No fractional offset on either axis: straight row copy.
        (None, None) => {
            for y in 0..rows {
                let from = &src[src_at(y as isize)..];
                let to = &mut dst[dst_at(y)..];
                to[..cols].copy_from_slice(&from[..cols]);
            }
        }
        // Horizontal only: filter each row straight into `dst`.
        (Some(fh), None) => {
            for y in 0..rows {
                let start = src_at(y as isize);
                let to = &mut dst[dst_at(y)..];
                h_filter_8tap_8bpc_put_avx2_inner(_token, to, &src[start - 3..], cols, fh);
            }
        }
        // Vertical only: the kernel walks 8 source rows starting 3 rows above.
        (None, Some(fv)) => {
            for y in 0..rows {
                let from = &src[src_at(y as isize - 3)..];
                let to = &mut dst[dst_at(y)..];
                v_filter_8tap_8bpc_direct_avx2_inner(_token, to, from, src_stride, cols, fv);
            }
        }
        // Both axes: horizontal pass into the intermediate rows, then vertical pass.
        (Some(fh), Some(fv)) => {
            let mut mid = take_mid_i16_135();
            for y in 0..rows + 7 {
                let start = src_at(y as isize - 3);
                h_filter_8tap_8bpc_avx2_inner(
                    _token,
                    &mut mid[y],
                    &src[start - 3..],
                    cols,
                    fh,
                    6 - intermediate_bits,
                );
            }
            for y in 0..rows {
                let to = &mut dst[dst_at(y)..];
                v_filter_8tap_8bpc_avx2_inner(
                    _token,
                    to,
                    &mid[y..],
                    cols,
                    fv,
                    6 + intermediate_bits,
                    255,
                );
            }
            // Hand the buffer back to the thread-local slot for reuse.
            put_mid_i16_135(mid);
        }
    }
}
/// 8 bpc `put` with 8-tap subpel filtering (AVX-512), dispatching on which axes need a filter.
///
/// * `dst` / `dst_stride` - output rows; row `y` starts at `y * dst_stride`.
/// * `src` / `src_base` / `src_stride` - source pixels; block row `y` starts at
///   `src_base + y * src_stride`. Filtered paths read 3 pixels to the left of and/or
///   3 rows above that position.
/// * `w`, `h` - block size in pixels.
/// * `mx`, `my` - fractional positions; `0` disables filtering on that axis
///   (see `get_filter_coeff`).
/// * `h_filter_type`, `v_filter_type` - filter modes for each axis.
///
/// The four cases are: plain copy, horizontal only, vertical only, and horizontal followed
/// by vertical through a thread-local `i16` intermediate buffer of `h + 7` rows.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn put_8tap_8bpc_avx512_impl_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_stride: isize,
    src: &[u8],
    src_base: usize,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    h_filter_type: Rav1dFilterMode,
    v_filter_type: Rav1dFilterMode,
) {
    let mut dst = dst.flex_mut();
    let src = src.flex();
    let (cols, rows) = (w as usize, h as usize);
    let origin = src_base as isize;
    // Extra precision bits kept between the horizontal and vertical passes.
    let intermediate_bits = 4u8;
    // Start offset of a source row, relative to the block's first row.
    let src_at = |row: isize| (origin + row * src_stride) as usize;
    // Start offset of an output row.
    let dst_at = |row: usize| (row as isize * dst_stride) as usize;
    let horiz = get_filter_coeff(mx as usize, cols, h_filter_type);
    let vert = get_filter_coeff(my as usize, rows, v_filter_type);
    match (horiz, vert) {
        // No fractional offset on either axis: straight row copy.
        (None, None) => {
            for y in 0..rows {
                let from = &src[src_at(y as isize)..];
                let to = &mut dst[dst_at(y)..];
                to[..cols].copy_from_slice(&from[..cols]);
            }
        }
        // Horizontal only: filter each row straight into `dst`.
        (Some(fh), None) => {
            for y in 0..rows {
                let start = src_at(y as isize);
                let to = &mut dst[dst_at(y)..];
                h_filter_8tap_8bpc_put_avx512_inner(_token, to, &src[start - 3..], cols, fh);
            }
        }
        // Vertical only: the kernel walks 8 source rows starting 3 rows above.
        (None, Some(fv)) => {
            for y in 0..rows {
                let from = &src[src_at(y as isize - 3)..];
                let to = &mut dst[dst_at(y)..];
                v_filter_8tap_8bpc_direct_avx512_inner(_token, to, from, src_stride, cols, fv);
            }
        }
        // Both axes: horizontal pass into the intermediate rows, then vertical pass.
        (Some(fh), Some(fv)) => {
            let mut mid = take_mid_i16_135();
            for y in 0..rows + 7 {
                let start = src_at(y as isize - 3);
                h_filter_8tap_8bpc_avx512_inner(
                    _token,
                    &mut mid[y],
                    &src[start - 3..],
                    cols,
                    fh,
                    6 - intermediate_bits,
                );
            }
            for y in 0..rows {
                let to = &mut dst[dst_at(y)..];
                v_filter_8tap_8bpc_avx512_inner(
                    _token,
                    to,
                    &mid[y..],
                    cols,
                    fv,
                    6 + intermediate_bits,
                    255,
                );
            }
            // Hand the buffer back to the thread-local slot for reuse.
            put_mid_i16_135(mid);
        }
    }
}
/// Raw-pointer wrapper (only built with the `asm` feature) around
/// `put_8tap_8bpc_avx2_impl_inner`.
///
/// Forges a `Desktop64` token and forwards the arguments, passing `0` as the inner
/// function's `src_base`.
///
/// # Safety
/// Declared with `#[target_feature(enable = "avx2")]`, so the caller must ensure AVX2 is
/// available.
///
/// NOTE(review): `put_8tap_8bpc_avx2_impl_inner` is declared with `dst: &mut [u8]` and
/// `src: &[u8]`, while this wrapper passes `*mut DynPixel` / `*const DynPixel`. Confirm this
/// still compiles when the `asm` feature is enabled.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn put_8tap_8bpc_avx2_impl(
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
h_filter_type: Rav1dFilterMode,
v_filter_type: Rav1dFilterMode,
) {
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe {
put_8tap_8bpc_avx2_impl_inner(
token,
dst_ptr,
dst_stride,
src_ptr,
0,
src_stride,
w,
h,
mx,
my,
h_filter_type,
v_filter_type,
)
}
}
/// Generic C-ABI `put` 8-tap entry point for 8 bpc (AVX2), parameterised by the 2-D filter.
///
/// `FILTER` is converted with `Filter2d::from_repr` and split into its horizontal and
/// vertical filter modes via `hv()`, which are then handed to `put_8tap_8bpc_avx2_impl`
/// together with the pointer, stride, size and subpel arguments. `_bitdepth_max`, `_dst`
/// and `_src` are accepted but unused.
///
/// # Panics
/// Panics if `FILTER` is not a valid `Filter2d` representation (`from_repr(..).unwrap()`).
///
/// # Safety
/// Declared with `#[target_feature(enable = "avx2")]`, so the caller must ensure AVX2 is
/// available; pointer requirements are those of `put_8tap_8bpc_avx2_impl`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn put_8tap_8bpc_avx2<const FILTER: usize>(
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
_bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
_src: *const FFISafe<PicOffset>,
) {
let filter = Filter2d::from_repr(FILTER).unwrap();
let (h_filter, v_filter) = filter.hv();
unsafe {
put_8tap_8bpc_avx2_impl(
dst_ptr, dst_stride, src_ptr, src_stride, w, h, mx, my, h_filter, v_filter,
);
}
}
/// Token-carrying `#[arcane]` shim for the `Filter2d::Regular8Tap` put path.
///
/// Forwards every argument except `_token` unchanged to `put_8tap_8bpc_avx2` instantiated
/// with `Filter2d::Regular8Tap as usize`.
///
/// # Safety
/// The callee is an `unsafe fn`; its requirements on the pointers apply here as well.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn put_8tap_regular_8bpc_avx2_inner(
_token: Desktop64,
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
unsafe {
put_8tap_8bpc_avx2::<{ Filter2d::Regular8Tap as usize }>(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// C-ABI entry point for the 8 bpc `put` 8-tap filter, `Regular8Tap` variant (AVX2).
///
/// Forges a `Desktop64` token and forwards all arguments unchanged to
/// `put_8tap_regular_8bpc_avx2_inner`.
///
/// # Safety
/// Declared with `#[target_feature(enable = "avx2")]`, so the caller must ensure AVX2 is
/// available; pointer requirements are those of the callee.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn put_8tap_regular_8bpc_avx2(
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe {
put_8tap_regular_8bpc_avx2_inner(
token,
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// Token-carrying `#[arcane]` shim for the `Filter2d::RegularSmooth8Tap` put path.
///
/// Forwards every argument except `_token` unchanged to `put_8tap_8bpc_avx2` instantiated
/// with `Filter2d::RegularSmooth8Tap as usize`.
///
/// # Safety
/// The callee is an `unsafe fn`; its requirements on the pointers apply here as well.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn put_8tap_regular_smooth_8bpc_avx2_inner(
_token: Desktop64,
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
unsafe {
put_8tap_8bpc_avx2::<{ Filter2d::RegularSmooth8Tap as usize }>(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// C-ABI entry point for the 8 bpc `put` 8-tap filter, `RegularSmooth8Tap` variant (AVX2).
///
/// Forges a `Desktop64` token and forwards all arguments unchanged to
/// `put_8tap_regular_smooth_8bpc_avx2_inner`.
///
/// # Safety
/// Declared with `#[target_feature(enable = "avx2")]`, so the caller must ensure AVX2 is
/// available; pointer requirements are those of the callee.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn put_8tap_regular_smooth_8bpc_avx2(
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe {
put_8tap_regular_smooth_8bpc_avx2_inner(
token,
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// Token-carrying `#[arcane]` shim for the `Filter2d::RegularSharp8Tap` put path.
///
/// Forwards every argument except `_token` unchanged to `put_8tap_8bpc_avx2` instantiated
/// with `Filter2d::RegularSharp8Tap as usize`.
///
/// # Safety
/// The callee is an `unsafe fn`; its requirements on the pointers apply here as well.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn put_8tap_regular_sharp_8bpc_avx2_inner(
_token: Desktop64,
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
unsafe {
put_8tap_8bpc_avx2::<{ Filter2d::RegularSharp8Tap as usize }>(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// C-ABI entry point for the 8 bpc `put` 8-tap filter, `RegularSharp8Tap` variant (AVX2).
///
/// Forges a `Desktop64` token and forwards all arguments unchanged to
/// `put_8tap_regular_sharp_8bpc_avx2_inner`.
///
/// # Safety
/// Declared with `#[target_feature(enable = "avx2")]`, so the caller must ensure AVX2 is
/// available; pointer requirements are those of the callee.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn put_8tap_regular_sharp_8bpc_avx2(
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe {
put_8tap_regular_sharp_8bpc_avx2_inner(
token,
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// Token-carrying `#[arcane]` shim for the `Filter2d::SmoothRegular8Tap` put path.
///
/// Forwards every argument except `_token` unchanged to `put_8tap_8bpc_avx2` instantiated
/// with `Filter2d::SmoothRegular8Tap as usize`.
///
/// # Safety
/// The callee is an `unsafe fn`; its requirements on the pointers apply here as well.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn put_8tap_smooth_regular_8bpc_avx2_inner(
_token: Desktop64,
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
unsafe {
put_8tap_8bpc_avx2::<{ Filter2d::SmoothRegular8Tap as usize }>(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// C-ABI entry point for the 8 bpc `put` 8-tap filter, `SmoothRegular8Tap` variant (AVX2).
///
/// Forges a `Desktop64` token and forwards all arguments unchanged to
/// `put_8tap_smooth_regular_8bpc_avx2_inner`.
///
/// # Safety
/// Declared with `#[target_feature(enable = "avx2")]`, so the caller must ensure AVX2 is
/// available; pointer requirements are those of the callee.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn put_8tap_smooth_regular_8bpc_avx2(
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe {
put_8tap_smooth_regular_8bpc_avx2_inner(
token,
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// Token-carrying `#[arcane]` shim for the `Filter2d::Smooth8Tap` put path.
///
/// Forwards every argument except `_token` unchanged to `put_8tap_8bpc_avx2` instantiated
/// with `Filter2d::Smooth8Tap as usize`.
///
/// # Safety
/// The callee is an `unsafe fn`; its requirements on the pointers apply here as well.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn put_8tap_smooth_8bpc_avx2_inner(
_token: Desktop64,
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
unsafe {
put_8tap_8bpc_avx2::<{ Filter2d::Smooth8Tap as usize }>(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// C-ABI entry point for the 8 bpc `put` 8-tap filter, `Smooth8Tap` variant (AVX2).
///
/// Forges a `Desktop64` token and forwards all arguments unchanged to
/// `put_8tap_smooth_8bpc_avx2_inner`.
///
/// # Safety
/// Declared with `#[target_feature(enable = "avx2")]`, so the caller must ensure AVX2 is
/// available; pointer requirements are those of the callee.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn put_8tap_smooth_8bpc_avx2(
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe {
put_8tap_smooth_8bpc_avx2_inner(
token,
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// Token-carrying `#[arcane]` shim for the `Filter2d::SmoothSharp8Tap` put path.
///
/// Forwards every argument except `_token` unchanged to `put_8tap_8bpc_avx2` instantiated
/// with `Filter2d::SmoothSharp8Tap as usize`.
///
/// # Safety
/// The callee is an `unsafe fn`; its requirements on the pointers apply here as well.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn put_8tap_smooth_sharp_8bpc_avx2_inner(
_token: Desktop64,
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
unsafe {
put_8tap_8bpc_avx2::<{ Filter2d::SmoothSharp8Tap as usize }>(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// C-ABI entry point for the 8 bpc `put` 8-tap filter, `SmoothSharp8Tap` variant (AVX2).
///
/// Forges a `Desktop64` token and forwards all arguments unchanged to
/// `put_8tap_smooth_sharp_8bpc_avx2_inner`.
///
/// # Safety
/// Declared with `#[target_feature(enable = "avx2")]`, so the caller must ensure AVX2 is
/// available; pointer requirements are those of the callee.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn put_8tap_smooth_sharp_8bpc_avx2(
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe {
put_8tap_smooth_sharp_8bpc_avx2_inner(
token,
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// Token-carrying `#[arcane]` shim for the `Filter2d::SharpRegular8Tap` put path.
///
/// Forwards every argument except `_token` unchanged to `put_8tap_8bpc_avx2` instantiated
/// with `Filter2d::SharpRegular8Tap as usize`.
///
/// # Safety
/// The callee is an `unsafe fn`; its requirements on the pointers apply here as well.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn put_8tap_sharp_regular_8bpc_avx2_inner(
_token: Desktop64,
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
unsafe {
put_8tap_8bpc_avx2::<{ Filter2d::SharpRegular8Tap as usize }>(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// C-ABI entry point for the 8 bpc `put` 8-tap filter, `SharpRegular8Tap` variant (AVX2).
///
/// Forges a `Desktop64` token and forwards all arguments unchanged to
/// `put_8tap_sharp_regular_8bpc_avx2_inner`.
///
/// # Safety
/// Declared with `#[target_feature(enable = "avx2")]`, so the caller must ensure AVX2 is
/// available; pointer requirements are those of the callee.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn put_8tap_sharp_regular_8bpc_avx2(
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe {
put_8tap_sharp_regular_8bpc_avx2_inner(
token,
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// Token-carrying `#[arcane]` shim for the `Filter2d::SharpSmooth8Tap` put path.
///
/// Forwards every argument except `_token` unchanged to `put_8tap_8bpc_avx2` instantiated
/// with `Filter2d::SharpSmooth8Tap as usize`.
///
/// # Safety
/// The callee is an `unsafe fn`; its requirements on the pointers apply here as well.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn put_8tap_sharp_smooth_8bpc_avx2_inner(
_token: Desktop64,
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
unsafe {
put_8tap_8bpc_avx2::<{ Filter2d::SharpSmooth8Tap as usize }>(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// C-ABI entry point for the 8 bpc `put` 8-tap filter, `SharpSmooth8Tap` variant (AVX2).
///
/// Forges a `Desktop64` token and forwards all arguments unchanged to
/// `put_8tap_sharp_smooth_8bpc_avx2_inner`.
///
/// # Safety
/// Declared with `#[target_feature(enable = "avx2")]`, so the caller must ensure AVX2 is
/// available; pointer requirements are those of the callee.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn put_8tap_sharp_smooth_8bpc_avx2(
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe {
put_8tap_sharp_smooth_8bpc_avx2_inner(
token,
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// Token-carrying `#[arcane]` shim for the `Filter2d::Sharp8Tap` put path.
///
/// Forwards every argument except `_token` unchanged to `put_8tap_8bpc_avx2` instantiated
/// with `Filter2d::Sharp8Tap as usize`.
///
/// # Safety
/// The callee is an `unsafe fn`; its requirements on the pointers apply here as well.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn put_8tap_sharp_8bpc_avx2_inner(
_token: Desktop64,
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
unsafe {
put_8tap_8bpc_avx2::<{ Filter2d::Sharp8Tap as usize }>(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// C-ABI entry point for the 8 bpc `put` 8-tap filter, `Sharp8Tap` variant (AVX2).
///
/// Forges a `Desktop64` token and forwards all arguments unchanged to
/// `put_8tap_sharp_8bpc_avx2_inner`.
///
/// # Safety
/// Declared with `#[target_feature(enable = "avx2")]`, so the caller must ensure AVX2 is
/// available; pointer requirements are those of the callee.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn put_8tap_sharp_8bpc_avx2(
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe {
put_8tap_sharp_8bpc_avx2_inner(
token,
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// 8 bpc `prep` with 8-tap subpel filtering (AVX2): writes `i16` intermediates into `tmp`.
///
/// * `tmp` - output, densely packed: row `y` starts at `y * w`.
/// * `src` / `src_base` / `src_stride` - source pixels; block row `y` starts at
///   `src_base + y * src_stride`. Filtered paths read 3 pixels to the left of and/or
///   3 rows above that position.
/// * `w`, `h` - block size in pixels.
/// * `mx`, `my` - fractional positions; `0` disables filtering on that axis
///   (see `get_filter_coeff`).
/// * `h_filter_type`, `v_filter_type` - filter modes for each axis.
///
/// Cases: both axes (horizontal pass into a thread-local buffer of `h + 7` rows, then a
/// vertical pass), horizontal only, vertical only (pixels are pre-scaled by
/// `<< intermediate_bits` into 8 scratch rows and run through the vertical kernel), and
/// no filtering (pixels are just scaled by `<< intermediate_bits`).
#[cfg(target_arch = "x86_64")]
#[rite]
fn prep_8tap_8bpc_avx2_impl_inner(
    _token: Desktop64,
    tmp: &mut [i16],
    src: &[u8],
    src_base: usize,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    h_filter_type: Rav1dFilterMode,
    v_filter_type: Rav1dFilterMode,
) {
    let mut tmp = tmp.flex_mut();
    let src = src.flex();
    let w = w as usize;
    let h = h as usize;
    let mx = mx as usize;
    let my = my as usize;
    let sb = src_base as isize;
    let intermediate_bits = 4u8;
    let fh = get_filter_coeff(mx, w, h_filter_type);
    let fv = get_filter_coeff(my, h, v_filter_type);
    match (fh, fv) {
        (Some(fh), Some(fv)) => {
            let tmp_h = h + 7;
            let mut mid = take_mid_i16_135();
            // Horizontal pass over h + 7 rows, starting 3 rows above the block.
            for y in 0..tmp_h {
                let src_row_base = (sb + (y as isize - 3) * src_stride) as usize;
                h_filter_8tap_8bpc_avx2_inner(
                    _token,
                    &mut mid[y],
                    &src[src_row_base - 3..],
                    w,
                    fh,
                    6 - intermediate_bits,
                );
            }
            // Vertical pass: output row y consumes intermediate rows y..y + 8.
            for y in 0..h {
                let out_row = y * w;
                v_filter_8tap_to_i16_avx2_inner(_token, &mid[y..], &mut tmp[out_row..], w, fv, 6);
            }
            // Hand the buffer back to the thread-local slot for reuse.
            put_mid_i16_135(mid);
        }
        (Some(fh), None) => {
            for y in 0..h {
                let src_row_base = (sb + y as isize * src_stride) as usize;
                let out_row = y * w;
                h_filter_8tap_8bpc_avx2_inner(
                    _token,
                    &mut tmp[out_row..],
                    &src[src_row_base - 3..],
                    w,
                    fh,
                    6 - intermediate_bits,
                );
            }
        }
        (None, Some(fv)) => {
            // Scratch rows are allocated and zeroed once rather than per output row.
            // Columns `< w` are fully rewritten for every `y` and columns `>= w` are never
            // written, so the contents seen by the kernel are the same as with a fresh
            // array each iteration.
            let mut mid = [[0i16; MID_STRIDE]; 8];
            for y in 0..h {
                let out_row = y * w;
                for i in 0..8 {
                    let src_row =
                        &src[(sb + (y as isize + i as isize - 3) * src_stride) as usize..];
                    for x in 0..w {
                        mid[i][x] = (src_row[x] as i16) << intermediate_bits;
                    }
                }
                v_filter_8tap_to_i16_avx2_inner(_token, &mid, &mut tmp[out_row..], w, fv, 6);
            }
        }
        (None, None) => {
            for y in 0..h {
                let src_row_base = (sb + y as isize * src_stride) as usize;
                let src_row = &src[src_row_base..];
                let out_row = y * w;
                for x in 0..w {
                    tmp[out_row + x] = (src_row[x] as i16) << intermediate_bits;
                }
            }
        }
    }
}
/// 8 bpc `prep` with 8-tap subpel filtering (AVX-512): writes `i16` intermediates into `tmp`.
///
/// * `tmp` - output, densely packed: row `y` starts at `y * w`.
/// * `src` / `src_base` / `src_stride` - source pixels; block row `y` starts at
///   `src_base + y * src_stride`. Filtered paths read 3 pixels to the left of and/or
///   3 rows above that position.
/// * `w`, `h` - block size in pixels.
/// * `mx`, `my` - fractional positions; `0` disables filtering on that axis
///   (see `get_filter_coeff`).
/// * `h_filter_type`, `v_filter_type` - filter modes for each axis.
///
/// Cases: both axes (horizontal pass into a thread-local buffer of `h + 7` rows, then a
/// vertical pass), horizontal only, vertical only (pixels are pre-scaled by
/// `<< intermediate_bits` into 8 scratch rows and run through the vertical kernel), and
/// no filtering (pixels are just scaled by `<< intermediate_bits`).
#[cfg(target_arch = "x86_64")]
#[arcane]
fn prep_8tap_8bpc_avx512_impl_inner(
    _token: Server64,
    tmp: &mut [i16],
    src: &[u8],
    src_base: usize,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    h_filter_type: Rav1dFilterMode,
    v_filter_type: Rav1dFilterMode,
) {
    let mut tmp = tmp.flex_mut();
    let src = src.flex();
    let w = w as usize;
    let h = h as usize;
    let mx = mx as usize;
    let my = my as usize;
    let sb = src_base as isize;
    let intermediate_bits = 4u8;
    let fh = get_filter_coeff(mx, w, h_filter_type);
    let fv = get_filter_coeff(my, h, v_filter_type);
    match (fh, fv) {
        (Some(fh), Some(fv)) => {
            let tmp_h = h + 7;
            let mut mid = take_mid_i16_135();
            // Horizontal pass over h + 7 rows, starting 3 rows above the block.
            for y in 0..tmp_h {
                let src_row_base = (sb + (y as isize - 3) * src_stride) as usize;
                h_filter_8tap_8bpc_avx512_inner(
                    _token,
                    &mut mid[y],
                    &src[src_row_base - 3..],
                    w,
                    fh,
                    6 - intermediate_bits,
                );
            }
            // Vertical pass: output row y consumes intermediate rows y..y + 8.
            for y in 0..h {
                let out_row = y * w;
                v_filter_8tap_to_i16_avx512_inner(_token, &mid[y..], &mut tmp[out_row..], w, fv, 6);
            }
            // Hand the buffer back to the thread-local slot for reuse.
            put_mid_i16_135(mid);
        }
        (Some(fh), None) => {
            for y in 0..h {
                let src_row_base = (sb + y as isize * src_stride) as usize;
                let out_row = y * w;
                h_filter_8tap_8bpc_avx512_inner(
                    _token,
                    &mut tmp[out_row..],
                    &src[src_row_base - 3..],
                    w,
                    fh,
                    6 - intermediate_bits,
                );
            }
        }
        (None, Some(fv)) => {
            // Scratch rows are allocated and zeroed once rather than per output row.
            // Columns `< w` are fully rewritten for every `y` and columns `>= w` are never
            // written, so the contents seen by the kernel are the same as with a fresh
            // array each iteration.
            let mut mid = [[0i16; MID_STRIDE]; 8];
            for y in 0..h {
                let out_row = y * w;
                for i in 0..8 {
                    let src_row =
                        &src[(sb + (y as isize + i as isize - 3) * src_stride) as usize..];
                    for x in 0..w {
                        mid[i][x] = (src_row[x] as i16) << intermediate_bits;
                    }
                }
                v_filter_8tap_to_i16_avx512_inner(_token, &mid, &mut tmp[out_row..], w, fv, 6);
            }
        }
        (None, None) => {
            for y in 0..h {
                let src_row_base = (sb + y as isize * src_stride) as usize;
                let src_row = &src[src_row_base..];
                let out_row = y * w;
                for x in 0..w {
                    tmp[out_row + x] = (src_row[x] as i16) << intermediate_bits;
                }
            }
        }
    }
}
/// Raw-pointer wrapper (only built with the `asm` feature) around
/// `prep_8tap_8bpc_avx2_impl_inner`.
///
/// Forges a `Desktop64` token and forwards the arguments, passing `0` as the inner
/// function's `src_base`.
///
/// # Safety
/// Declared with `#[target_feature(enable = "avx2")]`, so the caller must ensure AVX2 is
/// available.
///
/// NOTE(review): `prep_8tap_8bpc_avx2_impl_inner` is declared with `tmp: &mut [i16]` and
/// `src: &[u8]`, while this wrapper passes `*mut i16` / `*const DynPixel`. Confirm this
/// still compiles when the `asm` feature is enabled.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn prep_8tap_8bpc_avx2_impl(
tmp: *mut i16,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
h_filter_type: Rav1dFilterMode,
v_filter_type: Rav1dFilterMode,
) {
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe {
prep_8tap_8bpc_avx2_impl_inner(
token,
tmp,
src_ptr,
0,
src_stride,
w,
h,
mx,
my,
h_filter_type,
v_filter_type,
)
}
}
/// Vertical 8-tap filter over `i16` intermediate rows, producing `i16` output (AVX2).
///
/// For each column `x` in `0..w`:
/// `dst[x] = (sum(filter[i] * mid[i][x] for i in 0..8) + ((1 << sh) >> 1)) >> sh`.
/// `mid` must contain at least 8 rows (rows `0..8` are read); `dst` must hold at least `w`.
/// The vector loop handles 8 columns per iteration in 32-bit lanes.
///
/// NOTE(review): the vector path narrows with `_mm256_packs_epi32` (signed saturation)
/// while the scalar tail narrows with `as i16` (truncation). They agree only while results
/// fit in `i16` - presumably always true for the callers' inputs; confirm.
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_filter_8tap_to_i16_avx2_inner(
_token: Desktop64,
mid: &[[i16; MID_STRIDE]],
dst: &mut [i16],
w: usize,
filter: &[i8; 8],
sh: u8,
) {
let mut dst = dst.flex_mut();
// Round-to-nearest bias for the arithmetic shift by `sh`.
let rnd = _mm256_set1_epi32((1i32 << sh) >> 1);
// One broadcast 32-bit coefficient per tap.
let c0 = _mm256_set1_epi32(filter[0] as i32);
let c1 = _mm256_set1_epi32(filter[1] as i32);
let c2 = _mm256_set1_epi32(filter[2] as i32);
let c3 = _mm256_set1_epi32(filter[3] as i32);
let c4 = _mm256_set1_epi32(filter[4] as i32);
let c5 = _mm256_set1_epi32(filter[5] as i32);
let c6 = _mm256_set1_epi32(filter[6] as i32);
let c7 = _mm256_set1_epi32(filter[7] as i32);
let mut col = 0usize;
while col + 8 <= w {
// Load 8 values from each of the 8 tap rows and sign-extend them to i32.
let m0 = _mm256_cvtepi16_epi32(loadu_128!(
<&[i16; 8]>::try_from(&mid[0][col..col + 8]).unwrap()
));
let m1 = _mm256_cvtepi16_epi32(loadu_128!(
<&[i16; 8]>::try_from(&mid[1][col..col + 8]).unwrap()
));
let m2 = _mm256_cvtepi16_epi32(loadu_128!(
<&[i16; 8]>::try_from(&mid[2][col..col + 8]).unwrap()
));
let m3 = _mm256_cvtepi16_epi32(loadu_128!(
<&[i16; 8]>::try_from(&mid[3][col..col + 8]).unwrap()
));
let m4 = _mm256_cvtepi16_epi32(loadu_128!(
<&[i16; 8]>::try_from(&mid[4][col..col + 8]).unwrap()
));
let m5 = _mm256_cvtepi16_epi32(loadu_128!(
<&[i16; 8]>::try_from(&mid[5][col..col + 8]).unwrap()
));
let m6 = _mm256_cvtepi16_epi32(loadu_128!(
<&[i16; 8]>::try_from(&mid[6][col..col + 8]).unwrap()
));
let m7 = _mm256_cvtepi16_epi32(loadu_128!(
<&[i16; 8]>::try_from(&mid[7][col..col + 8]).unwrap()
));
let mut sum = _mm256_mullo_epi32(m0, c0);
sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(m1, c1));
sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(m2, c2));
sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(m3, c3));
sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(m4, c4));
sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(m5, c5));
sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(m6, c6));
sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(m7, c7));
let shift_count = _mm_cvtsi32_si128(sh as i32);
let shifted = _mm256_sra_epi32(_mm256_add_epi32(sum, rnd), shift_count);
// packs works per 128-bit lane; the permute gathers columns 0..4 and 4..8 into the
// low 128 bits, which is what gets stored.
let packed = _mm256_packs_epi32(shifted, shifted);
let packed = _mm256_permute4x64_epi64(packed, 0b11011000);
storeu_128!(
<&mut [i16; 8]>::try_from(&mut dst[col..col + 8]).unwrap(),
_mm256_castsi256_si128(packed)
);
col += 8;
}
// Scalar tail for the remaining (< 8) columns.
while col < w {
let mut sum = 0i32;
for i in 0..8 {
sum += filter[i] as i32 * mid[i][col] as i32;
}
dst[col] = ((sum + ((1 << sh) >> 1)) >> sh) as i16;
col += 1;
}
}
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_filter_8tap_to_i16_avx512_inner(
_token: Server64,
mid: &[[i16; MID_STRIDE]],
dst: &mut [i16],
w: usize,
filter: &[i8; 8],
sh: u8,
) {
let mut dst = dst.flex_mut();
let rnd = _mm512_set1_epi32((1i32 << sh) >> 1);
let c0 = _mm512_set1_epi32(filter[0] as i32);
let c1 = _mm512_set1_epi32(filter[1] as i32);
let c2 = _mm512_set1_epi32(filter[2] as i32);
let c3 = _mm512_set1_epi32(filter[3] as i32);
let c4 = _mm512_set1_epi32(filter[4] as i32);
let c5 = _mm512_set1_epi32(filter[5] as i32);
let c6 = _mm512_set1_epi32(filter[6] as i32);
let c7 = _mm512_set1_epi32(filter[7] as i32);
let mut col = 0usize;
while col + 16 <= w {
let m0 = _mm512_cvtepi16_epi32(loadu_256!(
<&[i16; 16]>::try_from(&mid[0][col..col + 16]).unwrap()
));
let m1 = _mm512_cvtepi16_epi32(loadu_256!(
<&[i16; 16]>::try_from(&mid[1][col..col + 16]).unwrap()
));
let m2 = _mm512_cvtepi16_epi32(loadu_256!(
<&[i16; 16]>::try_from(&mid[2][col..col + 16]).unwrap()
));
let m3 = _mm512_cvtepi16_epi32(loadu_256!(
<&[i16; 16]>::try_from(&mid[3][col..col + 16]).unwrap()
));
let m4 = _mm512_cvtepi16_epi32(loadu_256!(
<&[i16; 16]>::try_from(&mid[4][col..col + 16]).unwrap()
));
let m5 = _mm512_cvtepi16_epi32(loadu_256!(
<&[i16; 16]>::try_from(&mid[5][col..col + 16]).unwrap()
));
let m6 = _mm512_cvtepi16_epi32(loadu_256!(
<&[i16; 16]>::try_from(&mid[6][col..col + 16]).unwrap()
));
let m7 = _mm512_cvtepi16_epi32(loadu_256!(
<&[i16; 16]>::try_from(&mid[7][col..col + 16]).unwrap()
));
let mut sum = _mm512_mullo_epi32(m0, c0);
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(m1, c1));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(m2, c2));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(m3, c3));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(m4, c4));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(m5, c5));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(m6, c6));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(m7, c7));
let shift_count = _mm_cvtsi32_si128(sh as i32);
let shifted = _mm512_sra_epi32(_mm512_add_epi32(sum, rnd), shift_count);
let packed = _mm512_cvtsepi32_epi16(shifted);
storeu_256!(
<&mut [i16; 16]>::try_from(&mut dst[col..col + 16]).unwrap(),
packed
);
col += 16;
}
while col < w {
let mut sum = 0i32;
for i in 0..8 {
sum += filter[i] as i32 * mid[i][col] as i32;
}
dst[col] = ((sum + ((1 << sh) >> 1)) >> sh) as i16;
col += 1;
}
}
/// `asm`-feature shim: forges a `Desktop64` token and forwards all arguments to
/// `v_filter_8tap_to_i16_avx2_inner`.
///
/// # Safety
/// Must only be called on a CPU that supports AVX2 (the function is compiled with
/// `#[target_feature(enable = "avx2")]`).
///
/// NOTE(review): `dst` is a raw pointer here, while the kernel defined just above
/// appears to take `dst: &mut [i16]` - confirm this call still type-checks when the
/// `asm` feature is enabled.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn v_filter_8tap_to_i16_avx2(
    mid: &[[i16; MID_STRIDE]],
    dst: *mut i16,
    w: usize,
    filter: &[i8; 8],
    sh: u8,
) {
    // SAFETY: the caller guarantees AVX2 (see `# Safety`).
    // NOTE(review): confirm AVX2 alone is enough to justify forging `Desktop64`.
    unsafe {
        let avx2 = Desktop64::forge_token_dangerously();
        v_filter_8tap_to_i16_avx2_inner(avx2, mid, dst, w, filter, sh)
    }
}
/// Generic C-ABI `prep` entry point, parameterised by the `Filter2d` discriminant `FILTER`.
///
/// Decodes `FILTER` into its horizontal/vertical filter pair via `Filter2d::hv` and
/// forwards to `prep_8tap_8bpc_avx2_impl`. `_bitdepth_max` and `_src` are unused.
///
/// # Panics
/// Panics if `FILTER` is not a valid `Filter2d` representation.
///
/// # Safety
/// Must only be called on a CPU that supports AVX2. The pointer arguments must meet
/// the requirements of `prep_8tap_8bpc_avx2_impl`, which are not visible in this block.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn prep_8tap_8bpc_avx2<const FILTER: usize>(
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    _bitdepth_max: i32,
    _src: *const FFISafe<PicOffset>,
) {
    let (h_filter, v_filter) = Filter2d::from_repr(FILTER).unwrap().hv();
    // SAFETY: pointer validity is delegated to this function's caller.
    unsafe {
        prep_8tap_8bpc_avx2_impl(tmp, src_ptr, src_stride, w, h, mx, my, h_filter, v_filter);
    }
}
/// Token-taking shim that instantiates `prep_8tap_8bpc_avx2` for `Filter2d::Regular8Tap`.
///
/// Every argument except `_token` is forwarded unchanged.
///
/// # Safety
/// Inherits the requirements of `prep_8tap_8bpc_avx2`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn prep_8tap_regular_8bpc_avx2_inner(
    _token: Desktop64,
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    const FILTER: usize = Filter2d::Regular8Tap as usize;
    // SAFETY: requirements are passed on to this function's caller.
    unsafe {
        prep_8tap_8bpc_avx2::<FILTER>(
            tmp, src_ptr, src_stride, w, h, mx, my, bitdepth_max, src,
        )
    }
}
/// C-ABI entry point that forges a `Desktop64` token and forwards every argument to
/// `prep_8tap_regular_8bpc_avx2_inner`.
///
/// # Safety
/// Must only be called on a CPU that supports AVX2; the pointer arguments must meet
/// the requirements of `prep_8tap_8bpc_avx2`, which ultimately receives them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn prep_8tap_regular_8bpc_avx2(
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    // SAFETY: the caller guarantees AVX2 and valid pointers (see `# Safety`).
    unsafe {
        let avx2 = Desktop64::forge_token_dangerously();
        prep_8tap_regular_8bpc_avx2_inner(
            avx2, tmp, src_ptr, src_stride, w, h, mx, my, bitdepth_max, src,
        )
    }
}
/// Token-taking shim that instantiates `prep_8tap_8bpc_avx2` for
/// `Filter2d::RegularSmooth8Tap`.
///
/// Every argument except `_token` is forwarded unchanged.
///
/// # Safety
/// Inherits the requirements of `prep_8tap_8bpc_avx2`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn prep_8tap_regular_smooth_8bpc_avx2_inner(
    _token: Desktop64,
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    const FILTER: usize = Filter2d::RegularSmooth8Tap as usize;
    // SAFETY: requirements are passed on to this function's caller.
    unsafe {
        prep_8tap_8bpc_avx2::<FILTER>(
            tmp, src_ptr, src_stride, w, h, mx, my, bitdepth_max, src,
        )
    }
}
/// C-ABI entry point that forges a `Desktop64` token and forwards every argument to
/// `prep_8tap_regular_smooth_8bpc_avx2_inner`.
///
/// # Safety
/// Must only be called on a CPU that supports AVX2; the pointer arguments must meet
/// the requirements of `prep_8tap_8bpc_avx2`, which ultimately receives them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn prep_8tap_regular_smooth_8bpc_avx2(
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    // SAFETY: the caller guarantees AVX2 and valid pointers (see `# Safety`).
    unsafe {
        let avx2 = Desktop64::forge_token_dangerously();
        prep_8tap_regular_smooth_8bpc_avx2_inner(
            avx2, tmp, src_ptr, src_stride, w, h, mx, my, bitdepth_max, src,
        )
    }
}
/// Token-taking shim that instantiates `prep_8tap_8bpc_avx2` for
/// `Filter2d::RegularSharp8Tap`.
///
/// Every argument except `_token` is forwarded unchanged.
///
/// # Safety
/// Inherits the requirements of `prep_8tap_8bpc_avx2`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn prep_8tap_regular_sharp_8bpc_avx2_inner(
    _token: Desktop64,
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    const FILTER: usize = Filter2d::RegularSharp8Tap as usize;
    // SAFETY: requirements are passed on to this function's caller.
    unsafe {
        prep_8tap_8bpc_avx2::<FILTER>(
            tmp, src_ptr, src_stride, w, h, mx, my, bitdepth_max, src,
        )
    }
}
/// C-ABI entry point that forges a `Desktop64` token and forwards every argument to
/// `prep_8tap_regular_sharp_8bpc_avx2_inner`.
///
/// # Safety
/// Must only be called on a CPU that supports AVX2; the pointer arguments must meet
/// the requirements of `prep_8tap_8bpc_avx2`, which ultimately receives them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn prep_8tap_regular_sharp_8bpc_avx2(
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    // SAFETY: the caller guarantees AVX2 and valid pointers (see `# Safety`).
    unsafe {
        let avx2 = Desktop64::forge_token_dangerously();
        prep_8tap_regular_sharp_8bpc_avx2_inner(
            avx2, tmp, src_ptr, src_stride, w, h, mx, my, bitdepth_max, src,
        )
    }
}
/// Token-taking shim that instantiates `prep_8tap_8bpc_avx2` for
/// `Filter2d::SmoothRegular8Tap`.
///
/// Every argument except `_token` is forwarded unchanged.
///
/// # Safety
/// Inherits the requirements of `prep_8tap_8bpc_avx2`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn prep_8tap_smooth_regular_8bpc_avx2_inner(
    _token: Desktop64,
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    const FILTER: usize = Filter2d::SmoothRegular8Tap as usize;
    // SAFETY: requirements are passed on to this function's caller.
    unsafe {
        prep_8tap_8bpc_avx2::<FILTER>(
            tmp, src_ptr, src_stride, w, h, mx, my, bitdepth_max, src,
        )
    }
}
/// C-ABI entry point that forges a `Desktop64` token and forwards every argument to
/// `prep_8tap_smooth_regular_8bpc_avx2_inner`.
///
/// # Safety
/// Must only be called on a CPU that supports AVX2; the pointer arguments must meet
/// the requirements of `prep_8tap_8bpc_avx2`, which ultimately receives them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn prep_8tap_smooth_regular_8bpc_avx2(
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    // SAFETY: the caller guarantees AVX2 and valid pointers (see `# Safety`).
    unsafe {
        let avx2 = Desktop64::forge_token_dangerously();
        prep_8tap_smooth_regular_8bpc_avx2_inner(
            avx2, tmp, src_ptr, src_stride, w, h, mx, my, bitdepth_max, src,
        )
    }
}
/// Token-taking shim that instantiates `prep_8tap_8bpc_avx2` for `Filter2d::Smooth8Tap`.
///
/// Every argument except `_token` is forwarded unchanged.
///
/// # Safety
/// Inherits the requirements of `prep_8tap_8bpc_avx2`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn prep_8tap_smooth_8bpc_avx2_inner(
    _token: Desktop64,
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    const FILTER: usize = Filter2d::Smooth8Tap as usize;
    // SAFETY: requirements are passed on to this function's caller.
    unsafe {
        prep_8tap_8bpc_avx2::<FILTER>(
            tmp, src_ptr, src_stride, w, h, mx, my, bitdepth_max, src,
        )
    }
}
/// C-ABI entry point that forges a `Desktop64` token and forwards every argument to
/// `prep_8tap_smooth_8bpc_avx2_inner`.
///
/// # Safety
/// Must only be called on a CPU that supports AVX2; the pointer arguments must meet
/// the requirements of `prep_8tap_8bpc_avx2`, which ultimately receives them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn prep_8tap_smooth_8bpc_avx2(
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    // SAFETY: the caller guarantees AVX2 and valid pointers (see `# Safety`).
    unsafe {
        let avx2 = Desktop64::forge_token_dangerously();
        prep_8tap_smooth_8bpc_avx2_inner(
            avx2, tmp, src_ptr, src_stride, w, h, mx, my, bitdepth_max, src,
        )
    }
}
/// Token-taking shim that instantiates `prep_8tap_8bpc_avx2` for
/// `Filter2d::SmoothSharp8Tap`.
///
/// Every argument except `_token` is forwarded unchanged.
///
/// # Safety
/// Inherits the requirements of `prep_8tap_8bpc_avx2`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn prep_8tap_smooth_sharp_8bpc_avx2_inner(
    _token: Desktop64,
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    const FILTER: usize = Filter2d::SmoothSharp8Tap as usize;
    // SAFETY: requirements are passed on to this function's caller.
    unsafe {
        prep_8tap_8bpc_avx2::<FILTER>(
            tmp, src_ptr, src_stride, w, h, mx, my, bitdepth_max, src,
        )
    }
}
/// C-ABI entry point that forges a `Desktop64` token and forwards every argument to
/// `prep_8tap_smooth_sharp_8bpc_avx2_inner`.
///
/// # Safety
/// Must only be called on a CPU that supports AVX2; the pointer arguments must meet
/// the requirements of `prep_8tap_8bpc_avx2`, which ultimately receives them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn prep_8tap_smooth_sharp_8bpc_avx2(
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    // SAFETY: the caller guarantees AVX2 and valid pointers (see `# Safety`).
    unsafe {
        let avx2 = Desktop64::forge_token_dangerously();
        prep_8tap_smooth_sharp_8bpc_avx2_inner(
            avx2, tmp, src_ptr, src_stride, w, h, mx, my, bitdepth_max, src,
        )
    }
}
/// Token-taking shim that instantiates `prep_8tap_8bpc_avx2` for
/// `Filter2d::SharpRegular8Tap`.
///
/// Every argument except `_token` is forwarded unchanged.
///
/// # Safety
/// Inherits the requirements of `prep_8tap_8bpc_avx2`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn prep_8tap_sharp_regular_8bpc_avx2_inner(
    _token: Desktop64,
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    const FILTER: usize = Filter2d::SharpRegular8Tap as usize;
    // SAFETY: requirements are passed on to this function's caller.
    unsafe {
        prep_8tap_8bpc_avx2::<FILTER>(
            tmp, src_ptr, src_stride, w, h, mx, my, bitdepth_max, src,
        )
    }
}
/// C-ABI entry point that forges a `Desktop64` token and forwards every argument to
/// `prep_8tap_sharp_regular_8bpc_avx2_inner`.
///
/// # Safety
/// Must only be called on a CPU that supports AVX2; the pointer arguments must meet
/// the requirements of `prep_8tap_8bpc_avx2`, which ultimately receives them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn prep_8tap_sharp_regular_8bpc_avx2(
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    // SAFETY: the caller guarantees AVX2 and valid pointers (see `# Safety`).
    unsafe {
        let avx2 = Desktop64::forge_token_dangerously();
        prep_8tap_sharp_regular_8bpc_avx2_inner(
            avx2, tmp, src_ptr, src_stride, w, h, mx, my, bitdepth_max, src,
        )
    }
}
/// Token-taking shim that instantiates `prep_8tap_8bpc_avx2` for
/// `Filter2d::SharpSmooth8Tap`.
///
/// Every argument except `_token` is forwarded unchanged.
///
/// # Safety
/// Inherits the requirements of `prep_8tap_8bpc_avx2`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn prep_8tap_sharp_smooth_8bpc_avx2_inner(
    _token: Desktop64,
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    const FILTER: usize = Filter2d::SharpSmooth8Tap as usize;
    // SAFETY: requirements are passed on to this function's caller.
    unsafe {
        prep_8tap_8bpc_avx2::<FILTER>(
            tmp, src_ptr, src_stride, w, h, mx, my, bitdepth_max, src,
        )
    }
}
/// C-ABI entry point that forges a `Desktop64` token and forwards every argument to
/// `prep_8tap_sharp_smooth_8bpc_avx2_inner`.
///
/// # Safety
/// Must only be called on a CPU that supports AVX2; the pointer arguments must meet
/// the requirements of `prep_8tap_8bpc_avx2`, which ultimately receives them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn prep_8tap_sharp_smooth_8bpc_avx2(
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    // SAFETY: the caller guarantees AVX2 and valid pointers (see `# Safety`).
    unsafe {
        let avx2 = Desktop64::forge_token_dangerously();
        prep_8tap_sharp_smooth_8bpc_avx2_inner(
            avx2, tmp, src_ptr, src_stride, w, h, mx, my, bitdepth_max, src,
        )
    }
}
/// Token-taking shim that instantiates `prep_8tap_8bpc_avx2` for `Filter2d::Sharp8Tap`.
///
/// Every argument except `_token` is forwarded unchanged.
///
/// # Safety
/// Inherits the requirements of `prep_8tap_8bpc_avx2`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn prep_8tap_sharp_8bpc_avx2_inner(
    _token: Desktop64,
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    const FILTER: usize = Filter2d::Sharp8Tap as usize;
    // SAFETY: requirements are passed on to this function's caller.
    unsafe {
        prep_8tap_8bpc_avx2::<FILTER>(
            tmp, src_ptr, src_stride, w, h, mx, my, bitdepth_max, src,
        )
    }
}
/// C-ABI entry point that forges a `Desktop64` token and forwards every argument to
/// `prep_8tap_sharp_8bpc_avx2_inner`.
///
/// # Safety
/// Must only be called on a CPU that supports AVX2; the pointer arguments must meet
/// the requirements of `prep_8tap_8bpc_avx2`, which ultimately receives them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn prep_8tap_sharp_8bpc_avx2(
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    // SAFETY: the caller guarantees AVX2 and valid pointers (see `# Safety`).
    unsafe {
        let avx2 = Desktop64::forge_token_dangerously();
        prep_8tap_sharp_8bpc_avx2_inner(
            avx2, tmp, src_ptr, src_stride, w, h, mx, my, bitdepth_max, src,
        )
    }
}
/// Horizontal 8-tap filter of one row of 16-bit pixels into an `i32` row (AVX2).
///
/// For every `x < w`:
/// `dst[x] = (sum(filter[i] * src[x + i] for i in 0..8) + ((1 << sh) >> 1)) >> sh`.
///
/// * `dst` - the first `w` elements are written.
/// * `src` - elements `src[0..w + 7]` are read, by both the vector path and the tail.
/// * `filter` - the eight signed taps.
/// * `sh` - arithmetic right-shift applied after rounding.
///
/// The vector path multiplies with `_mm256_madd_epi16`, which treats the pixels as
/// signed 16-bit values, so it matches the scalar tail only for samples <= 0x7fff.
#[cfg(target_arch = "x86_64")]
#[rite]
fn h_filter_8tap_16bpc_avx2_inner(
    _token: Desktop64,
    dst: &mut [i32],
    src: &[u16],
    w: usize,
    filter: &[i8; 8],
    sh: i32,
) {
    let mut dst = dst.flex_mut();
    let src = src.flex();
    // Each 32-bit lane packs two adjacent taps (low half = even tap, high half = odd tap)
    // so `_mm256_madd_epi16` yields `tap[2k] * px[n] + tap[2k + 1] * px[n + 1]`.
    let coeff0 =
        _mm256_set1_epi32(((filter[1] as i16 as i32) << 16) | (filter[0] as i16 as u16 as i32));
    let coeff2 =
        _mm256_set1_epi32(((filter[3] as i16 as i32) << 16) | (filter[2] as i16 as u16 as i32));
    let coeff4 =
        _mm256_set1_epi32(((filter[5] as i16 as i32) << 16) | (filter[4] as i16 as u16 as i32));
    let coeff6 =
        _mm256_set1_epi32(((filter[7] as i16 as i32) << 16) | (filter[6] as i16 as u16 as i32));
    let round = (1 << sh) >> 1;
    let rnd = _mm256_set1_epi32(round);
    let shift_count = _mm_cvtsi32_si128(sh);
    let mut col = 0usize;
    while col + 8 <= w {
        // a<k> holds src[col + k .. col + k + 8].
        let a0 = loadu_128!(<&[u16; 8]>::try_from(&src[col..col + 8]).unwrap());
        let a1 = loadu_128!(<&[u16; 8]>::try_from(&src[col + 1..col + 9]).unwrap());
        let a2 = loadu_128!(<&[u16; 8]>::try_from(&src[col + 2..col + 10]).unwrap());
        let a3 = loadu_128!(<&[u16; 8]>::try_from(&src[col + 3..col + 11]).unwrap());
        let a4 = loadu_128!(<&[u16; 8]>::try_from(&src[col + 4..col + 12]).unwrap());
        let a5 = loadu_128!(<&[u16; 8]>::try_from(&src[col + 5..col + 13]).unwrap());
        let a6 = loadu_128!(<&[u16; 8]>::try_from(&src[col + 6..col + 14]).unwrap());
        let a7 = loadu_128!(<&[u16; 8]>::try_from(&src[col + 7..col + 15]).unwrap());
        // Only the low four words of the "+8..+11" windows are consumed below, and those
        // are exactly the upper halves of a4..a7. Reusing them avoids four extra loads and
        // keeps every read inside src[..w + 7], the same window the scalar tail needs.
        let a8 = _mm_unpackhi_epi64(a4, a4);
        let a9 = _mm_unpackhi_epi64(a5, a5);
        let a10 = _mm_unpackhi_epi64(a6, a6);
        let a11 = _mm_unpackhi_epi64(a7, a7);
        // Low 128-bit lane serves outputs 0..4, high lane serves outputs 4..8.
        let s0 = _mm256_inserti128_si256(_mm256_castsi128_si256(a0), a4, 1);
        let s1 = _mm256_inserti128_si256(_mm256_castsi128_si256(a1), a5, 1);
        let s2 = _mm256_inserti128_si256(_mm256_castsi128_si256(a2), a6, 1);
        let s3 = _mm256_inserti128_si256(_mm256_castsi128_si256(a3), a7, 1);
        let s4 = _mm256_inserti128_si256(_mm256_castsi128_si256(a4), a8, 1);
        let s5 = _mm256_inserti128_si256(_mm256_castsi128_si256(a5), a9, 1);
        let s6 = _mm256_inserti128_si256(_mm256_castsi128_si256(a6), a10, 1);
        let s7 = _mm256_inserti128_si256(_mm256_castsi128_si256(a7), a11, 1);
        // Interleave neighbouring windows so each 32-bit lane holds a pixel pair.
        let p01 = _mm256_unpacklo_epi16(s0, s1);
        let p23 = _mm256_unpacklo_epi16(s2, s3);
        let p45 = _mm256_unpacklo_epi16(s4, s5);
        let p67 = _mm256_unpacklo_epi16(s6, s7);
        let ma01 = _mm256_madd_epi16(p01, coeff0);
        let ma23 = _mm256_madd_epi16(p23, coeff2);
        let ma45 = _mm256_madd_epi16(p45, coeff4);
        let ma67 = _mm256_madd_epi16(p67, coeff6);
        let sum = _mm256_add_epi32(_mm256_add_epi32(ma01, ma23), _mm256_add_epi32(ma45, ma67));
        let result = _mm256_sra_epi32(_mm256_add_epi32(sum, rnd), shift_count);
        storeu_256!(
            <&mut [i32; 8]>::try_from(&mut dst[col..col + 8]).unwrap(),
            result
        );
        col += 8;
    }
    // Scalar tail for the remaining (< 8) columns.
    while col < w {
        let mut sum = 0i32;
        for i in 0..8 {
            sum += filter[i] as i32 * src[col + i] as i32;
        }
        dst[col] = (sum + round) >> sh;
        col += 1;
    }
}
/// `asm`-feature shim: forges a `Desktop64` token and forwards all arguments to
/// `h_filter_8tap_16bpc_avx2_inner`.
///
/// # Safety
/// Must only be called on a CPU that supports AVX2 (the function is compiled with
/// `#[target_feature(enable = "avx2")]`).
///
/// NOTE(review): `dst` and `src` are raw pointers here, while
/// `h_filter_8tap_16bpc_avx2_inner` is declared with `&mut [i32]` / `&[u16]` -
/// confirm this call still type-checks when the `asm` feature is enabled.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn h_filter_8tap_16bpc_avx2(
    dst: *mut i32,
    src: *const u16,
    w: usize,
    filter: &[i8; 8],
    sh: i32,
) {
    // SAFETY: the caller guarantees AVX2 (see `# Safety`).
    unsafe {
        let avx2 = Desktop64::forge_token_dangerously();
        h_filter_8tap_16bpc_avx2_inner(avx2, dst, src, w, filter, sh)
    }
}
/// Vertical 8-tap filter over `i32` intermediate rows, producing clamped `u16` pixels (AVX2).
///
/// For every `x < w`:
/// `dst[x] = clamp((sum(filter[i] * mid[y + i][x] for i in 0..8) + ((1 << sh) >> 1)) >> sh, 0, max)`.
///
/// * `dst` - the first `w` elements are written.
/// * `mid` - rows `mid[y]` through `mid[y + 7]` are read.
/// * `y` - index of the first of the eight input rows.
/// * `filter` - the eight signed taps.
/// * `sh` - arithmetic right-shift applied after rounding.
/// * `max` - upper clamp bound; the lower bound is 0.
///
/// Products and sums use wrapping 32-bit arithmetic in the vector path.
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_filter_8tap_16bpc_avx2_inner(
    _token: Desktop64,
    dst: &mut [u16],
    mid: &[[i32; MID_STRIDE]],
    w: usize,
    y: usize,
    filter: &[i8; 8],
    sh: i32,
    max: i32,
) {
    let mut dst = dst.flex_mut();
    let coeff: [i32; 8] = [
        filter[0] as i32,
        filter[1] as i32,
        filter[2] as i32,
        filter[3] as i32,
        filter[4] as i32,
        filter[5] as i32,
        filter[6] as i32,
        filter[7] as i32,
    ];
    // Loop-invariant vector constants, broadcast once up front.
    let c0 = _mm256_set1_epi32(coeff[0]);
    let c1 = _mm256_set1_epi32(coeff[1]);
    let c2 = _mm256_set1_epi32(coeff[2]);
    let c3 = _mm256_set1_epi32(coeff[3]);
    let c4 = _mm256_set1_epi32(coeff[4]);
    let c5 = _mm256_set1_epi32(coeff[5]);
    let c6 = _mm256_set1_epi32(coeff[6]);
    let c7 = _mm256_set1_epi32(coeff[7]);
    let round = (1 << sh) >> 1;
    let rnd = _mm256_set1_epi32(round);
    let shift_count = _mm_cvtsi32_si128(sh);
    let zero = _mm256_setzero_si256();
    let max_val = _mm256_set1_epi32(max);
    let mut col = 0usize;
    while col + 8 <= w {
        let r0 = loadu_256!(<&[i32; 8]>::try_from(&mid[y + 0][col..col + 8]).unwrap());
        let r1 = loadu_256!(<&[i32; 8]>::try_from(&mid[y + 1][col..col + 8]).unwrap());
        let r2 = loadu_256!(<&[i32; 8]>::try_from(&mid[y + 2][col..col + 8]).unwrap());
        let r3 = loadu_256!(<&[i32; 8]>::try_from(&mid[y + 3][col..col + 8]).unwrap());
        let r4 = loadu_256!(<&[i32; 8]>::try_from(&mid[y + 4][col..col + 8]).unwrap());
        let r5 = loadu_256!(<&[i32; 8]>::try_from(&mid[y + 5][col..col + 8]).unwrap());
        let r6 = loadu_256!(<&[i32; 8]>::try_from(&mid[y + 6][col..col + 8]).unwrap());
        let r7 = loadu_256!(<&[i32; 8]>::try_from(&mid[y + 7][col..col + 8]).unwrap());
        let m0 = _mm256_mullo_epi32(r0, c0);
        let m1 = _mm256_mullo_epi32(r1, c1);
        let m2 = _mm256_mullo_epi32(r2, c2);
        let m3 = _mm256_mullo_epi32(r3, c3);
        let m4 = _mm256_mullo_epi32(r4, c4);
        let m5 = _mm256_mullo_epi32(r5, c5);
        let m6 = _mm256_mullo_epi32(r6, c6);
        let m7 = _mm256_mullo_epi32(r7, c7);
        let sum = _mm256_add_epi32(
            _mm256_add_epi32(_mm256_add_epi32(m0, m1), _mm256_add_epi32(m2, m3)),
            _mm256_add_epi32(_mm256_add_epi32(m4, m5), _mm256_add_epi32(m6, m7)),
        );
        let shifted = _mm256_sra_epi32(_mm256_add_epi32(sum, rnd), shift_count);
        let clamped_lo = _mm256_max_epi32(shifted, zero);
        let clamped = _mm256_min_epi32(clamped_lo, max_val);
        // packus works per 128-bit lane, leaving results in qwords 0 and 2;
        // the permute gathers them into the low 128 bits for the store.
        let packed = _mm256_packus_epi32(clamped, zero);
        let result = _mm256_permute4x64_epi64(packed, 0b00_00_10_00);
        storeu_128!(
            <&mut [u16; 8]>::try_from(&mut dst[col..col + 8]).unwrap(),
            _mm256_castsi256_si128(result)
        );
        col += 8;
    }
    // Scalar tail for the remaining (< 8) columns.
    while col < w {
        let mut sum = 0i32;
        for i in 0..8 {
            sum += coeff[i] * mid[y + i][col];
        }
        let val = ((sum + round) >> sh).clamp(0, max);
        dst[col] = val as u16;
        col += 1;
    }
}
/// `asm`-feature shim: forges a `Desktop64` token and forwards all arguments to
/// `v_filter_8tap_16bpc_avx2_inner`.
///
/// # Safety
/// Must only be called on a CPU that supports AVX2 (the function is compiled with
/// `#[target_feature(enable = "avx2")]`).
///
/// NOTE(review): `dst` is a raw pointer here, while `v_filter_8tap_16bpc_avx2_inner`
/// is declared with `dst: &mut [u16]` - confirm this call still type-checks when the
/// `asm` feature is enabled.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn v_filter_8tap_16bpc_avx2(
    dst: *mut u16,
    mid: &[[i32; MID_STRIDE]],
    w: usize,
    y: usize,
    filter: &[i8; 8],
    sh: i32,
    max: i32,
) {
    // SAFETY: the caller guarantees AVX2 (see `# Safety`).
    unsafe {
        let avx2 = Desktop64::forge_token_dangerously();
        v_filter_8tap_16bpc_avx2_inner(avx2, dst, mid, w, y, filter, sh, max)
    }
}
/// Vertical 8-tap filter over `i32` intermediate rows, producing biased `i16` output (AVX2).
///
/// For every `x < w`:
/// `dst[x] = ((sum(filter[i] * mid[y + i][x] for i in 0..8) + ((1 << sh) >> 1)) >> sh) - prep_bias`.
///
/// * `dst` - the first `w` elements are written.
/// * `mid` - rows `mid[y]` through `mid[y + 7]` are read.
/// * `y` - index of the first of the eight input rows.
/// * `filter` - the eight signed taps.
/// * `sh` - arithmetic right-shift applied after rounding.
/// * `prep_bias` - value subtracted after the shift.
///
/// The vector path narrows with signed saturation (`_mm256_packs_epi32`); the scalar
/// tail narrows with a plain `as i16` cast.
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_filter_8tap_16bpc_prep_avx2_inner(
    _token: Desktop64,
    dst: &mut [i16],
    mid: &[[i32; MID_STRIDE]],
    w: usize,
    y: usize,
    filter: &[i8; 8],
    sh: i32,
    prep_bias: i32,
) {
    let mut dst = dst.flex_mut();
    let coeff: [i32; 8] = [
        filter[0] as i32,
        filter[1] as i32,
        filter[2] as i32,
        filter[3] as i32,
        filter[4] as i32,
        filter[5] as i32,
        filter[6] as i32,
        filter[7] as i32,
    ];
    // Loop-invariant vector constants, broadcast once up front.
    let c0 = _mm256_set1_epi32(coeff[0]);
    let c1 = _mm256_set1_epi32(coeff[1]);
    let c2 = _mm256_set1_epi32(coeff[2]);
    let c3 = _mm256_set1_epi32(coeff[3]);
    let c4 = _mm256_set1_epi32(coeff[4]);
    let c5 = _mm256_set1_epi32(coeff[5]);
    let c6 = _mm256_set1_epi32(coeff[6]);
    let c7 = _mm256_set1_epi32(coeff[7]);
    let round = (1 << sh) >> 1;
    let rnd = _mm256_set1_epi32(round);
    let shift_count = _mm_cvtsi32_si128(sh);
    let bias = _mm256_set1_epi32(prep_bias);
    let mut col = 0usize;
    while col + 8 <= w {
        let r0 = loadu_256!(<&[i32; 8]>::try_from(&mid[y + 0][col..col + 8]).unwrap());
        let r1 = loadu_256!(<&[i32; 8]>::try_from(&mid[y + 1][col..col + 8]).unwrap());
        let r2 = loadu_256!(<&[i32; 8]>::try_from(&mid[y + 2][col..col + 8]).unwrap());
        let r3 = loadu_256!(<&[i32; 8]>::try_from(&mid[y + 3][col..col + 8]).unwrap());
        let r4 = loadu_256!(<&[i32; 8]>::try_from(&mid[y + 4][col..col + 8]).unwrap());
        let r5 = loadu_256!(<&[i32; 8]>::try_from(&mid[y + 5][col..col + 8]).unwrap());
        let r6 = loadu_256!(<&[i32; 8]>::try_from(&mid[y + 6][col..col + 8]).unwrap());
        let r7 = loadu_256!(<&[i32; 8]>::try_from(&mid[y + 7][col..col + 8]).unwrap());
        let m0 = _mm256_mullo_epi32(r0, c0);
        let m1 = _mm256_mullo_epi32(r1, c1);
        let m2 = _mm256_mullo_epi32(r2, c2);
        let m3 = _mm256_mullo_epi32(r3, c3);
        let m4 = _mm256_mullo_epi32(r4, c4);
        let m5 = _mm256_mullo_epi32(r5, c5);
        let m6 = _mm256_mullo_epi32(r6, c6);
        let m7 = _mm256_mullo_epi32(r7, c7);
        let sum = _mm256_add_epi32(
            _mm256_add_epi32(_mm256_add_epi32(m0, m1), _mm256_add_epi32(m2, m3)),
            _mm256_add_epi32(_mm256_add_epi32(m4, m5), _mm256_add_epi32(m6, m7)),
        );
        let shifted = _mm256_sra_epi32(_mm256_add_epi32(sum, rnd), shift_count);
        let biased = _mm256_sub_epi32(shifted, bias);
        // packs works per 128-bit lane; qwords 0 and 2 hold outputs 0..4 and 4..8,
        // and the permute gathers them into the low 128 bits for the store.
        let packed = _mm256_packs_epi32(biased, biased);
        let result = _mm256_permute4x64_epi64(packed, 0b00_00_10_00);
        storeu_128!(
            <&mut [i16; 8]>::try_from(&mut dst[col..col + 8]).unwrap(),
            _mm256_castsi256_si128(result)
        );
        col += 8;
    }
    // Scalar tail for the remaining (< 8) columns.
    while col < w {
        let mut sum = 0i32;
        for i in 0..8 {
            sum += coeff[i] * mid[y + i][col];
        }
        let val = ((sum + round) >> sh) - prep_bias;
        dst[col] = val as i16;
        col += 1;
    }
}
/// `asm`-feature shim: forges a `Desktop64` token and forwards all arguments to
/// `v_filter_8tap_16bpc_prep_avx2_inner`.
///
/// # Safety
/// Must only be called on a CPU that supports AVX2 (the function is compiled with
/// `#[target_feature(enable = "avx2")]`).
///
/// NOTE(review): `dst` is a raw pointer here, while
/// `v_filter_8tap_16bpc_prep_avx2_inner` is declared with `dst: &mut [i16]` -
/// confirm this call still type-checks when the `asm` feature is enabled.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn v_filter_8tap_16bpc_prep_avx2(
    dst: *mut i16,
    mid: &[[i32; MID_STRIDE]],
    w: usize,
    y: usize,
    filter: &[i8; 8],
    sh: i32,
    prep_bias: i32,
) {
    // SAFETY: the caller guarantees AVX2 (see `# Safety`).
    unsafe {
        let avx2 = Desktop64::forge_token_dangerously();
        v_filter_8tap_16bpc_prep_avx2_inner(avx2, dst, mid, w, y, filter, sh, prep_bias)
    }
}
/// Horizontal-only 8-tap `put` filter of one row of 16-bit pixels to clamped `u16` (AVX2).
///
/// For every `x < w`:
/// `dst[x] = clamp((sum(filter[i] * src[x + i] for i in 0..8) + rnd) >> 6, 0, max)`,
/// where `rnd = 32 + ((1 << (6 - intermediate_bits)) >> 1)` and `intermediate_bits`
/// is 2 when `max >> 11` is non-zero and 4 otherwise.
///
/// * `dst` - the first `w` elements are written.
/// * `src` - elements `src[0..w + 7]` are read, by both the vector path and the tail.
/// * `filter` - the eight signed taps.
/// * `max` - upper clamp bound; the lower bound is 0.
///
/// The vector path multiplies with `_mm256_madd_epi16`, which treats the pixels as
/// signed 16-bit values, so it matches the scalar tail only for samples <= 0x7fff.
#[cfg(target_arch = "x86_64")]
#[rite]
fn h_filter_8tap_16bpc_put_avx2_inner(
    _token: Desktop64,
    dst: &mut [u16],
    src: &[u16],
    w: usize,
    filter: &[i8; 8],
    max: i32,
) {
    let mut dst = dst.flex_mut();
    let src = src.flex();
    // Each 32-bit lane packs two adjacent taps (low half = even tap, high half = odd tap)
    // so `_mm256_madd_epi16` yields `tap[2k] * px[n] + tap[2k + 1] * px[n + 1]`.
    let coeff0 =
        _mm256_set1_epi32(((filter[1] as i16 as i32) << 16) | (filter[0] as i16 as u16 as i32));
    let coeff2 =
        _mm256_set1_epi32(((filter[3] as i16 as i32) << 16) | (filter[2] as i16 as u16 as i32));
    let coeff4 =
        _mm256_set1_epi32(((filter[5] as i16 as i32) << 16) | (filter[4] as i16 as u16 as i32));
    let coeff6 =
        _mm256_set1_epi32(((filter[7] as i16 as i32) << 16) | (filter[6] as i16 as u16 as i32));
    let intermediate_bits = if (max >> 11) != 0 { 2 } else { 4 };
    // One rounding constant shared by the vector path and the scalar tail.
    let scalar_rnd = 32 + ((1 << (6 - intermediate_bits)) >> 1);
    let rnd = _mm256_set1_epi32(scalar_rnd);
    let shift_count = _mm_cvtsi32_si128(6);
    let zero = _mm256_setzero_si256();
    let max_val = _mm256_set1_epi32(max);
    let mut col = 0usize;
    while col + 8 <= w {
        // a<k> holds src[col + k .. col + k + 8].
        let a0 = loadu_128!(<&[u16; 8]>::try_from(&src[col..col + 8]).unwrap());
        let a1 = loadu_128!(<&[u16; 8]>::try_from(&src[col + 1..col + 9]).unwrap());
        let a2 = loadu_128!(<&[u16; 8]>::try_from(&src[col + 2..col + 10]).unwrap());
        let a3 = loadu_128!(<&[u16; 8]>::try_from(&src[col + 3..col + 11]).unwrap());
        let a4 = loadu_128!(<&[u16; 8]>::try_from(&src[col + 4..col + 12]).unwrap());
        let a5 = loadu_128!(<&[u16; 8]>::try_from(&src[col + 5..col + 13]).unwrap());
        let a6 = loadu_128!(<&[u16; 8]>::try_from(&src[col + 6..col + 14]).unwrap());
        let a7 = loadu_128!(<&[u16; 8]>::try_from(&src[col + 7..col + 15]).unwrap());
        // Only the low four words of the "+8..+11" windows are consumed below, and those
        // are exactly the upper halves of a4..a7. Reusing them avoids four extra loads and
        // keeps every read inside src[..w + 7], the same window the scalar tail needs.
        let a8 = _mm_unpackhi_epi64(a4, a4);
        let a9 = _mm_unpackhi_epi64(a5, a5);
        let a10 = _mm_unpackhi_epi64(a6, a6);
        let a11 = _mm_unpackhi_epi64(a7, a7);
        // Low 128-bit lane serves outputs 0..4, high lane serves outputs 4..8.
        let s0 = _mm256_inserti128_si256(_mm256_castsi128_si256(a0), a4, 1);
        let s1 = _mm256_inserti128_si256(_mm256_castsi128_si256(a1), a5, 1);
        let s2 = _mm256_inserti128_si256(_mm256_castsi128_si256(a2), a6, 1);
        let s3 = _mm256_inserti128_si256(_mm256_castsi128_si256(a3), a7, 1);
        let s4 = _mm256_inserti128_si256(_mm256_castsi128_si256(a4), a8, 1);
        let s5 = _mm256_inserti128_si256(_mm256_castsi128_si256(a5), a9, 1);
        let s6 = _mm256_inserti128_si256(_mm256_castsi128_si256(a6), a10, 1);
        let s7 = _mm256_inserti128_si256(_mm256_castsi128_si256(a7), a11, 1);
        // Interleave neighbouring windows so each 32-bit lane holds a pixel pair.
        let p01 = _mm256_unpacklo_epi16(s0, s1);
        let p23 = _mm256_unpacklo_epi16(s2, s3);
        let p45 = _mm256_unpacklo_epi16(s4, s5);
        let p67 = _mm256_unpacklo_epi16(s6, s7);
        let ma01 = _mm256_madd_epi16(p01, coeff0);
        let ma23 = _mm256_madd_epi16(p23, coeff2);
        let ma45 = _mm256_madd_epi16(p45, coeff4);
        let ma67 = _mm256_madd_epi16(p67, coeff6);
        let sum = _mm256_add_epi32(_mm256_add_epi32(ma01, ma23), _mm256_add_epi32(ma45, ma67));
        let shifted = _mm256_sra_epi32(_mm256_add_epi32(sum, rnd), shift_count);
        let clamped_lo = _mm256_max_epi32(shifted, zero);
        let clamped = _mm256_min_epi32(clamped_lo, max_val);
        // packus works per 128-bit lane, leaving results in qwords 0 and 2;
        // the permute gathers them into the low 128 bits for the store.
        let packed = _mm256_packus_epi32(clamped, zero);
        let result = _mm256_permute4x64_epi64(packed, 0b00_00_10_00);
        storeu_128!(
            <&mut [u16; 8]>::try_from(&mut dst[col..col + 8]).unwrap(),
            _mm256_castsi256_si128(result)
        );
        col += 8;
    }
    // Scalar tail for the remaining (< 8) columns.
    while col < w {
        let mut sum = 0i32;
        for i in 0..8 {
            sum += filter[i] as i32 * src[col + i] as i32;
        }
        let val = ((sum + scalar_rnd) >> 6).clamp(0, max);
        dst[col] = val as u16;
        col += 1;
    }
}
/// `asm`-feature shim: forges a `Desktop64` token and forwards all arguments to
/// `h_filter_8tap_16bpc_put_avx2_inner`.
///
/// # Safety
/// Must only be called on a CPU that supports AVX2 (the function is compiled with
/// `#[target_feature(enable = "avx2")]`).
///
/// NOTE(review): `dst` and `src` are raw pointers here, while
/// `h_filter_8tap_16bpc_put_avx2_inner` is declared with `&mut [u16]` / `&[u16]` -
/// confirm this call still type-checks when the `asm` feature is enabled.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn h_filter_8tap_16bpc_put_avx2(
    dst: *mut u16,
    src: *const u16,
    w: usize,
    filter: &[i8; 8],
    max: i32,
) {
    // SAFETY: the caller guarantees AVX2 (see `# Safety`).
    unsafe {
        let avx2 = Desktop64::forge_token_dangerously();
        h_filter_8tap_16bpc_put_avx2_inner(avx2, dst, src, w, filter, max)
    }
}
/// Vertical-only 8-tap `put` filter reading 16-bit pixels straight from `src` (AVX2).
///
/// For every `x < w`:
/// `dst[x] = clamp((sum(filter[i] * src[i * src_stride + x] for i in 0..8) + 32) >> 6, 0, max)`.
///
/// * `dst` - the first `w` elements are written.
/// * `src` - eight rows are read, `src_stride` elements apart, starting at index 0.
/// * `src_stride` - row pitch in `u16` elements; it is cast to `usize`, so it is
///   expected to be non-negative.
/// * `filter` - the eight signed taps.
/// * `max` - upper clamp bound; the lower bound is 0.
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_filter_8tap_16bpc_direct_avx2_inner(
    _token: Desktop64,
    dst: &mut [u16],
    src: &[u16],
    src_stride: isize,
    w: usize,
    filter: &[i8; 8],
    max: i32,
) {
    let mut dst = dst.flex_mut();
    let src = src.flex();
    let stride_u = src_stride as usize;

    // Scalar taps for the tail, broadcast taps for the vector loop.
    let taps: [i32; 8] = (*filter).map(i32::from);
    let tap_v = [
        _mm256_set1_epi32(taps[0]),
        _mm256_set1_epi32(taps[1]),
        _mm256_set1_epi32(taps[2]),
        _mm256_set1_epi32(taps[3]),
        _mm256_set1_epi32(taps[4]),
        _mm256_set1_epi32(taps[5]),
        _mm256_set1_epi32(taps[6]),
        _mm256_set1_epi32(taps[7]),
    ];
    let half = _mm256_set1_epi32(32);
    let shift = _mm_cvtsi32_si128(6);
    let floor = _mm256_setzero_si256();
    let ceil = _mm256_set1_epi32(max);

    let mut col = 0usize;
    while col + 8 <= w {
        // Seed with the rounding term; 32-bit lane adds wrap, so accumulation
        // order does not affect the result.
        let mut acc = half;
        for (k, &tap) in tap_v.iter().enumerate() {
            let start = k * stride_u + col;
            let row = _mm256_cvtepu16_epi32(loadu_128!(
                <&[u16; 8]>::try_from(&src[start..start + 8]).unwrap()
            ));
            acc = _mm256_add_epi32(acc, _mm256_mullo_epi32(row, tap));
        }
        let scaled = _mm256_sra_epi32(acc, shift);
        let bounded = _mm256_min_epi32(_mm256_max_epi32(scaled, floor), ceil);
        // packus works per 128-bit lane, leaving results in qwords 0 and 2;
        // the permute gathers them into the low 128 bits for the store.
        let narrowed = _mm256_permute4x64_epi64(_mm256_packus_epi32(bounded, floor), 0b00_00_10_00);
        storeu_128!(
            <&mut [u16; 8]>::try_from(&mut dst[col..col + 8]).unwrap(),
            _mm256_castsi256_si128(narrowed)
        );
        col += 8;
    }

    // Scalar tail for the remaining (< 8) columns.
    for x in col..w {
        let mut sum = 0i32;
        for (k, &tap) in taps.iter().enumerate() {
            sum += tap * src[k * stride_u + x] as i32;
        }
        dst[x] = ((sum + 32) >> 6).clamp(0, max) as u16;
    }
}
/// `asm`-feature shim: forges a `Desktop64` token and forwards all arguments to
/// `v_filter_8tap_16bpc_direct_avx2_inner`.
///
/// # Safety
/// Must only be called on a CPU that supports AVX2 (the function is compiled with
/// `#[target_feature(enable = "avx2")]`).
///
/// NOTE(review): `dst` and `src` are raw pointers here, while
/// `v_filter_8tap_16bpc_direct_avx2_inner` is declared with `&mut [u16]` / `&[u16]` -
/// confirm this call still type-checks when the `asm` feature is enabled.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn v_filter_8tap_16bpc_direct_avx2(
    dst: *mut u16,
    src: *const u16,
    src_stride: isize,
    w: usize,
    filter: &[i8; 8],
    max: i32,
) {
    // SAFETY: the caller guarantees AVX2 (see `# Safety`).
    unsafe {
        let avx2 = Desktop64::forge_token_dangerously();
        v_filter_8tap_16bpc_direct_avx2_inner(avx2, dst, src, src_stride, w, filter, max)
    }
}
/// Horizontal-only 8-tap `prep` filter of one row of 16-bit pixels to biased `i16` (AVX2).
///
/// For every `x < w`:
/// `dst[x] = ((sum(filter[i] * src[x + i] for i in 0..8) + ((1 << sh) >> 1)) >> sh) - prep_bias`.
///
/// * `dst` - the first `w` elements are written.
/// * `src` - elements `src[0..w + 7]` are read, by both the vector path and the tail.
/// * `filter` - the eight signed taps.
/// * `sh` - arithmetic right-shift applied after rounding.
/// * `prep_bias` - value subtracted after the shift.
///
/// The vector path multiplies with `_mm256_madd_epi16` (pixels treated as signed 16-bit,
/// so samples must be <= 0x7fff to match the tail) and narrows with signed saturation;
/// the scalar tail narrows with a plain `as i16` cast.
#[cfg(target_arch = "x86_64")]
#[rite]
fn h_filter_8tap_16bpc_prep_direct_avx2_inner(
    _token: Desktop64,
    dst: &mut [i16],
    src: &[u16],
    w: usize,
    filter: &[i8; 8],
    sh: i32,
    prep_bias: i32,
) {
    let mut dst = dst.flex_mut();
    let src = src.flex();
    // Each 32-bit lane packs two adjacent taps (low half = even tap, high half = odd tap)
    // so `_mm256_madd_epi16` yields `tap[2k] * px[n] + tap[2k + 1] * px[n + 1]`.
    let coeff0 =
        _mm256_set1_epi32(((filter[1] as i16 as i32) << 16) | (filter[0] as i16 as u16 as i32));
    let coeff2 =
        _mm256_set1_epi32(((filter[3] as i16 as i32) << 16) | (filter[2] as i16 as u16 as i32));
    let coeff4 =
        _mm256_set1_epi32(((filter[5] as i16 as i32) << 16) | (filter[4] as i16 as u16 as i32));
    let coeff6 =
        _mm256_set1_epi32(((filter[7] as i16 as i32) << 16) | (filter[6] as i16 as u16 as i32));
    let round = (1 << sh) >> 1;
    let rnd = _mm256_set1_epi32(round);
    let shift_count = _mm_cvtsi32_si128(sh);
    let bias = _mm256_set1_epi32(prep_bias);
    let mut col = 0usize;
    while col + 8 <= w {
        // a<k> holds src[col + k .. col + k + 8].
        let a0 = loadu_128!(<&[u16; 8]>::try_from(&src[col..col + 8]).unwrap());
        let a1 = loadu_128!(<&[u16; 8]>::try_from(&src[col + 1..col + 9]).unwrap());
        let a2 = loadu_128!(<&[u16; 8]>::try_from(&src[col + 2..col + 10]).unwrap());
        let a3 = loadu_128!(<&[u16; 8]>::try_from(&src[col + 3..col + 11]).unwrap());
        let a4 = loadu_128!(<&[u16; 8]>::try_from(&src[col + 4..col + 12]).unwrap());
        let a5 = loadu_128!(<&[u16; 8]>::try_from(&src[col + 5..col + 13]).unwrap());
        let a6 = loadu_128!(<&[u16; 8]>::try_from(&src[col + 6..col + 14]).unwrap());
        let a7 = loadu_128!(<&[u16; 8]>::try_from(&src[col + 7..col + 15]).unwrap());
        // Only the low four words of the "+8..+11" windows are consumed below, and those
        // are exactly the upper halves of a4..a7. Reusing them avoids four extra loads and
        // keeps every read inside src[..w + 7], the same window the scalar tail needs.
        let a8 = _mm_unpackhi_epi64(a4, a4);
        let a9 = _mm_unpackhi_epi64(a5, a5);
        let a10 = _mm_unpackhi_epi64(a6, a6);
        let a11 = _mm_unpackhi_epi64(a7, a7);
        // Low 128-bit lane serves outputs 0..4, high lane serves outputs 4..8.
        let s0 = _mm256_inserti128_si256(_mm256_castsi128_si256(a0), a4, 1);
        let s1 = _mm256_inserti128_si256(_mm256_castsi128_si256(a1), a5, 1);
        let s2 = _mm256_inserti128_si256(_mm256_castsi128_si256(a2), a6, 1);
        let s3 = _mm256_inserti128_si256(_mm256_castsi128_si256(a3), a7, 1);
        let s4 = _mm256_inserti128_si256(_mm256_castsi128_si256(a4), a8, 1);
        let s5 = _mm256_inserti128_si256(_mm256_castsi128_si256(a5), a9, 1);
        let s6 = _mm256_inserti128_si256(_mm256_castsi128_si256(a6), a10, 1);
        let s7 = _mm256_inserti128_si256(_mm256_castsi128_si256(a7), a11, 1);
        // Interleave neighbouring windows so each 32-bit lane holds a pixel pair.
        let p01 = _mm256_unpacklo_epi16(s0, s1);
        let p23 = _mm256_unpacklo_epi16(s2, s3);
        let p45 = _mm256_unpacklo_epi16(s4, s5);
        let p67 = _mm256_unpacklo_epi16(s6, s7);
        let ma01 = _mm256_madd_epi16(p01, coeff0);
        let ma23 = _mm256_madd_epi16(p23, coeff2);
        let ma45 = _mm256_madd_epi16(p45, coeff4);
        let ma67 = _mm256_madd_epi16(p67, coeff6);
        let sum = _mm256_add_epi32(_mm256_add_epi32(ma01, ma23), _mm256_add_epi32(ma45, ma67));
        let shifted = _mm256_sra_epi32(_mm256_add_epi32(sum, rnd), shift_count);
        let biased = _mm256_sub_epi32(shifted, bias);
        // packs works per 128-bit lane; qwords 0 and 2 hold outputs 0..4 and 4..8,
        // and the permute gathers them into the low 128 bits for the store.
        let packed = _mm256_packs_epi32(biased, biased);
        let result = _mm256_permute4x64_epi64(packed, 0b00_00_10_00);
        storeu_128!(
            <&mut [i16; 8]>::try_from(&mut dst[col..col + 8]).unwrap(),
            _mm256_castsi256_si128(result)
        );
        col += 8;
    }
    // Scalar tail for the remaining (< 8) columns.
    while col < w {
        let mut sum = 0i32;
        for i in 0..8 {
            sum += filter[i] as i32 * src[col + i] as i32;
        }
        let val = ((sum + round) >> sh) - prep_bias;
        dst[col] = val as i16;
        col += 1;
    }
}
/// `asm`-feature shim: forges a `Desktop64` token and forwards all arguments to
/// `h_filter_8tap_16bpc_prep_direct_avx2_inner`.
///
/// # Safety
/// Must only be called on a CPU that supports AVX2 (the function is compiled with
/// `#[target_feature(enable = "avx2")]`).
///
/// NOTE(review): `dst` and `src` are raw pointers here, while
/// `h_filter_8tap_16bpc_prep_direct_avx2_inner` is declared with `&mut [i16]` /
/// `&[u16]` - confirm this call still type-checks when the `asm` feature is enabled.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn h_filter_8tap_16bpc_prep_direct_avx2(
    dst: *mut i16,
    src: *const u16,
    w: usize,
    filter: &[i8; 8],
    sh: i32,
    prep_bias: i32,
) {
    // SAFETY: the caller guarantees AVX2 (see `# Safety`).
    unsafe {
        let avx2 = Desktop64::forge_token_dangerously();
        h_filter_8tap_16bpc_prep_direct_avx2_inner(avx2, dst, src, w, filter, sh, prep_bias)
    }
}
/// Vertical 8-tap `prep` filter for one row of 16bpc pixels, read straight from `src` (AVX2).
///
/// For every column `x` in `0..w` this writes
/// `((sum_k filter[k] * src[k * src_stride + x] + round) >> sh) - prep_bias`
/// to `dst[x]`, where `round = (1 << sh) >> 1` and `k` runs over the eight taps.
///
/// `src_stride` is cast to `usize` and used as the element distance between rows.
/// Eight columns are produced per vector iteration; leftover columns go through a
/// scalar loop using the same formula (the vector path narrows with signed
/// saturation, the scalar path with an `as i16` cast).
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_filter_8tap_16bpc_prep_direct_avx2_inner(
    _token: Desktop64,
    dst: &mut [i16],
    src: &[u16],
    src_stride: isize,
    w: usize,
    filter: &[i8; 8],
    sh: i32,
    prep_bias: i32,
) {
    let mut dst = dst.flex_mut();
    let src = src.flex();

    // Widen the taps once; keep both scalar and broadcast forms around.
    let taps: [i32; 8] = core::array::from_fn(|k| filter[k] as i32);
    let mut tap_vecs = [_mm256_setzero_si256(); 8];
    for k in 0..8 {
        tap_vecs[k] = _mm256_set1_epi32(taps[k]);
    }

    let round = (1 << sh) >> 1;
    let round_vec = _mm256_set1_epi32(round);
    let shift_vec = _mm_cvtsi32_si128(sh);
    let bias_vec = _mm256_set1_epi32(prep_bias);
    let row_pitch = src_stride as usize;

    let mut x = 0usize;
    while x + 8 <= w {
        let mut acc = _mm256_setzero_si256();
        for k in 0..8 {
            let start = k * row_pitch + x;
            let row = _mm256_cvtepu16_epi32(loadu_128!(
                <&[u16; 8]>::try_from(&src[start..start + 8]).unwrap()
            ));
            acc = _mm256_add_epi32(acc, _mm256_mullo_epi32(row, tap_vecs[k]));
        }
        let scaled = _mm256_sra_epi32(_mm256_add_epi32(acc, round_vec), shift_vec);
        let unbiased = _mm256_sub_epi32(scaled, bias_vec);
        // `packs` works per 128-bit lane, so the eight results land in 64-bit
        // chunks 0 and 2; the permute gathers them into the low 128 bits.
        let narrowed = _mm256_packs_epi32(unbiased, unbiased);
        let gathered = _mm256_permute4x64_epi64(narrowed, 0b00_00_10_00);
        storeu_128!(
            <&mut [i16; 8]>::try_from(&mut dst[x..x + 8]).unwrap(),
            _mm256_castsi256_si128(gathered)
        );
        x += 8;
    }

    // Scalar remainder for widths that are not a multiple of eight.
    for tail_x in x..w {
        let mut acc = 0i32;
        for k in 0..8 {
            acc += taps[k] * src[k * row_pitch + tail_x] as i32;
        }
        dst[tail_x] = (((acc + round) >> sh) - prep_bias) as i16;
    }
}
/// Raw-pointer entry point for the vertical 8-tap 16bpc `prep` row filter (AVX2).
///
/// Only compiled when the `asm` feature is enabled. The body forges a
/// `Desktop64` token and forwards every argument, unchanged, to
/// `v_filter_8tap_16bpc_prep_direct_avx2_inner`.
///
/// NOTE(review): the inner function in this file is declared with
/// `dst: &mut [i16]` and `src: &[u16]`, while this wrapper hands it raw
/// pointers — verify that this still compiles with the `asm` feature enabled.
///
/// # Safety
///
/// NOTE(review): the token is forged with no visible runtime CPU check, so the
/// caller presumably has to guarantee AVX2 support and valid pointers — confirm.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn v_filter_8tap_16bpc_prep_direct_avx2(
dst: *mut i16,
src: *const u16,
src_stride: isize,
w: usize,
filter: &[i8; 8],
sh: i32,
prep_bias: i32,
) {
// Forged rather than detected; see the `# Safety` note above.
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe {
v_filter_8tap_16bpc_prep_direct_avx2_inner(
token, dst, src, src_stride, w, filter, sh, prep_bias,
)
}
}
#[cfg(target_arch = "x86_64")]
#[rite]
fn h_filter_8tap_16bpc_avx512_inner(
_token: Server64,
dst: &mut [i32],
src: &[u16],
w: usize,
filter: &[i8; 8],
sh: i32,
) {
let mut dst = dst.flex_mut();
let src = src.flex();
let c0 = _mm512_set1_epi32(filter[0] as i32);
let c1 = _mm512_set1_epi32(filter[1] as i32);
let c2 = _mm512_set1_epi32(filter[2] as i32);
let c3 = _mm512_set1_epi32(filter[3] as i32);
let c4 = _mm512_set1_epi32(filter[4] as i32);
let c5 = _mm512_set1_epi32(filter[5] as i32);
let c6 = _mm512_set1_epi32(filter[6] as i32);
let c7 = _mm512_set1_epi32(filter[7] as i32);
let rnd = _mm512_set1_epi32((1 << sh) >> 1);
let shift_count = _mm_cvtsi32_si128(sh);
let mut col = 0usize;
while col + 16 <= w {
let p0 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col..col + 16]).unwrap()
));
let p1 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col + 1..col + 17]).unwrap()
));
let p2 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col + 2..col + 18]).unwrap()
));
let p3 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col + 3..col + 19]).unwrap()
));
let p4 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col + 4..col + 20]).unwrap()
));
let p5 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col + 5..col + 21]).unwrap()
));
let p6 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col + 6..col + 22]).unwrap()
));
let p7 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col + 7..col + 23]).unwrap()
));
let mut sum = _mm512_mullo_epi32(p0, c0);
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p1, c1));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p2, c2));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p3, c3));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p4, c4));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p5, c5));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p6, c6));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p7, c7));
let result = _mm512_sra_epi32(_mm512_add_epi32(sum, rnd), shift_count);
storeu_512!(&mut dst[col..col + 16], [i32; 16], result);
col += 16;
}
while col < w {
let mut sum = 0i32;
for i in 0..8 {
sum += filter[i] as i32 * src[col + i] as i32;
}
let r = (1 << sh) >> 1;
dst[col] = (sum + r) >> sh;
col += 1;
}
}
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_filter_8tap_16bpc_avx512_inner(
_token: Server64,
dst: &mut [u16],
mid: &[[i32; MID_STRIDE]],
w: usize,
y: usize,
filter: &[i8; 8],
sh: i32,
max: i32,
) {
let mut dst = dst.flex_mut();
let c0 = _mm512_set1_epi32(filter[0] as i32);
let c1 = _mm512_set1_epi32(filter[1] as i32);
let c2 = _mm512_set1_epi32(filter[2] as i32);
let c3 = _mm512_set1_epi32(filter[3] as i32);
let c4 = _mm512_set1_epi32(filter[4] as i32);
let c5 = _mm512_set1_epi32(filter[5] as i32);
let c6 = _mm512_set1_epi32(filter[6] as i32);
let c7 = _mm512_set1_epi32(filter[7] as i32);
let rnd = _mm512_set1_epi32((1 << sh) >> 1);
let shift_count = _mm_cvtsi32_si128(sh);
let zero = _mm512_setzero_si512();
let max_val = _mm512_set1_epi32(max);
let mut col = 0usize;
while col + 16 <= w {
let r0 = loadu_512!(&mid[y + 0][col..col + 16], [i32; 16]);
let r1 = loadu_512!(&mid[y + 1][col..col + 16], [i32; 16]);
let r2 = loadu_512!(&mid[y + 2][col..col + 16], [i32; 16]);
let r3 = loadu_512!(&mid[y + 3][col..col + 16], [i32; 16]);
let r4 = loadu_512!(&mid[y + 4][col..col + 16], [i32; 16]);
let r5 = loadu_512!(&mid[y + 5][col..col + 16], [i32; 16]);
let r6 = loadu_512!(&mid[y + 6][col..col + 16], [i32; 16]);
let r7 = loadu_512!(&mid[y + 7][col..col + 16], [i32; 16]);
let mut sum = _mm512_mullo_epi32(r0, c0);
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(r1, c1));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(r2, c2));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(r3, c3));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(r4, c4));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(r5, c5));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(r6, c6));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(r7, c7));
let shifted = _mm512_sra_epi32(_mm512_add_epi32(sum, rnd), shift_count);
let clamped = _mm512_min_epi32(_mm512_max_epi32(shifted, zero), max_val);
let packed = _mm512_cvtusepi32_epi16(clamped);
storeu_256!(
<&mut [u16; 16]>::try_from(&mut dst[col..col + 16]).unwrap(),
packed
);
col += 16;
}
while col < w {
let coeff: [i32; 8] = core::array::from_fn(|i| filter[i] as i32);
let mut sum = 0i32;
for i in 0..8 {
sum += coeff[i] * mid[y + i][col];
}
let r = (1 << sh) >> 1;
let val = ((sum + r) >> sh).clamp(0, max);
dst[col] = val as u16;
col += 1;
}
}
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_filter_8tap_16bpc_prep_avx512_inner(
_token: Server64,
dst: &mut [i16],
mid: &[[i32; MID_STRIDE]],
w: usize,
y: usize,
filter: &[i8; 8],
sh: i32,
prep_bias: i32,
) {
let mut dst = dst.flex_mut();
let c0 = _mm512_set1_epi32(filter[0] as i32);
let c1 = _mm512_set1_epi32(filter[1] as i32);
let c2 = _mm512_set1_epi32(filter[2] as i32);
let c3 = _mm512_set1_epi32(filter[3] as i32);
let c4 = _mm512_set1_epi32(filter[4] as i32);
let c5 = _mm512_set1_epi32(filter[5] as i32);
let c6 = _mm512_set1_epi32(filter[6] as i32);
let c7 = _mm512_set1_epi32(filter[7] as i32);
let rnd = _mm512_set1_epi32((1 << sh) >> 1);
let shift_count = _mm_cvtsi32_si128(sh);
let bias = _mm512_set1_epi32(prep_bias);
let mut col = 0usize;
while col + 16 <= w {
let r0 = loadu_512!(&mid[y + 0][col..col + 16], [i32; 16]);
let r1 = loadu_512!(&mid[y + 1][col..col + 16], [i32; 16]);
let r2 = loadu_512!(&mid[y + 2][col..col + 16], [i32; 16]);
let r3 = loadu_512!(&mid[y + 3][col..col + 16], [i32; 16]);
let r4 = loadu_512!(&mid[y + 4][col..col + 16], [i32; 16]);
let r5 = loadu_512!(&mid[y + 5][col..col + 16], [i32; 16]);
let r6 = loadu_512!(&mid[y + 6][col..col + 16], [i32; 16]);
let r7 = loadu_512!(&mid[y + 7][col..col + 16], [i32; 16]);
let mut sum = _mm512_mullo_epi32(r0, c0);
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(r1, c1));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(r2, c2));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(r3, c3));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(r4, c4));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(r5, c5));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(r6, c6));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(r7, c7));
let shifted = _mm512_sra_epi32(_mm512_add_epi32(sum, rnd), shift_count);
let biased = _mm512_sub_epi32(shifted, bias);
let packed = _mm512_cvtsepi32_epi16(biased);
storeu_256!(
<&mut [i16; 16]>::try_from(&mut dst[col..col + 16]).unwrap(),
packed
);
col += 16;
}
while col < w {
let coeff: [i32; 8] = core::array::from_fn(|i| filter[i] as i32);
let mut sum = 0i32;
for i in 0..8 {
sum += coeff[i] * mid[y + i][col];
}
let r = (1 << sh) >> 1;
let val = ((sum + r) >> sh) - prep_bias;
dst[col] = val as i16;
col += 1;
}
}
#[cfg(target_arch = "x86_64")]
#[rite]
fn h_filter_8tap_16bpc_put_avx512_inner(
_token: Server64,
dst: &mut [u16],
src: &[u16],
w: usize,
filter: &[i8; 8],
max: i32,
) {
let mut dst = dst.flex_mut();
let src = src.flex();
let c0 = _mm512_set1_epi32(filter[0] as i32);
let c1 = _mm512_set1_epi32(filter[1] as i32);
let c2 = _mm512_set1_epi32(filter[2] as i32);
let c3 = _mm512_set1_epi32(filter[3] as i32);
let c4 = _mm512_set1_epi32(filter[4] as i32);
let c5 = _mm512_set1_epi32(filter[5] as i32);
let c6 = _mm512_set1_epi32(filter[6] as i32);
let c7 = _mm512_set1_epi32(filter[7] as i32);
let intermediate_bits = if (max >> 11) != 0 { 2 } else { 4 };
let rnd = _mm512_set1_epi32(32 + ((1 << (6 - intermediate_bits)) >> 1));
let shift_count = _mm_cvtsi32_si128(6);
let zero = _mm512_setzero_si512();
let max_val = _mm512_set1_epi32(max);
let mut col = 0usize;
while col + 16 <= w {
let p0 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col..col + 16]).unwrap()
));
let p1 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col + 1..col + 17]).unwrap()
));
let p2 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col + 2..col + 18]).unwrap()
));
let p3 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col + 3..col + 19]).unwrap()
));
let p4 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col + 4..col + 20]).unwrap()
));
let p5 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col + 5..col + 21]).unwrap()
));
let p6 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col + 6..col + 22]).unwrap()
));
let p7 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col + 7..col + 23]).unwrap()
));
let mut sum = _mm512_mullo_epi32(p0, c0);
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p1, c1));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p2, c2));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p3, c3));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p4, c4));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p5, c5));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p6, c6));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p7, c7));
let shifted = _mm512_sra_epi32(_mm512_add_epi32(sum, rnd), shift_count);
let clamped = _mm512_min_epi32(_mm512_max_epi32(shifted, zero), max_val);
let packed = _mm512_cvtusepi32_epi16(clamped);
storeu_256!(
<&mut [u16; 16]>::try_from(&mut dst[col..col + 16]).unwrap(),
packed
);
col += 16;
}
let scalar_rnd = 32 + ((1 << (6 - intermediate_bits)) >> 1);
while col < w {
let mut sum = 0i32;
for i in 0..8 {
sum += filter[i] as i32 * src[col + i] as i32;
}
let val = ((sum + scalar_rnd) >> 6).clamp(0, max);
dst[col] = val as u16;
col += 1;
}
}
#[cfg(target_arch = "x86_64")]
#[rite]
fn h_filter_8tap_16bpc_prep_direct_avx512_inner(
_token: Server64,
dst: &mut [i16],
src: &[u16],
w: usize,
filter: &[i8; 8],
sh: i32,
prep_bias: i32,
) {
let mut dst = dst.flex_mut();
let src = src.flex();
let c0 = _mm512_set1_epi32(filter[0] as i32);
let c1 = _mm512_set1_epi32(filter[1] as i32);
let c2 = _mm512_set1_epi32(filter[2] as i32);
let c3 = _mm512_set1_epi32(filter[3] as i32);
let c4 = _mm512_set1_epi32(filter[4] as i32);
let c5 = _mm512_set1_epi32(filter[5] as i32);
let c6 = _mm512_set1_epi32(filter[6] as i32);
let c7 = _mm512_set1_epi32(filter[7] as i32);
let rnd = _mm512_set1_epi32((1 << sh) >> 1);
let shift_count = _mm_cvtsi32_si128(sh);
let bias = _mm512_set1_epi32(prep_bias);
let mut col = 0usize;
while col + 16 <= w {
let p0 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col..col + 16]).unwrap()
));
let p1 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col + 1..col + 17]).unwrap()
));
let p2 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col + 2..col + 18]).unwrap()
));
let p3 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col + 3..col + 19]).unwrap()
));
let p4 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col + 4..col + 20]).unwrap()
));
let p5 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col + 5..col + 21]).unwrap()
));
let p6 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col + 6..col + 22]).unwrap()
));
let p7 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col + 7..col + 23]).unwrap()
));
let mut sum = _mm512_mullo_epi32(p0, c0);
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p1, c1));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p2, c2));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p3, c3));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p4, c4));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p5, c5));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p6, c6));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p7, c7));
let shifted = _mm512_sra_epi32(_mm512_add_epi32(sum, rnd), shift_count);
let biased = _mm512_sub_epi32(shifted, bias);
let packed = _mm512_cvtsepi32_epi16(biased);
storeu_256!(
<&mut [i16; 16]>::try_from(&mut dst[col..col + 16]).unwrap(),
packed
);
col += 16;
}
while col < w {
let mut sum = 0i32;
for i in 0..8 {
sum += filter[i] as i32 * src[col + i] as i32;
}
let r = (1 << sh) >> 1;
let val = ((sum + r) >> sh) - prep_bias;
dst[col] = val as i16;
col += 1;
}
}
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_filter_8tap_16bpc_direct_avx512_inner(
_token: Server64,
dst: &mut [u16],
src: &[u16],
src_stride: isize,
w: usize,
filter: &[i8; 8],
max: i32,
) {
let mut dst = dst.flex_mut();
let src = src.flex();
let c0 = _mm512_set1_epi32(filter[0] as i32);
let c1 = _mm512_set1_epi32(filter[1] as i32);
let c2 = _mm512_set1_epi32(filter[2] as i32);
let c3 = _mm512_set1_epi32(filter[3] as i32);
let c4 = _mm512_set1_epi32(filter[4] as i32);
let c5 = _mm512_set1_epi32(filter[5] as i32);
let c6 = _mm512_set1_epi32(filter[6] as i32);
let c7 = _mm512_set1_epi32(filter[7] as i32);
let rnd = _mm512_set1_epi32(32);
let shift_count = _mm_cvtsi32_si128(6);
let zero = _mm512_setzero_si512();
let max_val = _mm512_set1_epi32(max);
let stride_u = src_stride as usize;
let mut col = 0usize;
while col + 16 <= w {
let p0 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col..col + 16]).unwrap()
));
let p1 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[stride_u + col..stride_u + col + 16]).unwrap()
));
let p2 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[2 * stride_u + col..2 * stride_u + col + 16]).unwrap()
));
let p3 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[3 * stride_u + col..3 * stride_u + col + 16]).unwrap()
));
let p4 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[4 * stride_u + col..4 * stride_u + col + 16]).unwrap()
));
let p5 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[5 * stride_u + col..5 * stride_u + col + 16]).unwrap()
));
let p6 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[6 * stride_u + col..6 * stride_u + col + 16]).unwrap()
));
let p7 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[7 * stride_u + col..7 * stride_u + col + 16]).unwrap()
));
let mut sum = _mm512_mullo_epi32(p0, c0);
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p1, c1));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p2, c2));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p3, c3));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p4, c4));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p5, c5));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p6, c6));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p7, c7));
let shifted = _mm512_sra_epi32(_mm512_add_epi32(sum, rnd), shift_count);
let clamped = _mm512_min_epi32(_mm512_max_epi32(shifted, zero), max_val);
let packed = _mm512_cvtusepi32_epi16(clamped);
storeu_256!(
<&mut [u16; 16]>::try_from(&mut dst[col..col + 16]).unwrap(),
packed
);
col += 16;
}
let coeff: [i32; 8] = core::array::from_fn(|i| filter[i] as i32);
while col < w {
let mut sum = 0i32;
for i in 0..8 {
sum += coeff[i] * src[i * stride_u + col] as i32;
}
let val = ((sum + 32) >> 6).clamp(0, max);
dst[col] = val as u16;
col += 1;
}
}
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_filter_8tap_16bpc_prep_direct_avx512_inner(
_token: Server64,
dst: &mut [i16],
src: &[u16],
src_stride: isize,
w: usize,
filter: &[i8; 8],
sh: i32,
prep_bias: i32,
) {
let mut dst = dst.flex_mut();
let src = src.flex();
let c0 = _mm512_set1_epi32(filter[0] as i32);
let c1 = _mm512_set1_epi32(filter[1] as i32);
let c2 = _mm512_set1_epi32(filter[2] as i32);
let c3 = _mm512_set1_epi32(filter[3] as i32);
let c4 = _mm512_set1_epi32(filter[4] as i32);
let c5 = _mm512_set1_epi32(filter[5] as i32);
let c6 = _mm512_set1_epi32(filter[6] as i32);
let c7 = _mm512_set1_epi32(filter[7] as i32);
let rnd = _mm512_set1_epi32((1 << sh) >> 1);
let shift_count = _mm_cvtsi32_si128(sh);
let bias = _mm512_set1_epi32(prep_bias);
let stride_u = src_stride as usize;
let mut col = 0usize;
while col + 16 <= w {
let p0 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[col..col + 16]).unwrap()
));
let p1 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[stride_u + col..stride_u + col + 16]).unwrap()
));
let p2 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[2 * stride_u + col..2 * stride_u + col + 16]).unwrap()
));
let p3 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[3 * stride_u + col..3 * stride_u + col + 16]).unwrap()
));
let p4 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[4 * stride_u + col..4 * stride_u + col + 16]).unwrap()
));
let p5 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[5 * stride_u + col..5 * stride_u + col + 16]).unwrap()
));
let p6 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[6 * stride_u + col..6 * stride_u + col + 16]).unwrap()
));
let p7 = _mm512_cvtepu16_epi32(loadu_256!(
<&[u16; 16]>::try_from(&src[7 * stride_u + col..7 * stride_u + col + 16]).unwrap()
));
let mut sum = _mm512_mullo_epi32(p0, c0);
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p1, c1));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p2, c2));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p3, c3));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p4, c4));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p5, c5));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p6, c6));
sum = _mm512_add_epi32(sum, _mm512_mullo_epi32(p7, c7));
let shifted = _mm512_sra_epi32(_mm512_add_epi32(sum, rnd), shift_count);
let biased = _mm512_sub_epi32(shifted, bias);
let packed = _mm512_cvtsepi32_epi16(biased);
storeu_256!(
<&mut [i16; 16]>::try_from(&mut dst[col..col + 16]).unwrap(),
packed
);
col += 16;
}
let coeff: [i32; 8] = core::array::from_fn(|i| filter[i] as i32);
while col < w {
let mut sum = 0i32;
for i in 0..8 {
sum += coeff[i] * src[i * stride_u + col] as i32;
}
let r = (1 << sh) >> 1;
let val = ((sum + r) >> sh) - prep_bias;
dst[col] = val as i16;
col += 1;
}
}
/// 16bpc 8-tap `put` block filter, AVX-512 path.
///
/// Looks up the horizontal and vertical tap sets with `get_filter_coeff` and
/// dispatches on which of them exist:
///
/// * both: horizontal pass into a thread-local `i32` scratch buffer for
///   `h + 7` rows, then a vertical pass that writes clamped pixels to `dst`;
/// * horizontal only / vertical only: a single direct pass per output row;
/// * neither: a plain row-by-row copy of `w` pixels.
///
/// `src_base` is the element index in `src` of the block's top-left pixel.
/// Both strides are halved before being used as `u16` element offsets
/// (presumably they arrive in bytes — confirm against callers).
#[cfg(target_arch = "x86_64")]
#[arcane]
fn put_8tap_16bpc_avx512_impl_inner(
    _token: Server64,
    dst: &mut [u16],
    dst_stride: isize,
    src: &[u16],
    src_base: usize,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    h_filter_type: Rav1dFilterMode,
    v_filter_type: Rav1dFilterMode,
) {
    let mut dst = dst.flex_mut();
    let src = src.flex();
    let (width, height) = (w as usize, h as usize);
    let dst_pitch = dst_stride / 2;
    let src_pitch = src_stride / 2;
    let origin = src_base as isize;
    let pixel_max = bitdepth_max as i32;
    let intermediate_bits = if (bitdepth_max >> 11) == 0 { 4i32 } else { 2i32 };

    // Element index of the first pixel of source row `row` (relative to the block).
    let src_index = |row: isize| (origin + row * src_pitch) as usize;
    // Element index of the first pixel of destination row `row`.
    let dst_index = |row: usize| (row as isize * dst_pitch) as usize;

    let h_taps = get_filter_coeff(mx as usize, width, h_filter_type);
    let v_taps = get_filter_coeff(my as usize, height, v_filter_type);

    match (h_taps, v_taps) {
        (Some(h_taps), Some(v_taps)) => {
            let h_shift = 6 - intermediate_bits;
            let v_shift = 6 + intermediate_bits;
            let mut mid = take_mid_i32_135();
            // The vertical pass needs 3 rows above and 4 rows below the block.
            for row in 0..height + 7 {
                let first = src_index(row as isize - 3);
                h_filter_8tap_16bpc_avx512_inner(
                    _token,
                    &mut mid[row],
                    &src[first - 3..],
                    width,
                    h_taps,
                    h_shift,
                );
            }
            for row in 0..height {
                let out = &mut dst[dst_index(row)..];
                v_filter_8tap_16bpc_avx512_inner(
                    _token, out, &*mid, width, row, v_taps, v_shift, pixel_max,
                );
            }
            // Hand the scratch buffer back for reuse.
            put_mid_i32_135(mid);
        }
        (Some(h_taps), None) => {
            for row in 0..height {
                let first = src_index(row as isize);
                let out = &mut dst[dst_index(row)..];
                h_filter_8tap_16bpc_put_avx512_inner(
                    _token,
                    out,
                    &src[first - 3..],
                    width,
                    h_taps,
                    pixel_max,
                );
            }
        }
        (None, Some(v_taps)) => {
            for row in 0..height {
                let first = src_index(row as isize - 3);
                let out = &mut dst[dst_index(row)..];
                v_filter_8tap_16bpc_direct_avx512_inner(
                    _token,
                    out,
                    &src[first..],
                    src_pitch,
                    width,
                    v_taps,
                    pixel_max,
                );
            }
        }
        (None, None) => {
            for row in 0..height {
                let from = &src[src_index(row as isize)..];
                let to = &mut dst[dst_index(row)..];
                to[..width].copy_from_slice(&from[..width]);
            }
        }
    }
}
/// 16bpc 8-tap `prep` block filter, AVX-512 path.
///
/// Looks up the horizontal and vertical tap sets with `get_filter_coeff` and
/// dispatches on which of them exist. Output row `y` is written to
/// `tmp[y * w..]`, and every result has a bias of 8192 subtracted:
///
/// * both: horizontal pass into a thread-local `i32` scratch buffer for
///   `h + 7` rows, then a vertical pass with shift 6;
/// * horizontal only / vertical only: one direct pass per row with shift
///   `6 - intermediate_bits`;
/// * neither: `(pixel << intermediate_bits) - 8192`.
///
/// `src_base` is the element index in `src` of the block's top-left pixel.
/// `src_stride` is halved before being used as a `u16` element offset
/// (presumably it arrives in bytes — confirm against callers).
#[cfg(target_arch = "x86_64")]
#[arcane]
fn prep_8tap_16bpc_avx512_impl_inner(
    _token: Server64,
    tmp: &mut [i16],
    src: &[u16],
    src_base: usize,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    h_filter_type: Rav1dFilterMode,
    v_filter_type: Rav1dFilterMode,
) {
    const PREP_BIAS: i32 = 8192;

    let mut tmp = tmp.flex_mut();
    let src = src.flex();
    let (width, height) = (w as usize, h as usize);
    let src_pitch = src_stride / 2;
    let origin = src_base as isize;
    let intermediate_bits = if (bitdepth_max >> 11) == 0 { 4i32 } else { 2i32 };

    // Element index of the first pixel of source row `row` (relative to the block).
    let src_index = |row: isize| (origin + row * src_pitch) as usize;

    let h_taps = get_filter_coeff(mx as usize, width, h_filter_type);
    let v_taps = get_filter_coeff(my as usize, height, v_filter_type);

    match (h_taps, v_taps) {
        (Some(h_taps), Some(v_taps)) => {
            let h_shift = 6 - intermediate_bits;
            let v_shift = 6;
            let mut mid = take_mid_i32_135();
            // The vertical pass needs 3 rows above and 4 rows below the block.
            for row in 0..height + 7 {
                let first = src_index(row as isize - 3);
                h_filter_8tap_16bpc_avx512_inner(
                    _token,
                    &mut mid[row],
                    &src[first - 3..],
                    width,
                    h_taps,
                    h_shift,
                );
            }
            for row in 0..height {
                let out_start = row * width;
                v_filter_8tap_16bpc_prep_avx512_inner(
                    _token,
                    &mut tmp[out_start..],
                    &*mid,
                    width,
                    row,
                    v_taps,
                    v_shift,
                    PREP_BIAS,
                );
            }
            // Hand the scratch buffer back for reuse.
            put_mid_i32_135(mid);
        }
        (Some(h_taps), None) => {
            let shift = 6 - intermediate_bits;
            for row in 0..height {
                let first = src_index(row as isize);
                let out_start = row * width;
                h_filter_8tap_16bpc_prep_direct_avx512_inner(
                    _token,
                    &mut tmp[out_start..],
                    &src[first - 3..],
                    width,
                    h_taps,
                    shift,
                    PREP_BIAS,
                );
            }
        }
        (None, Some(v_taps)) => {
            let shift = 6 - intermediate_bits;
            for row in 0..height {
                let first = src_index(row as isize - 3);
                let out_start = row * width;
                v_filter_8tap_16bpc_prep_direct_avx512_inner(
                    _token,
                    &mut tmp[out_start..],
                    &src[first..],
                    src_pitch,
                    width,
                    v_taps,
                    shift,
                    PREP_BIAS,
                );
            }
        }
        (None, None) => {
            for row in 0..height {
                let from = &src[src_index(row as isize)..];
                let out_start = row * width;
                for col in 0..width {
                    let scaled = ((from[col] as i32) << intermediate_bits) - PREP_BIAS;
                    tmp[out_start + col] = scaled as i16;
                }
            }
        }
    }
}
/// 16bpc 8-tap `put` block filter, AVX2 path.
///
/// Looks up the horizontal and vertical tap sets with `get_filter_coeff` and
/// dispatches on which of them exist:
///
/// * both: horizontal pass into a thread-local `i32` scratch buffer for
///   `h + 7` rows, then a vertical pass that writes clamped pixels to `dst`;
/// * horizontal only / vertical only: a single direct pass per output row;
/// * neither: a plain row-by-row copy of `w` pixels.
///
/// Each filtered case has a vector implementation and a scalar reference
/// loop, selected by the `USE_SIMD_*` compile-time switches below.
///
/// `src_base` is the element index in `src` of the block's top-left pixel.
/// Both strides are halved before being used as `u16` element offsets
/// (presumably they arrive in bytes — confirm against callers).
#[cfg(target_arch = "x86_64")]
#[rite]
fn put_8tap_16bpc_avx2_impl_inner(
    _token: Desktop64,
    dst: &mut [u16],
    dst_stride: isize,
    src: &[u16],
    src_base: usize,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    h_filter_type: Rav1dFilterMode,
    v_filter_type: Rav1dFilterMode,
) {
    let mut dst = dst.flex_mut();
    let src = src.flex();
    let w = w as usize;
    let h = h as usize;
    let mx = mx as usize;
    let my = my as usize;
    let dst_stride_elems = dst_stride / 2;
    let src_stride_elems = src_stride / 2;
    let sb = src_base as isize;
    let max = bitdepth_max as i32;
    let intermediate_bits = if (bitdepth_max >> 11) != 0 {
        2i32
    } else {
        4i32
    };
    let fh = get_filter_coeff(mx, w, h_filter_type);
    let fv = get_filter_coeff(my, h, v_filter_type);

    // Compile-time switches between the vector kernels and the scalar
    // reference loops.
    const USE_SIMD_HV: bool = true;
    const USE_SIMD_H: bool = true;
    const USE_SIMD_V: bool = true;

    match (fh, fv) {
        (Some(fh), Some(fv)) => {
            // The vertical pass needs 3 rows above and 4 rows below the block.
            let tmp_h = h + 7;
            let mut mid = take_mid_i32_135();
            let h_sh = 6 - intermediate_bits;
            let v_sh = 6 + intermediate_bits;
            if USE_SIMD_HV {
                for y in 0..tmp_h {
                    let src_off = (sb + (y as isize - 3) * src_stride_elems) as usize;
                    h_filter_8tap_16bpc_avx2_inner(
                        _token,
                        &mut mid[y],
                        &src[src_off - 3..],
                        w,
                        fh,
                        h_sh,
                    );
                }
                for y in 0..h {
                    let dst_row = &mut dst[(y as isize * dst_stride_elems) as usize..];
                    v_filter_8tap_16bpc_avx2_inner(_token, dst_row, &*mid, w, y, fv, v_sh, max);
                }
            } else {
                let h_rnd = (1 << h_sh) >> 1;
                for y in 0..tmp_h {
                    let src_off = (sb + (y as isize - 3) * src_stride_elems) as usize;
                    for x in 0..w {
                        let mut sum = 0i32;
                        for k in 0..8 {
                            // Tap k reads the pixel at horizontal offset k - 3.
                            sum += src[src_off - 3 + x + k] as i32 * fh[k] as i32;
                        }
                        mid[y][x] = (sum + h_rnd) >> h_sh;
                    }
                }
                let v_rnd = (1 << v_sh) >> 1;
                for y in 0..h {
                    for x in 0..w {
                        let mut sum = 0i32;
                        for k in 0..8 {
                            sum += mid[y + k][x] * fv[k] as i32;
                        }
                        let val = ((sum + v_rnd) >> v_sh).clamp(0, max);
                        dst[(y as isize * dst_stride_elems) as usize + x] = val as u16;
                    }
                }
            }
            // Hand the scratch buffer back for reuse.
            put_mid_i32_135(mid);
        }
        (Some(fh), None) => {
            if USE_SIMD_H {
                for y in 0..h {
                    let src_off = (sb + y as isize * src_stride_elems) as usize;
                    let dst_row = &mut dst[(y as isize * dst_stride_elems) as usize..];
                    h_filter_8tap_16bpc_put_avx2_inner(
                        _token,
                        dst_row,
                        &src[src_off - 3..],
                        w,
                        fh,
                        max,
                    );
                }
            } else {
                let intermediate_rnd = 32 + ((1 << (6 - intermediate_bits)) >> 1);
                for y in 0..h {
                    let src_off = (sb + y as isize * src_stride_elems) as usize;
                    for x in 0..w {
                        let mut sum = 0i32;
                        for k in 0..8 {
                            // Tap k reads the pixel at horizontal offset k - 3,
                            // the same window the vector branch gets through
                            // `&src[src_off - 3..]`. Without the `- 3` the
                            // output would be shifted three pixels.
                            sum += src[src_off - 3 + x + k] as i32 * fh[k] as i32;
                        }
                        let val = ((sum + intermediate_rnd) >> 6).clamp(0, max);
                        dst[(y as isize * dst_stride_elems) as usize + x] = val as u16;
                    }
                }
            }
        }
        (None, Some(fv)) => {
            if USE_SIMD_V {
                for y in 0..h {
                    let src_off = (sb + (y as isize - 3) * src_stride_elems) as usize;
                    let dst_row = &mut dst[(y as isize * dst_stride_elems) as usize..];
                    v_filter_8tap_16bpc_direct_avx2_inner(
                        _token,
                        dst_row,
                        &src[src_off..],
                        src_stride_elems,
                        w,
                        fv,
                        max,
                    );
                }
            } else {
                for y in 0..h {
                    for x in 0..w {
                        let mut sum = 0i32;
                        for k in 0..8 {
                            // Tap k reads the row at vertical offset k - 3.
                            let src_off =
                                (sb + (y as isize + k as isize - 3) * src_stride_elems) as usize;
                            sum += src[src_off + x] as i32 * fv[k] as i32;
                        }
                        let val = ((sum + 32) >> 6).clamp(0, max);
                        dst[(y as isize * dst_stride_elems) as usize + x] = val as u16;
                    }
                }
            }
        }
        (None, None) => {
            for y in 0..h {
                let src_row = &src[(sb + y as isize * src_stride_elems) as usize..];
                let dst_row = &mut dst[(y as isize * dst_stride_elems) as usize..];
                dst_row[..w].copy_from_slice(&src_row[..w]);
            }
        }
    }
}
/// Raw-pointer entry point for the 16bpc 8-tap `put` block filter (AVX2).
///
/// Only compiled when the `asm` feature is enabled. The body forges a
/// `Desktop64` token and forwards its arguments to
/// `put_8tap_16bpc_avx2_impl_inner`, passing `0` as the `src_base` argument.
///
/// NOTE(review): the inner function in this file is declared with
/// `dst: &mut [u16]` and `src: &[u16]`, while this wrapper hands it
/// `*mut DynPixel` / `*const DynPixel` — verify that this still compiles with
/// the `asm` feature enabled.
///
/// # Safety
///
/// NOTE(review): the token is forged with no visible runtime CPU check, so the
/// caller presumably has to guarantee AVX2 support and valid pointers — confirm.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn put_8tap_16bpc_avx2_impl(
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
h_filter_type: Rav1dFilterMode,
v_filter_type: Rav1dFilterMode,
) {
// Forged rather than detected; see the `# Safety` note above.
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe {
put_8tap_16bpc_avx2_impl_inner(
token,
dst_ptr,
dst_stride,
src_ptr,
// src_base
0,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
h_filter_type,
v_filter_type,
)
}
}
/// 16bpc 8-tap `prep` block filter, AVX2 path.
///
/// Looks up the horizontal and vertical tap sets with `get_filter_coeff` and
/// dispatches on which of them exist. Output row `y` is written to
/// `tmp[y * w..]`, and every result has a bias of 8192 subtracted:
///
/// * both: horizontal pass into a thread-local `i32` scratch buffer for
///   `h + 7` rows, then a vertical pass with shift 6;
/// * horizontal only / vertical only: one direct pass per row with shift
///   `6 - intermediate_bits`;
/// * neither: `(pixel << intermediate_bits) - 8192`.
///
/// `src_base` is the element index in `src` of the block's top-left pixel.
/// `src_stride` is halved before being used as a `u16` element offset
/// (presumably it arrives in bytes — confirm against callers).
#[cfg(target_arch = "x86_64")]
#[rite]
fn prep_8tap_16bpc_avx2_impl_inner(
    _token: Desktop64,
    tmp: &mut [i16],
    src: &[u16],
    src_base: usize,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    h_filter_type: Rav1dFilterMode,
    v_filter_type: Rav1dFilterMode,
) {
    const PREP_BIAS: i32 = 8192;

    let mut tmp = tmp.flex_mut();
    let src = src.flex();
    let (width, height) = (w as usize, h as usize);
    let src_pitch = src_stride / 2;
    let origin = src_base as isize;
    let intermediate_bits = if (bitdepth_max >> 11) == 0 { 4i32 } else { 2i32 };

    // Element index of the first pixel of source row `row` (relative to the block).
    let src_index = |row: isize| (origin + row * src_pitch) as usize;

    let h_taps = get_filter_coeff(mx as usize, width, h_filter_type);
    let v_taps = get_filter_coeff(my as usize, height, v_filter_type);

    match (h_taps, v_taps) {
        (Some(h_taps), Some(v_taps)) => {
            let h_shift = 6 - intermediate_bits;
            let v_shift = 6;
            let mut mid = take_mid_i32_135();
            // The vertical pass needs 3 rows above and 4 rows below the block.
            for row in 0..height + 7 {
                let first = src_index(row as isize - 3);
                h_filter_8tap_16bpc_avx2_inner(
                    _token,
                    &mut mid[row],
                    &src[first - 3..],
                    width,
                    h_taps,
                    h_shift,
                );
            }
            for row in 0..height {
                let out_start = row * width;
                v_filter_8tap_16bpc_prep_avx2_inner(
                    _token,
                    &mut tmp[out_start..],
                    &*mid,
                    width,
                    row,
                    v_taps,
                    v_shift,
                    PREP_BIAS,
                );
            }
            // Hand the scratch buffer back for reuse.
            put_mid_i32_135(mid);
        }
        (Some(h_taps), None) => {
            let shift = 6 - intermediate_bits;
            for row in 0..height {
                let first = src_index(row as isize);
                let out_start = row * width;
                h_filter_8tap_16bpc_prep_direct_avx2_inner(
                    _token,
                    &mut tmp[out_start..],
                    &src[first - 3..],
                    width,
                    h_taps,
                    shift,
                    PREP_BIAS,
                );
            }
        }
        (None, Some(v_taps)) => {
            let shift = 6 - intermediate_bits;
            for row in 0..height {
                let first = src_index(row as isize - 3);
                let out_start = row * width;
                v_filter_8tap_16bpc_prep_direct_avx2_inner(
                    _token,
                    &mut tmp[out_start..],
                    &src[first..],
                    src_pitch,
                    width,
                    v_taps,
                    shift,
                    PREP_BIAS,
                );
            }
        }
        (None, None) => {
            for row in 0..height {
                let from = &src[src_index(row as isize)..];
                let out_start = row * width;
                for col in 0..width {
                    let scaled = ((from[col] as i32) << intermediate_bits) - PREP_BIAS;
                    tmp[out_start + col] = scaled as i16;
                }
            }
        }
    }
}
/// Raw-pointer entry point for the 16bpc 8-tap `prep` block filter (AVX2).
///
/// Only compiled when the `asm` feature is enabled. The body forges a
/// `Desktop64` token and forwards its arguments to
/// `prep_8tap_16bpc_avx2_impl_inner`, passing `0` as the `src_base` argument.
///
/// NOTE(review): the inner function in this file is declared with
/// `tmp: &mut [i16]` and `src: &[u16]`, while this wrapper hands it
/// `*mut i16` / `*const DynPixel` — verify that this still compiles with the
/// `asm` feature enabled.
///
/// # Safety
///
/// NOTE(review): the token is forged with no visible runtime CPU check, so the
/// caller presumably has to guarantee AVX2 support and valid pointers — confirm.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn prep_8tap_16bpc_avx2_impl(
tmp: *mut i16,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
h_filter_type: Rav1dFilterMode,
v_filter_type: Rav1dFilterMode,
) {
// Forged rather than detected; see the `# Safety` note above.
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe {
prep_8tap_16bpc_avx2_impl_inner(
token,
tmp,
src_ptr,
// src_base
0,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
h_filter_type,
v_filter_type,
)
}
}
/// C-ABI entry point for the 16bpc 8-tap `put` filter, generic over the 2-D filter kind.
///
/// `FILTER` is converted with `Filter2d::from_repr`, split into its horizontal
/// and vertical filter modes with `hv()`, and the call is forwarded to
/// `put_8tap_16bpc_avx2_impl`. The trailing `_dst` / `_src` parameters are not
/// used by this function.
///
/// # Panics
///
/// Panics if `Filter2d::from_repr(FILTER)` does not yield a filter (the result
/// is unwrapped).
///
/// # Safety
///
/// NOTE(review): pointer and CPU-feature requirements are those of
/// `put_8tap_16bpc_avx2_impl`; nothing is validated here — confirm with callers.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn put_8tap_16bpc_avx2<const FILTER: usize>(
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
_src: *const FFISafe<PicOffset>,
) {
let filter = Filter2d::from_repr(FILTER).unwrap();
// Separate filter modes for the horizontal and vertical passes.
let (h_filter, v_filter) = filter.hv();
unsafe {
put_8tap_16bpc_avx2_impl(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
h_filter,
v_filter,
);
}
}
/// Generic C-ABI entry point for the 8-tap `prep` path, selected by `FILTER`.
///
/// `FILTER` is converted with `Filter2d::from_repr`, split into its horizontal
/// and vertical filter modes via `hv()`, and everything is handed to
/// `prep_8tap_16bpc_avx2_impl`. The `_src` argument is not used.
///
/// # Panics
/// Panics if `Filter2d::from_repr(FILTER)` returns `None`.
///
/// # Safety
/// The callee is `unsafe` and receives the raw pointers unchanged; the caller
/// must satisfy whatever `prep_8tap_16bpc_avx2_impl` requires of them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn prep_8tap_16bpc_avx2<const FILTER: usize>(
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    _src: *const FFISafe<PicOffset>,
) {
    // Decode the compile-time filter id straight into its (horizontal, vertical) pair.
    let (h_filter, v_filter) = Filter2d::from_repr(FILTER).unwrap().hv();
    unsafe {
        prep_8tap_16bpc_avx2_impl(
            tmp,
            src_ptr,
            src_stride,
            w,
            h,
            mx,
            my,
            bitdepth_max,
            h_filter,
            v_filter,
        );
    }
}
/// Token-taking shim: calls the generic `put_8tap_16bpc_avx2` with its `FILTER`
/// const parameter set to `Filter2d::Regular8Tap as usize`.
///
/// All arguments after `_token` are passed through unchanged; the body written
/// here never reads `_token`.
///
/// # Safety
/// The callee is an `unsafe fn` taking raw pointers whose requirements are not
/// visible in this part of the file; callers must satisfy them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn put_8tap_regular_16bpc_avx2_inner(
_token: Desktop64,
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
unsafe {
put_8tap_16bpc_avx2::<{ Filter2d::Regular8Tap as usize }>(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// C-ABI wrapper around `put_8tap_regular_16bpc_avx2_inner`.
///
/// Forges a `Desktop64` token and passes it, followed by every argument in its
/// original order, to the inner function.
///
/// # Safety
/// NOTE(review): presumably the caller must guarantee that the CPU supports what
/// a `Desktop64` token stands for and that the pointers are valid for the inner
/// call — the exact contract is not visible here; confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn put_8tap_regular_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    dst: *const FFISafe<PicOffset>,
    src: *const FFISafe<PicOffset>,
) {
    // A single `unsafe` region covers forging the token and the forwarded call.
    unsafe {
        put_8tap_regular_16bpc_avx2_inner(
            Desktop64::forge_token_dangerously(),
            dst_ptr,
            dst_stride,
            src_ptr,
            src_stride,
            w,
            h,
            mx,
            my,
            bitdepth_max,
            dst,
            src,
        )
    }
}
/// Token-taking shim: calls the generic `put_8tap_16bpc_avx2` with its `FILTER`
/// const parameter set to `Filter2d::RegularSmooth8Tap as usize`.
///
/// All arguments after `_token` are passed through unchanged; the body written
/// here never reads `_token`.
///
/// # Safety
/// The callee is an `unsafe fn` taking raw pointers whose requirements are not
/// visible in this part of the file; callers must satisfy them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn put_8tap_regular_smooth_16bpc_avx2_inner(
_token: Desktop64,
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
unsafe {
put_8tap_16bpc_avx2::<{ Filter2d::RegularSmooth8Tap as usize }>(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// C-ABI wrapper around `put_8tap_regular_smooth_16bpc_avx2_inner`.
///
/// Forges a `Desktop64` token and passes it, followed by every argument in its
/// original order, to the inner function.
///
/// # Safety
/// NOTE(review): presumably the caller must guarantee that the CPU supports what
/// a `Desktop64` token stands for and that the pointers are valid for the inner
/// call — the exact contract is not visible here; confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn put_8tap_regular_smooth_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    dst: *const FFISafe<PicOffset>,
    src: *const FFISafe<PicOffset>,
) {
    // A single `unsafe` region covers forging the token and the forwarded call.
    unsafe {
        put_8tap_regular_smooth_16bpc_avx2_inner(
            Desktop64::forge_token_dangerously(),
            dst_ptr,
            dst_stride,
            src_ptr,
            src_stride,
            w,
            h,
            mx,
            my,
            bitdepth_max,
            dst,
            src,
        )
    }
}
/// Token-taking shim: calls the generic `put_8tap_16bpc_avx2` with its `FILTER`
/// const parameter set to `Filter2d::RegularSharp8Tap as usize`.
///
/// All arguments after `_token` are passed through unchanged; the body written
/// here never reads `_token`.
///
/// # Safety
/// The callee is an `unsafe fn` taking raw pointers whose requirements are not
/// visible in this part of the file; callers must satisfy them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn put_8tap_regular_sharp_16bpc_avx2_inner(
_token: Desktop64,
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
unsafe {
put_8tap_16bpc_avx2::<{ Filter2d::RegularSharp8Tap as usize }>(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// C-ABI wrapper around `put_8tap_regular_sharp_16bpc_avx2_inner`.
///
/// Forges a `Desktop64` token and passes it, followed by every argument in its
/// original order, to the inner function.
///
/// # Safety
/// NOTE(review): presumably the caller must guarantee that the CPU supports what
/// a `Desktop64` token stands for and that the pointers are valid for the inner
/// call — the exact contract is not visible here; confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn put_8tap_regular_sharp_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    dst: *const FFISafe<PicOffset>,
    src: *const FFISafe<PicOffset>,
) {
    // A single `unsafe` region covers forging the token and the forwarded call.
    unsafe {
        put_8tap_regular_sharp_16bpc_avx2_inner(
            Desktop64::forge_token_dangerously(),
            dst_ptr,
            dst_stride,
            src_ptr,
            src_stride,
            w,
            h,
            mx,
            my,
            bitdepth_max,
            dst,
            src,
        )
    }
}
/// Token-taking shim: calls the generic `put_8tap_16bpc_avx2` with its `FILTER`
/// const parameter set to `Filter2d::SmoothRegular8Tap as usize`.
///
/// All arguments after `_token` are passed through unchanged; the body written
/// here never reads `_token`.
///
/// # Safety
/// The callee is an `unsafe fn` taking raw pointers whose requirements are not
/// visible in this part of the file; callers must satisfy them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn put_8tap_smooth_regular_16bpc_avx2_inner(
_token: Desktop64,
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
unsafe {
put_8tap_16bpc_avx2::<{ Filter2d::SmoothRegular8Tap as usize }>(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// C-ABI wrapper around `put_8tap_smooth_regular_16bpc_avx2_inner`.
///
/// Forges a `Desktop64` token and passes it, followed by every argument in its
/// original order, to the inner function.
///
/// # Safety
/// NOTE(review): presumably the caller must guarantee that the CPU supports what
/// a `Desktop64` token stands for and that the pointers are valid for the inner
/// call — the exact contract is not visible here; confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn put_8tap_smooth_regular_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    dst: *const FFISafe<PicOffset>,
    src: *const FFISafe<PicOffset>,
) {
    // A single `unsafe` region covers forging the token and the forwarded call.
    unsafe {
        put_8tap_smooth_regular_16bpc_avx2_inner(
            Desktop64::forge_token_dangerously(),
            dst_ptr,
            dst_stride,
            src_ptr,
            src_stride,
            w,
            h,
            mx,
            my,
            bitdepth_max,
            dst,
            src,
        )
    }
}
/// Token-taking shim: calls the generic `put_8tap_16bpc_avx2` with its `FILTER`
/// const parameter set to `Filter2d::Smooth8Tap as usize`.
///
/// All arguments after `_token` are passed through unchanged; the body written
/// here never reads `_token`.
///
/// # Safety
/// The callee is an `unsafe fn` taking raw pointers whose requirements are not
/// visible in this part of the file; callers must satisfy them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn put_8tap_smooth_16bpc_avx2_inner(
_token: Desktop64,
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
unsafe {
put_8tap_16bpc_avx2::<{ Filter2d::Smooth8Tap as usize }>(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// C-ABI wrapper around `put_8tap_smooth_16bpc_avx2_inner`.
///
/// Forges a `Desktop64` token and passes it, followed by every argument in its
/// original order, to the inner function.
///
/// # Safety
/// NOTE(review): presumably the caller must guarantee that the CPU supports what
/// a `Desktop64` token stands for and that the pointers are valid for the inner
/// call — the exact contract is not visible here; confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn put_8tap_smooth_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    dst: *const FFISafe<PicOffset>,
    src: *const FFISafe<PicOffset>,
) {
    // A single `unsafe` region covers forging the token and the forwarded call.
    unsafe {
        put_8tap_smooth_16bpc_avx2_inner(
            Desktop64::forge_token_dangerously(),
            dst_ptr,
            dst_stride,
            src_ptr,
            src_stride,
            w,
            h,
            mx,
            my,
            bitdepth_max,
            dst,
            src,
        )
    }
}
/// Token-taking shim: calls the generic `put_8tap_16bpc_avx2` with its `FILTER`
/// const parameter set to `Filter2d::SmoothSharp8Tap as usize`.
///
/// All arguments after `_token` are passed through unchanged; the body written
/// here never reads `_token`.
///
/// # Safety
/// The callee is an `unsafe fn` taking raw pointers whose requirements are not
/// visible in this part of the file; callers must satisfy them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn put_8tap_smooth_sharp_16bpc_avx2_inner(
_token: Desktop64,
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
unsafe {
put_8tap_16bpc_avx2::<{ Filter2d::SmoothSharp8Tap as usize }>(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// C-ABI wrapper around `put_8tap_smooth_sharp_16bpc_avx2_inner`.
///
/// Forges a `Desktop64` token and passes it, followed by every argument in its
/// original order, to the inner function.
///
/// # Safety
/// NOTE(review): presumably the caller must guarantee that the CPU supports what
/// a `Desktop64` token stands for and that the pointers are valid for the inner
/// call — the exact contract is not visible here; confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn put_8tap_smooth_sharp_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    dst: *const FFISafe<PicOffset>,
    src: *const FFISafe<PicOffset>,
) {
    // A single `unsafe` region covers forging the token and the forwarded call.
    unsafe {
        put_8tap_smooth_sharp_16bpc_avx2_inner(
            Desktop64::forge_token_dangerously(),
            dst_ptr,
            dst_stride,
            src_ptr,
            src_stride,
            w,
            h,
            mx,
            my,
            bitdepth_max,
            dst,
            src,
        )
    }
}
/// Token-taking shim: calls the generic `put_8tap_16bpc_avx2` with its `FILTER`
/// const parameter set to `Filter2d::SharpRegular8Tap as usize`.
///
/// All arguments after `_token` are passed through unchanged; the body written
/// here never reads `_token`.
///
/// # Safety
/// The callee is an `unsafe fn` taking raw pointers whose requirements are not
/// visible in this part of the file; callers must satisfy them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn put_8tap_sharp_regular_16bpc_avx2_inner(
_token: Desktop64,
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
unsafe {
put_8tap_16bpc_avx2::<{ Filter2d::SharpRegular8Tap as usize }>(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// C-ABI wrapper around `put_8tap_sharp_regular_16bpc_avx2_inner`.
///
/// Forges a `Desktop64` token and passes it, followed by every argument in its
/// original order, to the inner function.
///
/// # Safety
/// NOTE(review): presumably the caller must guarantee that the CPU supports what
/// a `Desktop64` token stands for and that the pointers are valid for the inner
/// call — the exact contract is not visible here; confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn put_8tap_sharp_regular_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    dst: *const FFISafe<PicOffset>,
    src: *const FFISafe<PicOffset>,
) {
    // A single `unsafe` region covers forging the token and the forwarded call.
    unsafe {
        put_8tap_sharp_regular_16bpc_avx2_inner(
            Desktop64::forge_token_dangerously(),
            dst_ptr,
            dst_stride,
            src_ptr,
            src_stride,
            w,
            h,
            mx,
            my,
            bitdepth_max,
            dst,
            src,
        )
    }
}
/// Token-taking shim: calls the generic `put_8tap_16bpc_avx2` with its `FILTER`
/// const parameter set to `Filter2d::SharpSmooth8Tap as usize`.
///
/// All arguments after `_token` are passed through unchanged; the body written
/// here never reads `_token`.
///
/// # Safety
/// The callee is an `unsafe fn` taking raw pointers whose requirements are not
/// visible in this part of the file; callers must satisfy them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn put_8tap_sharp_smooth_16bpc_avx2_inner(
_token: Desktop64,
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
unsafe {
put_8tap_16bpc_avx2::<{ Filter2d::SharpSmooth8Tap as usize }>(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// C-ABI wrapper around `put_8tap_sharp_smooth_16bpc_avx2_inner`.
///
/// Forges a `Desktop64` token and passes it, followed by every argument in its
/// original order, to the inner function.
///
/// # Safety
/// NOTE(review): presumably the caller must guarantee that the CPU supports what
/// a `Desktop64` token stands for and that the pointers are valid for the inner
/// call — the exact contract is not visible here; confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn put_8tap_sharp_smooth_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    dst: *const FFISafe<PicOffset>,
    src: *const FFISafe<PicOffset>,
) {
    // A single `unsafe` region covers forging the token and the forwarded call.
    unsafe {
        put_8tap_sharp_smooth_16bpc_avx2_inner(
            Desktop64::forge_token_dangerously(),
            dst_ptr,
            dst_stride,
            src_ptr,
            src_stride,
            w,
            h,
            mx,
            my,
            bitdepth_max,
            dst,
            src,
        )
    }
}
/// Token-taking shim: calls the generic `put_8tap_16bpc_avx2` with its `FILTER`
/// const parameter set to `Filter2d::Sharp8Tap as usize`.
///
/// All arguments after `_token` are passed through unchanged; the body written
/// here never reads `_token`.
///
/// # Safety
/// The callee is an `unsafe fn` taking raw pointers whose requirements are not
/// visible in this part of the file; callers must satisfy them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn put_8tap_sharp_16bpc_avx2_inner(
_token: Desktop64,
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
dst: *const FFISafe<PicOffset>,
src: *const FFISafe<PicOffset>,
) {
unsafe {
put_8tap_16bpc_avx2::<{ Filter2d::Sharp8Tap as usize }>(
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
dst,
src,
)
}
}
/// C-ABI wrapper around `put_8tap_sharp_16bpc_avx2_inner`.
///
/// Forges a `Desktop64` token and passes it, followed by every argument in its
/// original order, to the inner function.
///
/// # Safety
/// NOTE(review): presumably the caller must guarantee that the CPU supports what
/// a `Desktop64` token stands for and that the pointers are valid for the inner
/// call — the exact contract is not visible here; confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn put_8tap_sharp_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    dst: *const FFISafe<PicOffset>,
    src: *const FFISafe<PicOffset>,
) {
    // A single `unsafe` region covers forging the token and the forwarded call.
    unsafe {
        put_8tap_sharp_16bpc_avx2_inner(
            Desktop64::forge_token_dangerously(),
            dst_ptr,
            dst_stride,
            src_ptr,
            src_stride,
            w,
            h,
            mx,
            my,
            bitdepth_max,
            dst,
            src,
        )
    }
}
/// Token-taking shim: calls the generic `prep_8tap_16bpc_avx2` with its `FILTER`
/// const parameter set to `Filter2d::Regular8Tap as usize`.
///
/// All arguments after `_token` are passed through unchanged; the body written
/// here never reads `_token`.
///
/// # Safety
/// The callee is an `unsafe fn` taking raw pointers whose requirements are not
/// visible in this part of the file; callers must satisfy them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn prep_8tap_regular_16bpc_avx2_inner(
_token: Desktop64,
tmp: *mut i16,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
src: *const FFISafe<PicOffset>,
) {
unsafe {
prep_8tap_16bpc_avx2::<{ Filter2d::Regular8Tap as usize }>(
tmp,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
src,
)
}
}
/// C-ABI wrapper around `prep_8tap_regular_16bpc_avx2_inner`.
///
/// Forges a `Desktop64` token and passes it, followed by every argument in its
/// original order, to the inner function.
///
/// # Safety
/// NOTE(review): presumably the caller must guarantee that the CPU supports what
/// a `Desktop64` token stands for and that the pointers are valid for the inner
/// call — the exact contract is not visible here; confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn prep_8tap_regular_16bpc_avx2(
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    // A single `unsafe` region covers forging the token and the forwarded call.
    unsafe {
        prep_8tap_regular_16bpc_avx2_inner(
            Desktop64::forge_token_dangerously(),
            tmp,
            src_ptr,
            src_stride,
            w,
            h,
            mx,
            my,
            bitdepth_max,
            src,
        )
    }
}
/// Token-taking shim: calls the generic `prep_8tap_16bpc_avx2` with its `FILTER`
/// const parameter set to `Filter2d::RegularSmooth8Tap as usize`.
///
/// All arguments after `_token` are passed through unchanged; the body written
/// here never reads `_token`.
///
/// # Safety
/// The callee is an `unsafe fn` taking raw pointers whose requirements are not
/// visible in this part of the file; callers must satisfy them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn prep_8tap_regular_smooth_16bpc_avx2_inner(
_token: Desktop64,
tmp: *mut i16,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
src: *const FFISafe<PicOffset>,
) {
unsafe {
prep_8tap_16bpc_avx2::<{ Filter2d::RegularSmooth8Tap as usize }>(
tmp,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
src,
)
}
}
/// C-ABI wrapper around `prep_8tap_regular_smooth_16bpc_avx2_inner`.
///
/// Forges a `Desktop64` token and passes it, followed by every argument in its
/// original order, to the inner function.
///
/// # Safety
/// NOTE(review): presumably the caller must guarantee that the CPU supports what
/// a `Desktop64` token stands for and that the pointers are valid for the inner
/// call — the exact contract is not visible here; confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn prep_8tap_regular_smooth_16bpc_avx2(
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    // A single `unsafe` region covers forging the token and the forwarded call.
    unsafe {
        prep_8tap_regular_smooth_16bpc_avx2_inner(
            Desktop64::forge_token_dangerously(),
            tmp,
            src_ptr,
            src_stride,
            w,
            h,
            mx,
            my,
            bitdepth_max,
            src,
        )
    }
}
/// Token-taking shim: calls the generic `prep_8tap_16bpc_avx2` with its `FILTER`
/// const parameter set to `Filter2d::RegularSharp8Tap as usize`.
///
/// All arguments after `_token` are passed through unchanged; the body written
/// here never reads `_token`.
///
/// # Safety
/// The callee is an `unsafe fn` taking raw pointers whose requirements are not
/// visible in this part of the file; callers must satisfy them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn prep_8tap_regular_sharp_16bpc_avx2_inner(
_token: Desktop64,
tmp: *mut i16,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
src: *const FFISafe<PicOffset>,
) {
unsafe {
prep_8tap_16bpc_avx2::<{ Filter2d::RegularSharp8Tap as usize }>(
tmp,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
src,
)
}
}
/// C-ABI wrapper around `prep_8tap_regular_sharp_16bpc_avx2_inner`.
///
/// Forges a `Desktop64` token and passes it, followed by every argument in its
/// original order, to the inner function.
///
/// # Safety
/// NOTE(review): presumably the caller must guarantee that the CPU supports what
/// a `Desktop64` token stands for and that the pointers are valid for the inner
/// call — the exact contract is not visible here; confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn prep_8tap_regular_sharp_16bpc_avx2(
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    // A single `unsafe` region covers forging the token and the forwarded call.
    unsafe {
        prep_8tap_regular_sharp_16bpc_avx2_inner(
            Desktop64::forge_token_dangerously(),
            tmp,
            src_ptr,
            src_stride,
            w,
            h,
            mx,
            my,
            bitdepth_max,
            src,
        )
    }
}
/// Token-taking shim: calls the generic `prep_8tap_16bpc_avx2` with its `FILTER`
/// const parameter set to `Filter2d::SmoothRegular8Tap as usize`.
///
/// All arguments after `_token` are passed through unchanged; the body written
/// here never reads `_token`.
///
/// # Safety
/// The callee is an `unsafe fn` taking raw pointers whose requirements are not
/// visible in this part of the file; callers must satisfy them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn prep_8tap_smooth_regular_16bpc_avx2_inner(
_token: Desktop64,
tmp: *mut i16,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
src: *const FFISafe<PicOffset>,
) {
unsafe {
prep_8tap_16bpc_avx2::<{ Filter2d::SmoothRegular8Tap as usize }>(
tmp,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
src,
)
}
}
/// C-ABI wrapper around `prep_8tap_smooth_regular_16bpc_avx2_inner`.
///
/// Forges a `Desktop64` token and passes it, followed by every argument in its
/// original order, to the inner function.
///
/// # Safety
/// NOTE(review): presumably the caller must guarantee that the CPU supports what
/// a `Desktop64` token stands for and that the pointers are valid for the inner
/// call — the exact contract is not visible here; confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn prep_8tap_smooth_regular_16bpc_avx2(
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    // A single `unsafe` region covers forging the token and the forwarded call.
    unsafe {
        prep_8tap_smooth_regular_16bpc_avx2_inner(
            Desktop64::forge_token_dangerously(),
            tmp,
            src_ptr,
            src_stride,
            w,
            h,
            mx,
            my,
            bitdepth_max,
            src,
        )
    }
}
/// Token-taking shim: calls the generic `prep_8tap_16bpc_avx2` with its `FILTER`
/// const parameter set to `Filter2d::Smooth8Tap as usize`.
///
/// All arguments after `_token` are passed through unchanged; the body written
/// here never reads `_token`.
///
/// # Safety
/// The callee is an `unsafe fn` taking raw pointers whose requirements are not
/// visible in this part of the file; callers must satisfy them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn prep_8tap_smooth_16bpc_avx2_inner(
_token: Desktop64,
tmp: *mut i16,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
src: *const FFISafe<PicOffset>,
) {
unsafe {
prep_8tap_16bpc_avx2::<{ Filter2d::Smooth8Tap as usize }>(
tmp,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
src,
)
}
}
/// C-ABI wrapper around `prep_8tap_smooth_16bpc_avx2_inner`.
///
/// Forges a `Desktop64` token and passes it, followed by every argument in its
/// original order, to the inner function.
///
/// # Safety
/// NOTE(review): presumably the caller must guarantee that the CPU supports what
/// a `Desktop64` token stands for and that the pointers are valid for the inner
/// call — the exact contract is not visible here; confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn prep_8tap_smooth_16bpc_avx2(
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    // A single `unsafe` region covers forging the token and the forwarded call.
    unsafe {
        prep_8tap_smooth_16bpc_avx2_inner(
            Desktop64::forge_token_dangerously(),
            tmp,
            src_ptr,
            src_stride,
            w,
            h,
            mx,
            my,
            bitdepth_max,
            src,
        )
    }
}
/// Token-taking shim: calls the generic `prep_8tap_16bpc_avx2` with its `FILTER`
/// const parameter set to `Filter2d::SmoothSharp8Tap as usize`.
///
/// All arguments after `_token` are passed through unchanged; the body written
/// here never reads `_token`.
///
/// # Safety
/// The callee is an `unsafe fn` taking raw pointers whose requirements are not
/// visible in this part of the file; callers must satisfy them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn prep_8tap_smooth_sharp_16bpc_avx2_inner(
_token: Desktop64,
tmp: *mut i16,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
src: *const FFISafe<PicOffset>,
) {
unsafe {
prep_8tap_16bpc_avx2::<{ Filter2d::SmoothSharp8Tap as usize }>(
tmp,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
src,
)
}
}
/// C-ABI wrapper around `prep_8tap_smooth_sharp_16bpc_avx2_inner`.
///
/// Forges a `Desktop64` token and passes it, followed by every argument in its
/// original order, to the inner function.
///
/// # Safety
/// NOTE(review): presumably the caller must guarantee that the CPU supports what
/// a `Desktop64` token stands for and that the pointers are valid for the inner
/// call — the exact contract is not visible here; confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn prep_8tap_smooth_sharp_16bpc_avx2(
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    // A single `unsafe` region covers forging the token and the forwarded call.
    unsafe {
        prep_8tap_smooth_sharp_16bpc_avx2_inner(
            Desktop64::forge_token_dangerously(),
            tmp,
            src_ptr,
            src_stride,
            w,
            h,
            mx,
            my,
            bitdepth_max,
            src,
        )
    }
}
/// Token-taking shim: calls the generic `prep_8tap_16bpc_avx2` with its `FILTER`
/// const parameter set to `Filter2d::SharpRegular8Tap as usize`.
///
/// All arguments after `_token` are passed through unchanged; the body written
/// here never reads `_token`.
///
/// # Safety
/// The callee is an `unsafe fn` taking raw pointers whose requirements are not
/// visible in this part of the file; callers must satisfy them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn prep_8tap_sharp_regular_16bpc_avx2_inner(
_token: Desktop64,
tmp: *mut i16,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
src: *const FFISafe<PicOffset>,
) {
unsafe {
prep_8tap_16bpc_avx2::<{ Filter2d::SharpRegular8Tap as usize }>(
tmp,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
src,
)
}
}
/// C-ABI wrapper around `prep_8tap_sharp_regular_16bpc_avx2_inner`.
///
/// Forges a `Desktop64` token and passes it, followed by every argument in its
/// original order, to the inner function.
///
/// # Safety
/// NOTE(review): presumably the caller must guarantee that the CPU supports what
/// a `Desktop64` token stands for and that the pointers are valid for the inner
/// call — the exact contract is not visible here; confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn prep_8tap_sharp_regular_16bpc_avx2(
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    // A single `unsafe` region covers forging the token and the forwarded call.
    unsafe {
        prep_8tap_sharp_regular_16bpc_avx2_inner(
            Desktop64::forge_token_dangerously(),
            tmp,
            src_ptr,
            src_stride,
            w,
            h,
            mx,
            my,
            bitdepth_max,
            src,
        )
    }
}
/// Token-taking shim: calls the generic `prep_8tap_16bpc_avx2` with its `FILTER`
/// const parameter set to `Filter2d::SharpSmooth8Tap as usize`.
///
/// All arguments after `_token` are passed through unchanged; the body written
/// here never reads `_token`.
///
/// # Safety
/// The callee is an `unsafe fn` taking raw pointers whose requirements are not
/// visible in this part of the file; callers must satisfy them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn prep_8tap_sharp_smooth_16bpc_avx2_inner(
_token: Desktop64,
tmp: *mut i16,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
src: *const FFISafe<PicOffset>,
) {
unsafe {
prep_8tap_16bpc_avx2::<{ Filter2d::SharpSmooth8Tap as usize }>(
tmp,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
src,
)
}
}
/// C-ABI wrapper around `prep_8tap_sharp_smooth_16bpc_avx2_inner`.
///
/// Forges a `Desktop64` token and passes it, followed by every argument in its
/// original order, to the inner function.
///
/// # Safety
/// NOTE(review): presumably the caller must guarantee that the CPU supports what
/// a `Desktop64` token stands for and that the pointers are valid for the inner
/// call — the exact contract is not visible here; confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn prep_8tap_sharp_smooth_16bpc_avx2(
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    // A single `unsafe` region covers forging the token and the forwarded call.
    unsafe {
        prep_8tap_sharp_smooth_16bpc_avx2_inner(
            Desktop64::forge_token_dangerously(),
            tmp,
            src_ptr,
            src_stride,
            w,
            h,
            mx,
            my,
            bitdepth_max,
            src,
        )
    }
}
/// Token-taking shim: calls the generic `prep_8tap_16bpc_avx2` with its `FILTER`
/// const parameter set to `Filter2d::Sharp8Tap as usize`.
///
/// All arguments after `_token` are passed through unchanged; the body written
/// here never reads `_token`.
///
/// # Safety
/// The callee is an `unsafe fn` taking raw pointers whose requirements are not
/// visible in this part of the file; callers must satisfy them.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn prep_8tap_sharp_16bpc_avx2_inner(
_token: Desktop64,
tmp: *mut i16,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
src: *const FFISafe<PicOffset>,
) {
unsafe {
prep_8tap_16bpc_avx2::<{ Filter2d::Sharp8Tap as usize }>(
tmp,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
src,
)
}
}
/// C-ABI wrapper around `prep_8tap_sharp_16bpc_avx2_inner`.
///
/// Forges a `Desktop64` token and passes it, followed by every argument in its
/// original order, to the inner function.
///
/// # Safety
/// NOTE(review): presumably the caller must guarantee that the CPU supports what
/// a `Desktop64` token stands for and that the pointers are valid for the inner
/// call — the exact contract is not visible here; confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn prep_8tap_sharp_16bpc_avx2(
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    src: *const FFISafe<PicOffset>,
) {
    // A single `unsafe` region covers forging the token and the forwarded call.
    unsafe {
        prep_8tap_sharp_16bpc_avx2_inner(
            Desktop64::forge_token_dangerously(),
            tmp,
            src_ptr,
            src_stride,
            w,
            h,
            mx,
            my,
            bitdepth_max,
            src,
        )
    }
}
/// Horizontal bilinear (2-tap) pass over 8-bit pixels that writes `i16` values.
///
/// For each `x` in `0..w` this stores
/// `((16 - mx) * src[x] + mx * src[x + 1] + round) >> sh` into `dst[x]`, where
/// `round` is `1 << (sh - 1)` for `sh > 0`; with `sh == 0` the weighted sum is
/// stored as is. For `w > 0`, `src` is read at indices `0..=w`.
///
/// Blocks of 32 pixels go through AVX2; whatever is left is handled one pixel
/// at a time with the same formula.
#[cfg(target_arch = "x86_64")]
#[rite]
fn h_filter_bilin_8bpc_avx2_inner(
    _token: Desktop64,
    dst: &mut [i16],
    src: &[u8],
    w: usize,
    mx: usize,
    sh: u8,
) {
    let mut dst = dst.flex_mut();
    let src = src.flex();
    let mx = mx as i8;
    let tap_cur = (16 - mx) as u8;
    let tap_next = mx as u8;
    // Every 16-bit lane holds the byte pair (tap_cur, tap_next), lining up with
    // the (src[x], src[x + 1]) byte pairs produced by the unpacks below.
    let taps = _mm256_set1_epi16(((tap_next as i16) << 8) | (tap_cur as i16));
    let round = if sh > 0 {
        _mm256_set1_epi16(1 << (sh - 1))
    } else {
        _mm256_setzero_si256()
    };
    let shift = _mm_cvtsi32_si128(sh as i32);

    let mut x = 0;
    while x + 32 <= w {
        let cur = loadu_256!(<&[u8; 32]>::try_from(&src[x..x + 32]).unwrap());
        let next = loadu_256!(<&[u8; 32]>::try_from(&src[x + 1..x + 33]).unwrap());
        // The unpacks operate per 128-bit lane: `part_lo` covers pixels 0..8 and
        // 16..24 of this block, `part_hi` covers pixels 8..16 and 24..32.
        let part_lo = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(cur, next), taps);
        let part_hi = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(cur, next), taps);
        let part_lo = _mm256_sra_epi16(_mm256_add_epi16(part_lo, round), shift);
        let part_hi = _mm256_sra_epi16(_mm256_add_epi16(part_hi, round), shift);
        // Regroup the 128-bit halves so each store receives 16 consecutive pixels.
        let first16 = _mm256_permute2x128_si256(part_lo, part_hi, 0x20);
        let last16 = _mm256_permute2x128_si256(part_lo, part_hi, 0x31);
        storeu_256!(
            <&mut [i16; 16]>::try_from(&mut dst[x..x + 16]).unwrap(),
            first16
        );
        storeu_256!(
            <&mut [i16; 16]>::try_from(&mut dst[x + 16..x + 32]).unwrap(),
            last16
        );
        x += 32;
    }

    // Scalar tail for the pixels that did not fill a 32-wide block.
    let weight_cur = 16 - mx as i32;
    let weight_next = mx as i32;
    for i in x..w {
        let sum = weight_cur * src[i] as i32 + weight_next * src[i + 1] as i32;
        let value = if sh > 0 {
            (sum + (1 << (sh - 1))) >> sh
        } else {
            sum
        };
        dst[i] = value as i16;
    }
}
/// Raw-pointer front end for `h_filter_bilin_8bpc_avx2_inner`.
///
/// The inner kernel takes slices, so the pointers are first turned into slices
/// covering exactly what the kernel touches: `w` outputs and `w + 1` inputs
/// (the filter reads `src[x]` and `src[x + 1]` for every `x < w`).
///
/// # Safety
/// - `dst` must be non-null, aligned for `i16`, and valid for writing `w` values.
/// - `src` must be non-null and valid for reading `w + 1` bytes.
/// - The two regions must not overlap.
/// - NOTE(review): a `Desktop64` token is forged without a runtime check, so the
///   caller presumably has to guarantee the CPU supports what that token stands
///   for — confirm.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn h_filter_bilin_8bpc_avx2(dst: *mut i16, src: *const u8, w: usize, mx: usize, sh: u8) {
    // SAFETY: the caller guarantees the extents listed under `# Safety`.
    let (dst, src) = unsafe {
        (
            core::slice::from_raw_parts_mut(dst, w),
            core::slice::from_raw_parts(src, w + 1),
        )
    };
    let token = unsafe { Desktop64::forge_token_dangerously() };
    unsafe { h_filter_bilin_8bpc_avx2_inner(token, dst, src, w, mx, sh) }
}
/// Vertical bilinear (2-tap) pass from two `i16` rows to 8-bit output pixels.
///
/// For each `x` in `0..w`, `dst[x]` receives
/// `((16 - my) * mid[0][x] + my * mid[1][x] + round) >> sh` clamped to
/// `0..=bd_max`, where `round` is `1 << (sh - 1)` for `sh > 0` and `0`
/// otherwise. Only `mid[0]` and `mid[1]` are read.
///
/// Eight pixels at a time are processed with AVX2 in 32-bit lanes; the rest is
/// done per pixel with the same formula.
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_filter_bilin_8bpc_avx2_inner(
    _token: Desktop64,
    dst: &mut [u8],
    mid: &[&[i16]],
    w: usize,
    my: usize,
    sh: u8,
    bd_max: i16,
) {
    let mut dst = dst.flex_mut();
    let my = my as i32;
    let weight0 = 16 - my;
    let weight1 = my;
    let weight0_v = _mm256_set1_epi32(weight0);
    let weight1_v = _mm256_set1_epi32(weight1);
    let round = _mm256_set1_epi32(if sh > 0 { 1 << (sh - 1) } else { 0 });
    let floor = _mm256_setzero_si256();
    let ceiling = _mm256_set1_epi32(bd_max as i32);
    let shift = _mm_cvtsi32_si128(sh as i32);

    let mut x = 0;
    while x + 8 <= w {
        let row0 = _mm256_cvtepi16_epi32(loadu_128!(
            <&[i16; 8]>::try_from(&mid[0][x..x + 8]).unwrap()
        ));
        let row1 = _mm256_cvtepi16_epi32(loadu_128!(
            <&[i16; 8]>::try_from(&mid[1][x..x + 8]).unwrap()
        ));
        let acc = _mm256_add_epi32(
            _mm256_mullo_epi32(row0, weight0_v),
            _mm256_mullo_epi32(row1, weight1_v),
        );
        let acc = _mm256_sra_epi32(_mm256_add_epi32(acc, round), shift);
        let acc = _mm256_min_epi32(_mm256_max_epi32(acc, floor), ceiling);
        // The 32->16 pack works per 128-bit lane; permuting the quadwords with
        // 0xD8 gathers all eight values, in order, into the low lane.
        let words = _mm256_permute4x64_epi64(_mm256_packs_epi32(acc, acc), 0xD8);
        let bytes = _mm256_packus_epi16(words, words);
        // The low 64 bits now hold the eight output bytes.
        let low8 = _mm256_extract_epi64(bytes, 0);
        dst[x..x + 8].copy_from_slice(&low8.to_ne_bytes());
        x += 8;
    }

    // Scalar tail for the pixels that did not fill an 8-wide block.
    for i in x..w {
        let acc = weight0 * mid[0][i] as i32 + weight1 * mid[1][i] as i32;
        let shifted = if sh > 0 {
            (acc + (1 << (sh - 1))) >> sh
        } else {
            acc
        };
        dst[i] = shifted.clamp(0, bd_max as i32) as u8;
    }
}
/// Raw-pointer front end for `v_filter_bilin_8bpc_avx2_inner`.
///
/// The inner kernel takes `dst` as a slice, so the pointer is first turned into
/// a slice of `w` bytes, which is exactly what the kernel writes. `mid` and the
/// remaining arguments are forwarded unchanged.
///
/// # Safety
/// - `dst` must be non-null and valid for writing `w` bytes, and must not
///   overlap the rows referenced by `mid`.
/// - NOTE(review): a `Desktop64` token is forged without a runtime check, so the
///   caller presumably has to guarantee the CPU supports what that token stands
///   for — confirm.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn v_filter_bilin_8bpc_avx2(
    dst: *mut u8,
    mid: &[&[i16]],
    w: usize,
    my: usize,
    sh: u8,
    bd_max: i16,
) {
    // SAFETY: the caller guarantees `dst` is valid for `w` byte writes.
    let dst = unsafe { core::slice::from_raw_parts_mut(dst, w) };
    let token = unsafe { Desktop64::forge_token_dangerously() };
    unsafe { v_filter_bilin_8bpc_avx2_inner(token, dst, mid, w, my, sh, bd_max) }
}
/// Horizontal bilinear (2-tap) filter writing final 8-bit pixels.
///
/// For each `x` in `0..w`, `dst[x]` receives
/// `((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4`, clamped to `0..=255`.
/// For `w > 0`, `src` is read at indices `0..=w`.
///
/// Blocks of 32 pixels go through AVX2; the remainder is handled per pixel.
#[cfg(target_arch = "x86_64")]
#[rite]
fn h_bilin_8bpc_put_avx2_inner(
    _token: Desktop64,
    dst: &mut [u8],
    src: &[u8],
    w: usize,
    mx: usize,
) {
    let mut dst = dst.flex_mut();
    let src = src.flex();
    let mx = mx as i8;
    let tap_cur = (16 - mx) as u8;
    let tap_next = mx as u8;
    // Every 16-bit lane holds the byte pair (tap_cur, tap_next), lining up with
    // the (src[x], src[x + 1]) byte pairs produced by the unpacks below.
    let taps = _mm256_set1_epi16(((tap_next as i16) << 8) | (tap_cur as i16));
    let round = _mm256_set1_epi16(8);
    let floor = _mm256_setzero_si256();
    let ceiling = _mm256_set1_epi16(255);
    // The shift count is the same for every block, so build it once.
    let shift = _mm_cvtsi32_si128(4);

    let mut x = 0;
    while x + 32 <= w {
        let cur = loadu_256!(<&[u8; 32]>::try_from(&src[x..x + 32]).unwrap());
        let next = loadu_256!(<&[u8; 32]>::try_from(&src[x + 1..x + 33]).unwrap());
        let part_lo = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(cur, next), taps);
        let part_hi = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(cur, next), taps);
        let part_lo = _mm256_sra_epi16(_mm256_add_epi16(part_lo, round), shift);
        let part_hi = _mm256_sra_epi16(_mm256_add_epi16(part_hi, round), shift);
        let part_lo = _mm256_max_epi16(_mm256_min_epi16(part_lo, ceiling), floor);
        let part_hi = _mm256_max_epi16(_mm256_min_epi16(part_hi, ceiling), floor);
        // Both the unpacks and this pack work per 128-bit lane, so packing
        // (lo, hi) puts the 32 pixels back in their original order.
        let bytes = _mm256_packus_epi16(part_lo, part_hi);
        storeu_256!(
            <&mut [u8; 32]>::try_from(&mut dst[x..x + 32]).unwrap(),
            bytes
        );
        x += 32;
    }

    // Scalar tail for the pixels that did not fill a 32-wide block.
    let weight_cur = 16 - mx as i32;
    let weight_next = mx as i32;
    for i in x..w {
        let sum = weight_cur * src[i] as i32 + weight_next * src[i + 1] as i32;
        dst[i] = ((sum + 8) >> 4).clamp(0, 255) as u8;
    }
}
/// Raw-pointer front end for `h_bilin_8bpc_put_avx2_inner`.
///
/// The inner kernel takes slices, so the pointers are first turned into slices
/// covering exactly what the kernel touches: `w` outputs and `w + 1` inputs
/// (the filter reads `src[x]` and `src[x + 1]` for every `x < w`).
///
/// # Safety
/// - `dst` must be non-null and valid for writing `w` bytes.
/// - `src` must be non-null and valid for reading `w + 1` bytes.
/// - The two regions must not overlap.
/// - NOTE(review): a `Desktop64` token is forged without a runtime check, so the
///   caller presumably has to guarantee the CPU supports what that token stands
///   for — confirm.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn h_bilin_8bpc_put_avx2(dst: *mut u8, src: *const u8, w: usize, mx: usize) {
    // SAFETY: the caller guarantees the extents listed under `# Safety`.
    let (dst, src) = unsafe {
        (
            core::slice::from_raw_parts_mut(dst, w),
            core::slice::from_raw_parts(src, w + 1),
        )
    };
    let token = unsafe { Desktop64::forge_token_dangerously() };
    unsafe { h_bilin_8bpc_put_avx2_inner(token, dst, src, w, mx) }
}
/// Vertical bilinear (2-tap) filter between two 8-bit rows, writing 8-bit pixels.
///
/// For each `x` in `0..w`, `dst[x]` receives
/// `((16 - my) * src0[x] + my * src1[x] + 8) >> 4`. The vector path narrows with
/// an unsigned-saturating pack and the scalar tail clamps to `0..=255`.
///
/// Blocks of 32 pixels go through AVX2; the remainder is handled per pixel.
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_bilin_8bpc_direct_avx2_inner(
    _token: Desktop64,
    dst: &mut [u8],
    src0: &[u8],
    src1: &[u8],
    w: usize,
    my: usize,
) {
    let mut dst = dst.flex_mut();
    let src0 = src0.flex();
    let src1 = src1.flex();
    let my = my as i8;
    let tap0 = (16 - my) as u8;
    let tap1 = my as u8;
    // Every 16-bit lane holds the byte pair (tap0, tap1), lining up with the
    // (src0[x], src1[x]) byte pairs produced by the unpacks below.
    let taps = _mm256_set1_epi16(((tap1 as i16) << 8) | (tap0 as i16));
    let round = _mm256_set1_epi16(8);
    let shift = _mm_cvtsi32_si128(4);

    let mut x = 0;
    while x + 32 <= w {
        let upper = loadu_256!(<&[u8; 32]>::try_from(&src0[x..x + 32]).unwrap());
        let lower = loadu_256!(<&[u8; 32]>::try_from(&src1[x..x + 32]).unwrap());
        let part_lo = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(upper, lower), taps);
        let part_hi = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(upper, lower), taps);
        let part_lo = _mm256_sra_epi16(_mm256_add_epi16(part_lo, round), shift);
        let part_hi = _mm256_sra_epi16(_mm256_add_epi16(part_hi, round), shift);
        // Both the unpacks and this pack work per 128-bit lane, so packing
        // (lo, hi) puts the 32 pixels back in their original order.
        let bytes = _mm256_packus_epi16(part_lo, part_hi);
        storeu_256!(
            <&mut [u8; 32]>::try_from(&mut dst[x..x + 32]).unwrap(),
            bytes
        );
        x += 32;
    }

    // Scalar tail for the pixels that did not fill a 32-wide block.
    let weight0 = 16 - my as i32;
    let weight1 = my as i32;
    for i in x..w {
        let sum = weight0 * src0[i] as i32 + weight1 * src1[i] as i32;
        dst[i] = ((sum + 8) >> 4).clamp(0, 255) as u8;
    }
}
/// Raw-pointer front end for `v_bilin_8bpc_direct_avx2_inner`.
///
/// The inner kernel takes slices, so each pointer is first turned into a slice
/// of `w` bytes, which is exactly the range the kernel reads from `src0`/`src1`
/// and writes to `dst`.
///
/// # Safety
/// - `dst` must be non-null and valid for writing `w` bytes.
/// - `src0` and `src1` must be non-null and valid for reading `w` bytes each.
/// - `dst` must not overlap either source row.
/// - NOTE(review): a `Desktop64` token is forged without a runtime check, so the
///   caller presumably has to guarantee the CPU supports what that token stands
///   for — confirm.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn v_bilin_8bpc_direct_avx2(
    dst: *mut u8,
    src0: *const u8,
    src1: *const u8,
    w: usize,
    my: usize,
) {
    // SAFETY: the caller guarantees the extents listed under `# Safety`.
    let (dst, src0, src1) = unsafe {
        (
            core::slice::from_raw_parts_mut(dst, w),
            core::slice::from_raw_parts(src0, w),
            core::slice::from_raw_parts(src1, w),
        )
    };
    let token = unsafe { Desktop64::forge_token_dangerously() };
    unsafe { v_bilin_8bpc_direct_avx2_inner(token, dst, src0, src1, w, my) }
}
/// AVX-512 horizontal bilinear (2-tap) pass over 8-bit pixels, writing `i16`.
///
/// For each `x` in `0..w` this stores
/// `((16 - mx) * src[x] + mx * src[x + 1] + round) >> sh` into `dst[x]`, where
/// `round` is `1 << (sh - 1)` for `sh > 0`; with `sh == 0` the weighted sum is
/// stored as is. For `w > 0`, `src` is read at indices `0..=w`.
///
/// Blocks of 64 pixels use 512-bit vectors; the remainder is handled per pixel.
#[cfg(target_arch = "x86_64")]
#[rite]
fn h_filter_bilin_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [i16],
    src: &[u8],
    w: usize,
    mx: usize,
    sh: u8,
) {
    let mut dst = dst.flex_mut();
    let src = src.flex();
    let mx = mx as i8;
    let tap_cur = (16 - mx) as u8;
    let tap_next = mx as u8;
    // Every 16-bit lane holds the byte pair (tap_cur, tap_next), lining up with
    // the (src[x], src[x + 1]) byte pairs produced by the unpacks below.
    let taps = _mm512_set1_epi16(((tap_next as i16) << 8) | (tap_cur as i16));
    let round = if sh > 0 {
        _mm512_set1_epi16(1 << (sh - 1))
    } else {
        _mm512_setzero_si512()
    };
    let shift = _mm_cvtsi32_si128(sh as i32);
    // Quadword selectors for the two-source permute (0..8 pick from the first
    // operand, 8..16 from the second). They interleave the 128-bit lanes of the
    // lo/hi results so each output vector holds 32 consecutive pixels.
    let pick_first32 = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11);
    let pick_last32 = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15);

    let mut x = 0;
    while x + 64 <= w {
        let cur = loadu_512!(&src[x..x + 64], [u8; 64]);
        let next = loadu_512!(&src[x + 1..x + 65], [u8; 64]);
        let part_lo = _mm512_maddubs_epi16(_mm512_unpacklo_epi8(cur, next), taps);
        let part_hi = _mm512_maddubs_epi16(_mm512_unpackhi_epi8(cur, next), taps);
        let part_lo = _mm512_sra_epi16(_mm512_add_epi16(part_lo, round), shift);
        let part_hi = _mm512_sra_epi16(_mm512_add_epi16(part_hi, round), shift);
        let first32 = _mm512_permutex2var_epi64(part_lo, pick_first32, part_hi);
        let last32 = _mm512_permutex2var_epi64(part_lo, pick_last32, part_hi);
        storeu_512!(
            <&mut [i16; 32]>::try_from(&mut dst[x..x + 32]).unwrap(),
            first32
        );
        storeu_512!(
            <&mut [i16; 32]>::try_from(&mut dst[x + 32..x + 64]).unwrap(),
            last32
        );
        x += 64;
    }

    // Scalar tail for the pixels that did not fill a 64-wide block.
    let weight_cur = 16 - mx as i32;
    let weight_next = mx as i32;
    for i in x..w {
        let sum = weight_cur * src[i] as i32 + weight_next * src[i + 1] as i32;
        let value = if sh > 0 {
            (sum + (1 << (sh - 1))) >> sh
        } else {
            sum
        };
        dst[i] = value as i16;
    }
}
/// AVX-512 vertical bilinear (2-tap) pass from two `i16` rows to 8-bit pixels.
///
/// For each `x` in `0..w`, `dst[x]` receives
/// `((16 - my) * mid[0][x] + my * mid[1][x] + round) >> sh` clamped to
/// `0..=255`, where `round` is `1 << (sh - 1)` for `sh > 0` and `0` otherwise.
/// Only `mid[0]` and `mid[1]` are read. `_bd_max` is ignored: the upper clamp
/// bound is the literal `255`.
///
/// Sixteen pixels at a time use 512-bit vectors; the rest is done per pixel.
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_filter_bilin_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    mid: &[&[i16]],
    w: usize,
    my: usize,
    sh: u8,
    _bd_max: i16,
) {
    let mut dst = dst.flex_mut();
    let my = my as i32;
    let weight0 = 16 - my;
    let weight1 = my;
    let weight0_v = _mm512_set1_epi32(weight0);
    let weight1_v = _mm512_set1_epi32(weight1);
    let round = _mm512_set1_epi32(if sh > 0 { 1 << (sh - 1) } else { 0 });
    let floor = _mm512_setzero_si512();
    let ceiling = _mm512_set1_epi32(255);
    let shift = _mm_cvtsi32_si128(sh as i32);

    let mut x = 0;
    while x + 16 <= w {
        let row0 = _mm512_cvtepi16_epi32(loadu_256!(
            <&[i16; 16]>::try_from(&mid[0][x..x + 16]).unwrap()
        ));
        let row1 = _mm512_cvtepi16_epi32(loadu_256!(
            <&[i16; 16]>::try_from(&mid[1][x..x + 16]).unwrap()
        ));
        let acc = _mm512_add_epi32(
            _mm512_mullo_epi32(row0, weight0_v),
            _mm512_mullo_epi32(row1, weight1_v),
        );
        let acc = _mm512_sra_epi32(_mm512_add_epi32(acc, round), shift);
        let acc = _mm512_min_epi32(_mm512_max_epi32(acc, floor), ceiling);
        // Narrow the sixteen 32-bit lanes to bytes with unsigned saturation.
        let bytes = _mm512_cvtusepi32_epi8(acc);
        storeu_128!(
            <&mut [u8; 16]>::try_from(&mut dst[x..x + 16]).unwrap(),
            bytes
        );
        x += 16;
    }

    // Scalar tail for the pixels that did not fill a 16-wide block.
    for i in x..w {
        let acc = weight0 * mid[0][i] as i32 + weight1 * mid[1][i] as i32;
        let shifted = if sh > 0 {
            (acc + (1 << (sh - 1))) >> sh
        } else {
            acc
        };
        dst[i] = shifted.clamp(0, 255) as u8;
    }
}
/// AVX-512 vertical bilinear (2-tap) pass from two `i16` rows to `i16` output.
///
/// For each `x` in `0..w`, `dst[x]` receives
/// `((16 - my) * mid[0][x] + my * mid[1][x] + round) >> sh`, where `round` is
/// `1 << (sh - 1)` for `sh > 0` and `0` otherwise. Only `mid[0]` and `mid[1]`
/// are read.
///
/// The vector path narrows to 16 bits with signed saturation, while the scalar
/// tail narrows with an `as i16` cast.
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_filter_bilin_8bpc_prep_avx512_inner(
    _token: Server64,
    dst: &mut [i16],
    mid: &[&[i16]],
    w: usize,
    my: usize,
    sh: u8,
) {
    let mut dst = dst.flex_mut();
    let my = my as i32;
    let weight0 = 16 - my;
    let weight1 = my;
    let weight0_v = _mm512_set1_epi32(weight0);
    let weight1_v = _mm512_set1_epi32(weight1);
    let round = _mm512_set1_epi32(if sh > 0 { 1 << (sh - 1) } else { 0 });
    let shift = _mm_cvtsi32_si128(sh as i32);

    let mut x = 0;
    while x + 16 <= w {
        let row0 = _mm512_cvtepi16_epi32(loadu_256!(
            <&[i16; 16]>::try_from(&mid[0][x..x + 16]).unwrap()
        ));
        let row1 = _mm512_cvtepi16_epi32(loadu_256!(
            <&[i16; 16]>::try_from(&mid[1][x..x + 16]).unwrap()
        ));
        let acc = _mm512_add_epi32(
            _mm512_mullo_epi32(row0, weight0_v),
            _mm512_mullo_epi32(row1, weight1_v),
        );
        let acc = _mm512_sra_epi32(_mm512_add_epi32(acc, round), shift);
        // Narrow the sixteen 32-bit lanes to i16 with signed saturation.
        let words = _mm512_cvtsepi32_epi16(acc);
        storeu_256!(
            <&mut [i16; 16]>::try_from(&mut dst[x..x + 16]).unwrap(),
            words
        );
        x += 16;
    }

    // Scalar tail for the pixels that did not fill a 16-wide block.
    for i in x..w {
        let acc = weight0 * mid[0][i] as i32 + weight1 * mid[1][i] as i32;
        let value = if sh > 0 {
            (acc + (1 << (sh - 1))) >> sh
        } else {
            acc
        };
        dst[i] = value as i16;
    }
}
/// AVX-512 horizontal bilinear (2-tap) filter writing final 8-bit pixels.
///
/// For each `x` in `0..w`, `dst[x]` receives
/// `((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4`. The vector path narrows
/// with an unsigned-saturating pack and the scalar tail clamps to `0..=255`.
/// For `w > 0`, `src` is read at indices `0..=w`.
///
/// Blocks of 64 pixels use 512-bit vectors; the remainder is handled per pixel.
#[cfg(target_arch = "x86_64")]
#[rite]
fn h_bilin_8bpc_put_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    src: &[u8],
    w: usize,
    mx: usize,
) {
    let mut dst = dst.flex_mut();
    let src = src.flex();
    let mx = mx as i8;
    let tap_cur = (16 - mx) as u8;
    let tap_next = mx as u8;
    // Every 16-bit lane holds the byte pair (tap_cur, tap_next), lining up with
    // the (src[x], src[x + 1]) byte pairs produced by the unpacks below.
    let taps = _mm512_set1_epi16(((tap_next as i16) << 8) | (tap_cur as i16));
    let round = _mm512_set1_epi16(8);
    let shift = _mm_cvtsi32_si128(4);

    let mut x = 0;
    while x + 64 <= w {
        let cur = loadu_512!(&src[x..x + 64], [u8; 64]);
        let next = loadu_512!(&src[x + 1..x + 65], [u8; 64]);
        let part_lo = _mm512_maddubs_epi16(_mm512_unpacklo_epi8(cur, next), taps);
        let part_hi = _mm512_maddubs_epi16(_mm512_unpackhi_epi8(cur, next), taps);
        let part_lo = _mm512_sra_epi16(_mm512_add_epi16(part_lo, round), shift);
        let part_hi = _mm512_sra_epi16(_mm512_add_epi16(part_hi, round), shift);
        // Both the unpacks and this pack work per 128-bit lane, so packing
        // (lo, hi) puts the 64 pixels back in their original order.
        let bytes = _mm512_packus_epi16(part_lo, part_hi);
        storeu_512!(&mut dst[x..x + 64], [u8; 64], bytes);
        x += 64;
    }

    // Scalar tail for the pixels that did not fill a 64-wide block.
    let weight_cur = 16 - mx as i32;
    let weight_next = mx as i32;
    for i in x..w {
        let sum = weight_cur * src[i] as i32 + weight_next * src[i + 1] as i32;
        dst[i] = ((sum + 8) >> 4).clamp(0, 255) as u8;
    }
}
/// AVX-512 vertical bilinear (2-tap) filter between two 8-bit rows, writing
/// 8-bit pixels.
///
/// For each `x` in `0..w`, `dst[x]` receives
/// `((16 - my) * src0[x] + my * src1[x] + 8) >> 4`. The vector path narrows with
/// an unsigned-saturating pack and the scalar tail clamps to `0..=255`.
///
/// Blocks of 64 pixels use 512-bit vectors; the remainder is handled per pixel.
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_bilin_8bpc_direct_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    src0: &[u8],
    src1: &[u8],
    w: usize,
    my: usize,
) {
    let mut dst = dst.flex_mut();
    let src0 = src0.flex();
    let src1 = src1.flex();
    let my = my as i8;
    let tap0 = (16 - my) as u8;
    let tap1 = my as u8;
    // Every 16-bit lane holds the byte pair (tap0, tap1), lining up with the
    // (src0[x], src1[x]) byte pairs produced by the unpacks below.
    let taps = _mm512_set1_epi16(((tap1 as i16) << 8) | (tap0 as i16));
    let round = _mm512_set1_epi16(8);
    let shift = _mm_cvtsi32_si128(4);

    let mut x = 0;
    while x + 64 <= w {
        let upper = loadu_512!(&src0[x..x + 64], [u8; 64]);
        let lower = loadu_512!(&src1[x..x + 64], [u8; 64]);
        let part_lo = _mm512_maddubs_epi16(_mm512_unpacklo_epi8(upper, lower), taps);
        let part_hi = _mm512_maddubs_epi16(_mm512_unpackhi_epi8(upper, lower), taps);
        let part_lo = _mm512_sra_epi16(_mm512_add_epi16(part_lo, round), shift);
        let part_hi = _mm512_sra_epi16(_mm512_add_epi16(part_hi, round), shift);
        // Both the unpacks and this pack work per 128-bit lane, so packing
        // (lo, hi) puts the 64 pixels back in their original order.
        let bytes = _mm512_packus_epi16(part_lo, part_hi);
        storeu_512!(&mut dst[x..x + 64], [u8; 64], bytes);
        x += 64;
    }

    // Scalar tail for the pixels that did not fill a 64-wide block.
    let weight0 = 16 - my as i32;
    let weight1 = my as i32;
    for i in x..w {
        let sum = weight0 * src0[i] as i32 + weight1 * src1[i] as i32;
        dst[i] = ((sum + 8) >> 4).clamp(0, 255) as u8;
    }
}
/// Vertical-only bilinear `prep` of one row of 8-bit pixels (AVX-512 path).
///
/// For every `x in 0..w` this writes the unrounded, unshifted weighted sum
/// `dst[x] = (16 - my) * src0[x] + my * src1[x]` as an `i16`.
/// Groups of 64 pixels go through the SIMD loop; the remainder is handled by
/// the scalar loop at the end.
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_bilin_8bpc_prep_direct_avx512_inner(
_token: Server64,
dst: &mut [i16],
src0: &[u8],
src1: &[u8],
w: usize,
my: usize,
) {
let mut dst = dst.flex_mut();
let src0 = src0.flex();
let src1 = src1.flex();
let my = my as i8;
let coeff0 = (16 - my) as u8;
let coeff1 = my as u8;
// Every 16-bit lane holds the byte pair [coeff0, coeff1] (coeff0 in the low byte).
let coeffs = _mm512_set1_epi16(((coeff1 as i16) << 8) | (coeff0 as i16));
// 64-bit element selectors for permutex2var: indices 0..=7 pick from the first
// operand (result_lo), 8..=15 from the second (result_hi). The unpacks below
// split each 128-bit lane into a low and a high half, so these selectors put the
// lanes back in pixel order: idx0 yields pixels 0..32, idx1 yields pixels 32..64.
let idx0 = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11);
let idx1 = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15);
let mut x = 0;
while x + 64 <= w {
let s0 = loadu_512!(&src0[x..x + 64], [u8; 64]);
let s1 = loadu_512!(&src1[x..x + 64], [u8; 64]);
// Interleave the bytes of both rows so each 16-bit lane is one (src0, src1) pair.
let pairs_lo = _mm512_unpacklo_epi8(s0, s1);
let pairs_hi = _mm512_unpackhi_epi8(s0, s1);
// maddubs: unsigned pixel bytes times signed coefficient bytes, adjacent products summed.
let result_lo = _mm512_maddubs_epi16(pairs_lo, coeffs);
let result_hi = _mm512_maddubs_epi16(pairs_hi, coeffs);
let out0 = _mm512_permutex2var_epi64(result_lo, idx0, result_hi);
let out1 = _mm512_permutex2var_epi64(result_lo, idx1, result_hi);
storeu_512!(
<&mut [i16; 32]>::try_from(&mut dst[x..x + 32]).unwrap(),
out0
);
storeu_512!(
<&mut [i16; 32]>::try_from(&mut dst[x + 32..x + 64]).unwrap(),
out1
);
x += 64;
}
// Scalar tail for the last `w % 64` pixels; same formula as the SIMD loop.
while x < w {
let x0 = src0[x] as i32;
let x1 = src1[x] as i32;
let pixel = (16 - my as i32) * x0 + my as i32 * x1;
dst[x] = pixel as i16;
x += 1;
}
}
/// 8bpc bilinear `put` dispatcher for the AVX-512 path.
///
/// Chooses one of four row loops depending on which sub-pixel offsets are non-zero:
/// - `mx` and `my`: horizontal pass into a scratch buffer (`h + 1` rows), then a
///   vertical pass between consecutive scratch rows;
/// - only `mx`: horizontal filter written straight into `dst`;
/// - only `my`: vertical filter between source rows `y` and `y + 1`;
/// - neither: plain copy of `w` bytes per row.
///
/// Row starts are computed as `row * stride` from the beginning of `dst` / `src`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn put_bilin_8bpc_avx512_impl_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_stride: isize,
    src: &[u8],
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
) {
    let mut dst = dst.flex_mut();
    let src = src.flex();
    let (w, h) = (w as usize, h as usize);
    let (mx, my) = (mx as usize, my as usize);
    let intermediate_bits = 4u8;
    let filter_h = mx != 0;
    let filter_v = my != 0;

    if filter_h && filter_v {
        // The vertical pass reads scratch rows `row` and `row + 1`, so one extra
        // horizontally filtered row is required.
        let rows_needed = h + 1;
        let mut mid = take_mid_i16_130();
        for row in 0..rows_needed {
            let src_off = (row as isize * src_stride) as usize;
            h_filter_bilin_8bpc_avx512_inner(
                _token,
                &mut mid[row],
                &src[src_off..],
                w,
                mx,
                4 - intermediate_bits,
            );
        }
        for row in 0..h {
            let out_row = &mut dst[(row as isize * dst_stride) as usize..];
            let pair: [&[i16]; 2] = [&mid[row], &mid[row + 1]];
            v_filter_bilin_8bpc_avx512_inner(
                _token,
                out_row,
                &pair,
                w,
                my,
                4 + intermediate_bits,
                255,
            );
        }
        // Hand the scratch buffer back to the thread-local cache for reuse.
        put_mid_i16_130(mid);
    } else if filter_h {
        for row in 0..h {
            let src_off = (row as isize * src_stride) as usize;
            let out_row = &mut dst[(row as isize * dst_stride) as usize..];
            h_bilin_8bpc_put_avx512_inner(_token, out_row, &src[src_off..], w, mx);
        }
    } else if filter_v {
        for row in 0..h {
            let upper = &src[(row as isize * src_stride) as usize..];
            let lower = &src[((row + 1) as isize * src_stride) as usize..];
            let out_row = &mut dst[(row as isize * dst_stride) as usize..];
            v_bilin_8bpc_direct_avx512_inner(_token, out_row, upper, lower, w, my);
        }
    } else {
        for row in 0..h {
            let src_off = (row as isize * src_stride) as usize;
            let out_row = &mut dst[(row as isize * dst_stride) as usize..];
            out_row[..w].copy_from_slice(&src[src_off..src_off + w]);
        }
    }
}
/// 8bpc bilinear `prep` dispatcher for the AVX-512 path.
///
/// Writes `h` tightly packed rows of `w` intermediate `i16` values into `tmp`
/// (row `y` starts at `y * w`). The filter path depends on which sub-pixel
/// offsets are non-zero:
/// - `mx` and `my`: horizontal pass into a scratch buffer (`h + 1` rows), then a
///   vertical pass between consecutive scratch rows;
/// - only `mx`: horizontal filter written straight into `tmp`;
/// - only `my`: vertical filter between source rows `y` and `y + 1`;
/// - neither: each pixel is widened to `i16` and shifted left by 4.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn prep_bilin_8bpc_avx512_impl_inner(
    _token: Server64,
    tmp: &mut [i16],
    src: &[u8],
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
) {
    let mut tmp = tmp.flex_mut();
    let src = src.flex();
    let (w, h) = (w as usize, h as usize);
    let (mx, my) = (mx as usize, my as usize);
    let intermediate_bits = 4u8;
    let filter_h = mx != 0;
    let filter_v = my != 0;

    if filter_h && filter_v {
        // The vertical pass reads scratch rows `row` and `row + 1`, so one extra
        // horizontally filtered row is required.
        let rows_needed = h + 1;
        let mut mid = take_mid_i16_130();
        for row in 0..rows_needed {
            let src_off = (row as isize * src_stride) as usize;
            h_filter_bilin_8bpc_avx512_inner(
                _token,
                &mut mid[row],
                &src[src_off..],
                w,
                mx,
                4 - intermediate_bits,
            );
        }
        for row in 0..h {
            let out_base = row * w;
            let pair: [&[i16]; 2] = [&mid[row], &mid[row + 1]];
            v_filter_bilin_8bpc_prep_avx512_inner(
                _token,
                &mut tmp[out_base..],
                &pair,
                w,
                my,
                4,
            );
        }
        // Hand the scratch buffer back to the thread-local cache for reuse.
        put_mid_i16_130(mid);
    } else if filter_h {
        for row in 0..h {
            let src_off = (row as isize * src_stride) as usize;
            let out_base = row * w;
            h_filter_bilin_8bpc_avx512_inner(
                _token,
                &mut tmp[out_base..],
                &src[src_off..],
                w,
                mx,
                4 - intermediate_bits,
            );
        }
    } else if filter_v {
        for row in 0..h {
            let upper = &src[(row as isize * src_stride) as usize..];
            let lower = &src[((row + 1) as isize * src_stride) as usize..];
            let out_base = row * w;
            v_bilin_8bpc_prep_direct_avx512_inner(
                _token,
                &mut tmp[out_base..],
                upper,
                lower,
                w,
                my,
            );
        }
    } else {
        // No filtering: scale each pixel up to the intermediate precision.
        for row in 0..h {
            let src_off = (row as isize * src_stride) as usize;
            let out_base = row * w;
            for col in 0..w {
                let sample = src[src_off + col] as i16;
                tmp[out_base + col] = sample << intermediate_bits;
            }
        }
    }
}
/// 8bpc bilinear `put` dispatcher for the AVX2 path.
///
/// Chooses one of four row loops depending on which sub-pixel offsets are non-zero:
/// - `mx` and `my`: horizontal pass into a scratch buffer (`h + 1` rows), then a
///   vertical pass between consecutive scratch rows;
/// - only `mx`: horizontal filter written straight into `dst`;
/// - only `my`: vertical filter between source rows `y` and `y + 1`;
/// - neither: plain copy of `w` bytes per row.
///
/// Row starts are computed as `row * stride` from the beginning of `dst` / `src`.
#[cfg(target_arch = "x86_64")]
#[rite]
fn put_bilin_8bpc_avx2_impl_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: isize,
    src: &[u8],
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
) {
    let mut dst = dst.flex_mut();
    let src = src.flex();
    let (w, h) = (w as usize, h as usize);
    let (mx, my) = (mx as usize, my as usize);
    let intermediate_bits = 4u8;
    let filter_h = mx != 0;
    let filter_v = my != 0;

    if filter_h && filter_v {
        // The vertical pass reads scratch rows `row` and `row + 1`, so one extra
        // horizontally filtered row is required.
        let rows_needed = h + 1;
        let mut mid = take_mid_i16_130();
        for row in 0..rows_needed {
            let src_off = (row as isize * src_stride) as usize;
            let in_row = &src[src_off..];
            h_filter_bilin_8bpc_avx2_inner(
                _token,
                &mut mid[row],
                in_row,
                w,
                mx,
                4 - intermediate_bits,
            );
        }
        for row in 0..h {
            let out_row = &mut dst[(row as isize * dst_stride) as usize..];
            let pair: [&[i16]; 2] = [&mid[row], &mid[row + 1]];
            v_filter_bilin_8bpc_avx2_inner(
                _token,
                out_row,
                &pair,
                w,
                my,
                4 + intermediate_bits,
                255,
            );
        }
        // Hand the scratch buffer back to the thread-local cache for reuse.
        put_mid_i16_130(mid);
    } else if filter_h {
        for row in 0..h {
            let src_off = (row as isize * src_stride) as usize;
            let in_row = &src[src_off..];
            let out_row = &mut dst[(row as isize * dst_stride) as usize..];
            h_bilin_8bpc_put_avx2_inner(_token, out_row, in_row, w, mx);
        }
    } else if filter_v {
        for row in 0..h {
            let upper = &src[(row as isize * src_stride) as usize..];
            let lower = &src[((row + 1) as isize * src_stride) as usize..];
            let out_row = &mut dst[(row as isize * dst_stride) as usize..];
            v_bilin_8bpc_direct_avx2_inner(_token, out_row, upper, lower, w, my);
        }
    } else {
        for row in 0..h {
            let src_off = (row as isize * src_stride) as usize;
            let in_row = &src[src_off..];
            let out_row = &mut dst[(row as isize * dst_stride) as usize..];
            out_row[..w].copy_from_slice(&in_row[..w]);
        }
    }
}
/// Raw-pointer shim for the 8bpc AVX2 bilinear `put`, compiled only with the
/// `asm` feature on x86_64.
///
/// Forges a `Desktop64` token and forwards every argument unchanged to
/// `put_bilin_8bpc_avx2_impl_inner`.
///
/// # Safety
///
/// Declared `#[target_feature(enable = "avx2")]`, so the CPU must support AVX2.
/// Presumably `dst_ptr` / `src_ptr` must be valid for the `w` x `h` region
/// described by the strides; confirm against the callers.
///
/// NOTE(review): `put_bilin_8bpc_avx2_impl_inner` as declared in this file takes
/// `&mut [u8]` / `&[u8]`, while raw `DynPixel` pointers are passed here. Confirm
/// that this compiles when the `asm` feature is enabled.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn put_bilin_8bpc_avx2_impl(
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
) {
// The token is created without a runtime CPU check; see the `# Safety` section.
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe {
put_bilin_8bpc_avx2_impl_inner(
token, dst_ptr, dst_stride, src_ptr, src_stride, w, h, mx, my,
)
}
}
/// Token-taking wrapper around `put_bilin_8bpc_avx2_impl` (only with the `asm`
/// feature on x86_64).
///
/// The `_token` argument is not used in the body; all other arguments are
/// forwarded unchanged.
///
/// # Safety
///
/// Same requirements as `put_bilin_8bpc_avx2_impl`, which this calls directly.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn put_bilin_8bpc_avx2_inner(
_token: Desktop64,
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
) {
unsafe {
put_bilin_8bpc_avx2_impl(dst_ptr, dst_stride, src_ptr, src_stride, w, h, mx, my);
}
}
/// `extern "C"` entry point for the 8bpc AVX2 bilinear `put` (only with the
/// `asm` feature on x86_64).
///
/// Forges a `Desktop64` token and forwards the pointer, stride, size and
/// sub-pixel arguments to `put_bilin_8bpc_avx2_inner`. `_bitdepth_max`, `_dst`
/// and `_src` are accepted but not used here.
///
/// # Safety
///
/// Declared `#[target_feature(enable = "avx2")]`, so the CPU must support AVX2.
/// Presumably `dst_ptr` / `src_ptr` must be valid for the `w` x `h` region
/// described by the strides; confirm against the callers.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn put_bilin_8bpc_avx2(
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
_bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
_src: *const FFISafe<PicOffset>,
) {
// The token is created without a runtime CPU check; see the `# Safety` section.
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe {
put_bilin_8bpc_avx2_inner(
token, dst_ptr, dst_stride, src_ptr, src_stride, w, h, mx, my,
)
}
}
/// 8bpc bilinear `prep` dispatcher for the AVX2 path.
///
/// Writes `h` tightly packed rows of `w` intermediate `i16` values into `tmp`
/// (row `y` starts at `y * w`). The filter path depends on which sub-pixel
/// offsets are non-zero:
/// - `mx` and `my`: SIMD horizontal pass into a scratch buffer (`h + 1` rows),
///   then a scalar vertical blend `((16 - my) * r0 + my * r1 + 8) >> 4`;
/// - only `mx`: SIMD horizontal pass into a row buffer, copied into `tmp`;
/// - only `my`: scalar vertical blend `(16 - my) * r0 + my * r1` (no shift);
/// - neither: each pixel is widened to `i16` and shifted left by 4.
#[cfg(target_arch = "x86_64")]
#[rite]
fn prep_bilin_8bpc_avx2_impl_inner(
    _token: Desktop64,
    tmp: &mut [i16],
    src: &[u8],
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
) {
    let mut tmp = tmp.flex_mut();
    let src = src.flex();
    let w = w as usize;
    let h = h as usize;
    let mx = mx as usize;
    let my = my as usize;
    let intermediate_bits = 4u8;
    match (mx != 0, my != 0) {
        (true, true) => {
            // The vertical blend reads scratch rows `y` and `y + 1`, so one extra
            // horizontally filtered row is required.
            let tmp_h = h + 1;
            let mut mid = take_mid_i16_130();
            for y in 0..tmp_h {
                let src_row_base = (y as isize * src_stride) as usize;
                let src_row = &src[src_row_base..];
                h_filter_bilin_8bpc_avx2_inner(
                    _token,
                    &mut mid[y],
                    src_row,
                    w,
                    mx,
                    4 - intermediate_bits,
                );
            }
            // Blend weights are loop-invariant; compute them once.
            let coeff0 = 16 - my as i32;
            let coeff1 = my as i32;
            for y in 0..h {
                let dst_row = y * w;
                for x in 0..w {
                    let r0 = mid[y][x] as i32;
                    let r1 = mid[y + 1][x] as i32;
                    let pixel = coeff0 * r0 + coeff1 * r1;
                    tmp[dst_row + x] = ((pixel + 8) >> 4) as i16;
                }
            }
            // Hand the scratch buffer back to the thread-local cache for reuse.
            put_mid_i16_130(mid);
        }
        (true, false) => {
            // One row buffer shared by all rows: the horizontal filter rewrites the
            // first `w` entries on every call (the 2-D path above relies on the same
            // property with its recycled scratch buffer), and only those `w`
            // entries are read back.
            let mut row_buf = [0i16; MID_STRIDE];
            for y in 0..h {
                let src_row_base = (y as isize * src_stride) as usize;
                let src_row = &src[src_row_base..];
                let dst_row = y * w;
                h_filter_bilin_8bpc_avx2_inner(
                    _token,
                    &mut row_buf,
                    src_row,
                    w,
                    mx,
                    4 - intermediate_bits,
                );
                tmp[dst_row..][..w].copy_from_slice(&row_buf[..w]);
            }
        }
        (false, true) => {
            // Blend weights are loop-invariant; compute them once.
            let coeff0 = 16 - my as i32;
            let coeff1 = my as i32;
            for y in 0..h {
                let dst_row = y * w;
                for x in 0..w {
                    let r0 = src[(y as isize * src_stride + x as isize) as usize] as i32;
                    let r1 = src[((y + 1) as isize * src_stride + x as isize) as usize] as i32;
                    let pixel = coeff0 * r0 + coeff1 * r1;
                    tmp[dst_row + x] = pixel as i16;
                }
            }
        }
        (false, false) => {
            // No filtering: scale each pixel up to the intermediate precision.
            for y in 0..h {
                let src_row_base = (y as isize * src_stride) as usize;
                let src_row = &src[src_row_base..];
                let dst_row = y * w;
                for x in 0..w {
                    let pixel = src_row[x] as i16;
                    tmp[dst_row + x] = pixel << intermediate_bits;
                }
            }
        }
    }
}
/// Raw-pointer shim for the 8bpc AVX2 bilinear `prep`, compiled only with the
/// `asm` feature on x86_64.
///
/// Forges a `Desktop64` token and forwards every argument unchanged to
/// `prep_bilin_8bpc_avx2_impl_inner`.
///
/// # Safety
///
/// Declared `#[target_feature(enable = "avx2")]`, so the CPU must support AVX2.
/// Presumably `tmp` / `src_ptr` must be valid for the `w` x `h` block being
/// processed; confirm against the callers.
///
/// NOTE(review): `prep_bilin_8bpc_avx2_impl_inner` as declared in this file takes
/// `&mut [i16]` / `&[u8]`, while raw pointers are passed here. Confirm that this
/// compiles when the `asm` feature is enabled.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn prep_bilin_8bpc_avx2_impl(
tmp: *mut i16,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
) {
// The token is created without a runtime CPU check; see the `# Safety` section.
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe { prep_bilin_8bpc_avx2_impl_inner(token, tmp, src_ptr, src_stride, w, h, mx, my) }
}
/// Token-taking wrapper around `prep_bilin_8bpc_avx2_impl` (only with the `asm`
/// feature on x86_64).
///
/// The `_token` argument is not used in the body; all other arguments are
/// forwarded unchanged.
///
/// # Safety
///
/// Same requirements as `prep_bilin_8bpc_avx2_impl`, which this calls directly.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[arcane]
unsafe fn prep_bilin_8bpc_avx2_inner(
_token: Desktop64,
tmp: *mut i16,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
) {
unsafe {
prep_bilin_8bpc_avx2_impl(tmp, src_ptr, src_stride, w, h, mx, my);
}
}
/// `extern "C"` entry point for the 8bpc AVX2 bilinear `prep` (only with the
/// `asm` feature on x86_64).
///
/// Forges a `Desktop64` token and forwards the pointer, stride, size and
/// sub-pixel arguments to `prep_bilin_8bpc_avx2_inner`. `_bitdepth_max` and
/// `_src` are accepted but not used here.
///
/// # Safety
///
/// Declared `#[target_feature(enable = "avx2")]`, so the CPU must support AVX2.
/// Presumably `tmp` / `src_ptr` must be valid for the `w` x `h` block being
/// processed; confirm against the callers.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn prep_bilin_8bpc_avx2(
tmp: *mut i16,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
_bitdepth_max: i32,
_src: *const FFISafe<PicOffset>,
) {
// The token is created without a runtime CPU check; see the `# Safety` section.
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe { prep_bilin_8bpc_avx2_inner(token, tmp, src_ptr, src_stride, w, h, mx, my) }
}
#[cfg(target_arch = "x86_64")]
use crate::src::internal::SEG_MASK_LEN;
/// Scalar weighted-mask blend of two 8bpc intermediate buffers that also emits
/// the blend mask.
///
/// Per pixel the weight is
/// `m = min(38 + ((|tmp1 - tmp2| + mask_rnd) >> mask_sh), 64)` and the output is
/// `clamp((tmp1 * m + tmp2 * (64 - m) + rnd) >> sh, 0, 255)`.
/// `tmp1` / `tmp2` are read as tightly packed rows of `w` entries; `dst` rows
/// are `dst_stride` bytes apart.
///
/// Mask layout by const parameter:
/// - `!SS_HOR`: one weight per pixel;
/// - `SS_HOR && !SS_VER`: `(m + n + 1 - sign) >> 1` for each horizontal pair;
/// - `SS_HOR && SS_VER`: even rows store the pair sum `m + n`; odd rows fold that
///   stored sum in and store `(m + n + prev + 2 - sign) >> 2`. The mask row only
///   advances after odd rows.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn w_mask_8bpc_avx2_safe_impl<const SS_HOR: bool, const SS_VER: bool>(
    dst: &mut [u8],
    dst_stride: usize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    mask: &mut [u8; SEG_MASK_LEN],
    sign: i32,
) {
    let width = w as usize;
    let height = h as usize;
    let sign_bias = (sign as u8) as u16;
    let intermediate_bits = 4u32;
    let bitdepth = 8u32;
    let sh = intermediate_bits + 6;
    let rnd = 32i32 << intermediate_bits;
    let mask_sh = bitdepth + intermediate_bits - 4;
    let mask_rnd = 1u16 << (mask_sh - 5);
    let mask_row_len = if SS_HOR { width >> 1 } else { width };

    // Blend weight derived from the absolute difference of the two inputs.
    let weight_of = |a: i16, b: i16| -> u8 {
        let delta = a.abs_diff(b).saturating_add(mask_rnd);
        (38 + (delta >> mask_sh)).min(64) as u8
    };
    // Weighted average of the two inputs, rounded and clamped to the 8-bit range.
    let blend = |a: i16, b: i16, weight: u8| -> u8 {
        let wgt = weight as i32;
        ((a as i32 * wgt + b as i32 * (64 - wgt) + rnd) >> sh).clamp(0, 255) as u8
    };

    let mut mask_base = 0usize;
    for row in 0..height {
        let a_row = &tmp1[row * width..][..width];
        let b_row = &tmp2[row * width..][..width];
        let out_row = &mut dst[row * dst_stride..][..width];
        let odd_row = row & 1 != 0;

        let mut col = 0;
        while col < width {
            let m = weight_of(a_row[col], b_row[col]);
            out_row[col] = blend(a_row[col], b_row[col], m);
            if SS_HOR {
                // Process the second pixel of the horizontal pair.
                col += 1;
                let n = weight_of(a_row[col], b_row[col]);
                out_row[col] = blend(a_row[col], b_row[col], n);

                let slot = mask_base + (col >> 1);
                mask[slot] = if SS_VER && odd_row {
                    let prev = mask[slot];
                    (((m as u16 + n as u16 + 2 - sign_bias) + prev as u16) >> 2) as u8
                } else if SS_VER {
                    m + n
                } else {
                    ((m as u16 + n as u16 + 1 - sign_bias) >> 1) as u8
                };
            } else {
                mask[mask_base + col] = m;
            }
            col += 1;
        }

        if !SS_VER || odd_row {
            mask_base += mask_row_len;
        }
    }
}
/// 8bpc weighted-mask blend with a full-resolution mask.
///
/// Instantiates `w_mask_8bpc_avx2_safe_impl` with `SS_HOR = false` and
/// `SS_VER = false` and forwards all arguments.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn w_mask_444_8bpc_avx2_safe(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    mask: &mut [u8; SEG_MASK_LEN],
    sign: i32,
) {
    let mut wrapped = dst.flex_mut();
    let out: &mut [u8] = &mut *wrapped;
    w_mask_8bpc_avx2_safe_impl::<false, false>(out, dst_stride, tmp1, tmp2, w, h, mask, sign);
}
/// 8bpc weighted-mask blend with a horizontally subsampled mask.
///
/// Instantiates `w_mask_8bpc_avx2_safe_impl` with `SS_HOR = true` and
/// `SS_VER = false` and forwards all arguments.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn w_mask_422_8bpc_avx2_safe(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    mask: &mut [u8; SEG_MASK_LEN],
    sign: i32,
) {
    let mut wrapped = dst.flex_mut();
    let out: &mut [u8] = &mut *wrapped;
    w_mask_8bpc_avx2_safe_impl::<true, false>(out, dst_stride, tmp1, tmp2, w, h, mask, sign);
}
/// 8bpc weighted-mask blend with a mask subsampled in both directions.
///
/// Instantiates `w_mask_8bpc_avx2_safe_impl` with `SS_HOR = true` and
/// `SS_VER = true` and forwards all arguments.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn w_mask_420_8bpc_avx2_safe(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    mask: &mut [u8; SEG_MASK_LEN],
    sign: i32,
) {
    let mut wrapped = dst.flex_mut();
    let out: &mut [u8] = &mut *wrapped;
    w_mask_8bpc_avx2_safe_impl::<true, true>(out, dst_stride, tmp1, tmp2, w, h, mask, sign);
}
/// `extern "C"` entry point for the 8bpc weighted-mask blend that forwards to
/// `w_mask_444_8bpc_avx2_safe` (only with the `asm` feature on x86_64).
///
/// Builds a byte slice of `h * dst_stride` bytes from `dst_ptr`, forges a
/// `Desktop64` token and forwards the remaining arguments. `_bitdepth_max` and
/// `_dst` are accepted but not used here.
///
/// # Safety
///
/// `dst_ptr` must be valid for reads and writes of `h * dst_stride` bytes, since
/// a mutable slice of that length is created from it. Declared
/// `#[target_feature(enable = "avx2")]`, so the CPU must support AVX2.
///
/// NOTE(review): `dst_stride` is converted with `as usize`, so a negative stride
/// would wrap; presumably callers only pass non-negative strides. Confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn w_mask_444_8bpc_avx2(
dst_ptr: *mut DynPixel,
dst_stride: isize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
mask: &mut [u8; SEG_MASK_LEN],
sign: i32,
_bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
) {
// View the destination picture region as a flat byte slice.
let dst = unsafe {
std::slice::from_raw_parts_mut(dst_ptr as *mut u8, h as usize * dst_stride as usize)
};
w_mask_444_8bpc_avx2_safe(
Desktop64::forge_token_dangerously(),
dst,
dst_stride as usize,
tmp1,
tmp2,
w,
h,
mask,
sign,
)
}
/// `extern "C"` entry point for the 8bpc weighted-mask blend that forwards to
/// `w_mask_422_8bpc_avx2_safe` (only with the `asm` feature on x86_64).
///
/// Builds a byte slice of `h * dst_stride` bytes from `dst_ptr`, forges a
/// `Desktop64` token and forwards the remaining arguments. `_bitdepth_max` and
/// `_dst` are accepted but not used here.
///
/// # Safety
///
/// `dst_ptr` must be valid for reads and writes of `h * dst_stride` bytes, since
/// a mutable slice of that length is created from it. Declared
/// `#[target_feature(enable = "avx2")]`, so the CPU must support AVX2.
///
/// NOTE(review): `dst_stride` is converted with `as usize`, so a negative stride
/// would wrap; presumably callers only pass non-negative strides. Confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn w_mask_422_8bpc_avx2(
dst_ptr: *mut DynPixel,
dst_stride: isize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
mask: &mut [u8; SEG_MASK_LEN],
sign: i32,
_bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
) {
// View the destination picture region as a flat byte slice.
let dst = unsafe {
std::slice::from_raw_parts_mut(dst_ptr as *mut u8, h as usize * dst_stride as usize)
};
w_mask_422_8bpc_avx2_safe(
Desktop64::forge_token_dangerously(),
dst,
dst_stride as usize,
tmp1,
tmp2,
w,
h,
mask,
sign,
)
}
/// `extern "C"` entry point for the 8bpc weighted-mask blend that forwards to
/// `w_mask_420_8bpc_avx2_safe` (only with the `asm` feature on x86_64).
///
/// Builds a byte slice of `h * dst_stride` bytes from `dst_ptr`, forges a
/// `Desktop64` token and forwards the remaining arguments. `_bitdepth_max` and
/// `_dst` are accepted but not used here.
///
/// # Safety
///
/// `dst_ptr` must be valid for reads and writes of `h * dst_stride` bytes, since
/// a mutable slice of that length is created from it. Declared
/// `#[target_feature(enable = "avx2")]`, so the CPU must support AVX2.
///
/// NOTE(review): `dst_stride` is converted with `as usize`, so a negative stride
/// would wrap; presumably callers only pass non-negative strides. Confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn w_mask_420_8bpc_avx2(
dst_ptr: *mut DynPixel,
dst_stride: isize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
mask: &mut [u8; SEG_MASK_LEN],
sign: i32,
_bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
) {
// View the destination picture region as a flat byte slice.
let dst = unsafe {
std::slice::from_raw_parts_mut(dst_ptr as *mut u8, h as usize * dst_stride as usize)
};
w_mask_420_8bpc_avx2_safe(
Desktop64::forge_token_dangerously(),
dst,
dst_stride as usize,
tmp1,
tmp2,
w,
h,
mask,
sign,
)
}
/// Scalar weighted-mask blend of two high-bit-depth intermediate buffers that
/// also emits the blend mask.
///
/// The bit depth is taken as 10 when `bitdepth_max == 1023` and 12 otherwise.
/// Per pixel the weight is
/// `m = min(38 + ((|tmp1 - tmp2| + mask_rnd) >> mask_sh), 64)` and the output is
/// `clamp((tmp1 * m + tmp2 * (64 - m) + rnd) >> sh, 0, bitdepth_max)`.
/// `tmp1` / `tmp2` are read as tightly packed rows of `w` entries. `dst` is a
/// byte buffer whose rows are `dst_stride` bytes apart; each row's first `2 * w`
/// bytes are reinterpreted as `u16` pixels (this panics if the conversion fails).
///
/// Mask layout by const parameter:
/// - `!SS_HOR`: one weight per pixel;
/// - `SS_HOR && !SS_VER`: `(m + n + 1 - sign) >> 1` for each horizontal pair;
/// - `SS_HOR && SS_VER`: even rows store the pair sum `m + n`; odd rows fold that
///   stored sum in and store `(m + n + prev + 2 - sign) >> 2`. The mask row only
///   advances after odd rows.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn w_mask_16bpc_avx2_safe_impl<const SS_HOR: bool, const SS_VER: bool>(
    dst: &mut [u8],
    dst_stride: usize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    mask: &mut [u8; SEG_MASK_LEN],
    sign: i32,
    bitdepth_max: i32,
) {
    let width = w as usize;
    let height = h as usize;
    let sign_bias = (sign as u8) as u16;
    let pixel_max = bitdepth_max;
    let bitdepth = if bitdepth_max == 1023 { 10u32 } else { 12u32 };
    let intermediate_bits = 14 - bitdepth;
    let sh = intermediate_bits + 6;
    let rnd = (32i32 << intermediate_bits) + 8192 * 64;
    let mask_sh = bitdepth + intermediate_bits - 4;
    let mask_rnd = 1u16 << (mask_sh - 5);
    let mask_row_len = if SS_HOR { width >> 1 } else { width };

    // Blend weight derived from the absolute difference of the two inputs.
    let weight_of = |a: i16, b: i16| -> u8 {
        let delta = a.abs_diff(b).saturating_add(mask_rnd);
        (38 + (delta >> mask_sh)).min(64) as u8
    };
    // Weighted average of the two inputs, rounded and clamped to the pixel range.
    let blend = |a: i16, b: i16, weight: u8| -> u16 {
        let wgt = weight as i32;
        ((a as i32 * wgt + b as i32 * (64 - wgt) + rnd) >> sh).clamp(0, pixel_max) as u16
    };

    let mut mask_base = 0usize;
    for row in 0..height {
        let a_row = &tmp1[row * width..][..width];
        let b_row = &tmp2[row * width..][..width];
        let out_bytes = &mut dst[row * dst_stride..][..width * 2];
        let out_row: &mut [u16] = zerocopy::FromBytes::mut_from_bytes(out_bytes).unwrap();
        let odd_row = row & 1 != 0;

        let mut col = 0;
        while col < width {
            let m = weight_of(a_row[col], b_row[col]);
            out_row[col] = blend(a_row[col], b_row[col], m);
            if SS_HOR {
                // Process the second pixel of the horizontal pair.
                col += 1;
                let n = weight_of(a_row[col], b_row[col]);
                out_row[col] = blend(a_row[col], b_row[col], n);

                let slot = mask_base + (col >> 1);
                mask[slot] = if SS_VER && odd_row {
                    let prev = mask[slot];
                    (((m as u16 + n as u16 + 2 - sign_bias) + prev as u16) >> 2) as u8
                } else if SS_VER {
                    m + n
                } else {
                    ((m as u16 + n as u16 + 1 - sign_bias) >> 1) as u8
                };
            } else {
                mask[mask_base + col] = m;
            }
            col += 1;
        }

        if !SS_VER || odd_row {
            mask_base += mask_row_len;
        }
    }
}
/// High-bit-depth weighted-mask blend with a full-resolution mask.
///
/// Instantiates `w_mask_16bpc_avx2_safe_impl` with `SS_HOR = false` and
/// `SS_VER = false` and forwards all arguments.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn w_mask_444_16bpc_avx2_safe(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    mask: &mut [u8; SEG_MASK_LEN],
    sign: i32,
    bitdepth_max: i32,
) {
    let mut wrapped = dst.flex_mut();
    let out: &mut [u8] = &mut *wrapped;
    w_mask_16bpc_avx2_safe_impl::<false, false>(
        out, dst_stride, tmp1, tmp2, w, h, mask, sign, bitdepth_max,
    );
}
/// High-bit-depth weighted-mask blend with a horizontally subsampled mask.
///
/// Instantiates `w_mask_16bpc_avx2_safe_impl` with `SS_HOR = true` and
/// `SS_VER = false` and forwards all arguments.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn w_mask_422_16bpc_avx2_safe(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    mask: &mut [u8; SEG_MASK_LEN],
    sign: i32,
    bitdepth_max: i32,
) {
    let mut wrapped = dst.flex_mut();
    let out: &mut [u8] = &mut *wrapped;
    w_mask_16bpc_avx2_safe_impl::<true, false>(
        out, dst_stride, tmp1, tmp2, w, h, mask, sign, bitdepth_max,
    );
}
/// High-bit-depth weighted-mask blend with a mask subsampled in both directions.
///
/// Instantiates `w_mask_16bpc_avx2_safe_impl` with `SS_HOR = true` and
/// `SS_VER = true` and forwards all arguments.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn w_mask_420_16bpc_avx2_safe(
    _token: Desktop64,
    dst: &mut [u8],
    dst_stride: usize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    mask: &mut [u8; SEG_MASK_LEN],
    sign: i32,
    bitdepth_max: i32,
) {
    let mut wrapped = dst.flex_mut();
    let out: &mut [u8] = &mut *wrapped;
    w_mask_16bpc_avx2_safe_impl::<true, true>(
        out, dst_stride, tmp1, tmp2, w, h, mask, sign, bitdepth_max,
    );
}
/// `extern "C"` entry point for the high-bit-depth weighted-mask blend that
/// forwards to `w_mask_444_16bpc_avx2_safe` (only with the `asm` feature on x86_64).
///
/// Builds a byte slice of `h * dst_stride` bytes from `dst_ptr`, forges a
/// `Desktop64` token and forwards the remaining arguments, including
/// `bitdepth_max`. `_dst` is accepted but not used here.
///
/// # Safety
///
/// `dst_ptr` must be valid for reads and writes of `h * dst_stride` bytes, since
/// a mutable slice of that length is created from it. Declared
/// `#[target_feature(enable = "avx2")]`, so the CPU must support AVX2.
///
/// NOTE(review): `dst_stride` is converted with `as usize`, so a negative stride
/// would wrap; presumably callers only pass non-negative strides. Confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn w_mask_444_16bpc_avx2(
dst_ptr: *mut DynPixel,
dst_stride: isize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
mask: &mut [u8; SEG_MASK_LEN],
sign: i32,
bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
) {
// View the destination picture region as a flat byte slice.
let dst = unsafe {
std::slice::from_raw_parts_mut(dst_ptr as *mut u8, h as usize * dst_stride as usize)
};
w_mask_444_16bpc_avx2_safe(
Desktop64::forge_token_dangerously(),
dst,
dst_stride as usize,
tmp1,
tmp2,
w,
h,
mask,
sign,
bitdepth_max,
)
}
/// `extern "C"` entry point for the high-bit-depth weighted-mask blend that
/// forwards to `w_mask_422_16bpc_avx2_safe` (only with the `asm` feature on x86_64).
///
/// Builds a byte slice of `h * dst_stride` bytes from `dst_ptr`, forges a
/// `Desktop64` token and forwards the remaining arguments, including
/// `bitdepth_max`. `_dst` is accepted but not used here.
///
/// # Safety
///
/// `dst_ptr` must be valid for reads and writes of `h * dst_stride` bytes, since
/// a mutable slice of that length is created from it. Declared
/// `#[target_feature(enable = "avx2")]`, so the CPU must support AVX2.
///
/// NOTE(review): `dst_stride` is converted with `as usize`, so a negative stride
/// would wrap; presumably callers only pass non-negative strides. Confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn w_mask_422_16bpc_avx2(
dst_ptr: *mut DynPixel,
dst_stride: isize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
mask: &mut [u8; SEG_MASK_LEN],
sign: i32,
bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
) {
// View the destination picture region as a flat byte slice.
let dst = unsafe {
std::slice::from_raw_parts_mut(dst_ptr as *mut u8, h as usize * dst_stride as usize)
};
w_mask_422_16bpc_avx2_safe(
Desktop64::forge_token_dangerously(),
dst,
dst_stride as usize,
tmp1,
tmp2,
w,
h,
mask,
sign,
bitdepth_max,
)
}
/// `extern "C"` entry point for the high-bit-depth weighted-mask blend that
/// forwards to `w_mask_420_16bpc_avx2_safe` (only with the `asm` feature on x86_64).
///
/// Builds a byte slice of `h * dst_stride` bytes from `dst_ptr`, forges a
/// `Desktop64` token and forwards the remaining arguments, including
/// `bitdepth_max`. `_dst` is accepted but not used here.
///
/// # Safety
///
/// `dst_ptr` must be valid for reads and writes of `h * dst_stride` bytes, since
/// a mutable slice of that length is created from it. Declared
/// `#[target_feature(enable = "avx2")]`, so the CPU must support AVX2.
///
/// NOTE(review): `dst_stride` is converted with `as usize`, so a negative stride
/// would wrap; presumably callers only pass non-negative strides. Confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn w_mask_420_16bpc_avx2(
dst_ptr: *mut DynPixel,
dst_stride: isize,
tmp1: &[i16; COMPINTER_LEN],
tmp2: &[i16; COMPINTER_LEN],
w: i32,
h: i32,
mask: &mut [u8; SEG_MASK_LEN],
sign: i32,
bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
) {
// View the destination picture region as a flat byte slice.
let dst = unsafe {
std::slice::from_raw_parts_mut(dst_ptr as *mut u8, h as usize * dst_stride as usize)
};
w_mask_420_16bpc_avx2_safe(
Desktop64::forge_token_dangerously(),
dst,
dst_stride as usize,
tmp1,
tmp2,
w,
h,
mask,
sign,
bitdepth_max,
)
}
/// Horizontal bilinear filter of one row of 16-bit pixels into an `i32`
/// intermediate row (AVX2 path).
///
/// For every `col in 0..w` this writes the unrounded, unshifted sum
/// `dst[col] = (16 - mx) * src[col] + mx * src[col + 1]`, so `src` is read up to
/// index `w`. Groups of 8 pixels go through the SIMD loop; the remainder is
/// handled by the scalar loop at the end.
#[cfg(target_arch = "x86_64")]
#[rite]
fn h_bilin_16bpc_avx2_inner(_token: Desktop64, dst: &mut [i32], src: &[u16], w: usize, mx: i32) {
let mut dst = dst.flex_mut();
let src = src.flex();
// Broadcast weights for the left (`16 - mx`) and right (`mx`) neighbours.
let w0 = _mm256_set1_epi32(16 - mx);
let w1 = _mm256_set1_epi32(mx);
let mut col = 0usize;
while col + 8 <= w {
// Two overlapping 8-pixel loads: pixels col..col+8 and col+1..col+9.
let p0_7 = loadu_128!(<&[u16; 8]>::try_from(&src[col..col + 8]).unwrap()); let p1_8 = loadu_128!(<&[u16; 8]>::try_from(&src[col + 1..col + 9]).unwrap());
// Zero-extend the 16-bit pixels to 32 bits before multiplying.
let p0_lo = _mm256_cvtepu16_epi32(p0_7);
let p1_lo = _mm256_cvtepu16_epi32(p1_8);
let term0 = _mm256_mullo_epi32(p0_lo, w0);
let term1 = _mm256_mullo_epi32(p1_lo, w1);
let result = _mm256_add_epi32(term0, term1);
storeu_256!(
<&mut [i32; 8]>::try_from(&mut dst[col..col + 8]).unwrap(),
result
);
col += 8;
}
// Scalar tail; `16 * x0 + mx * (x1 - x0)` equals `(16 - mx) * x0 + mx * x1`.
while col < w {
let x0 = src[col] as i32;
let x1 = src[col + 1] as i32;
dst[col] = 16 * x0 + mx * (x1 - x0);
col += 1;
}
}
/// Raw-pointer shim around `h_bilin_16bpc_avx2_inner`, compiled only with the
/// `asm` feature on x86_64.
///
/// # Safety
///
/// Declared `#[target_feature(enable = "avx2")]`, so the CPU must support AVX2.
///
/// NOTE(review): `h_bilin_16bpc_avx2_inner` as declared in this file takes
/// `&mut [i32]` / `&[u16]`, while raw pointers are passed here. Confirm that this
/// compiles when the `asm` feature is enabled.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn h_bilin_16bpc_avx2(dst: *mut i32, src: *const u16, w: usize, mx: i32) {
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe { h_bilin_16bpc_avx2_inner(token, dst, src, w, mx) }
}
/// Vertical bilinear pass from the `i32` intermediate rows to 16-bit output
/// pixels (AVX2 path).
///
/// For every `col in 0..w` this writes
/// `dst[col] = clamp(((16 - my) * mid[y][col] + my * mid[y + 1][col] + rnd) >> sh, 0, max)`
/// with `rnd = (1 << sh) >> 1`. Groups of 8 pixels go through the SIMD loop; the
/// remainder is handled by the scalar loop at the end.
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_bilin_16bpc_avx2_inner(
_token: Desktop64,
dst: &mut [u16],
mid: &[[i32; MID_STRIDE]],
w: usize,
y: usize,
my: i32,
sh: i32,
max: i32,
) {
let mut dst = dst.flex_mut();
let w0 = _mm256_set1_epi32(16 - my);
let w1 = _mm256_set1_epi32(my);
// Round-to-nearest term for the arithmetic right shift by `sh`.
let rnd = _mm256_set1_epi32((1 << sh) >> 1);
let shift_count = _mm_cvtsi32_si128(sh);
let zero = _mm256_setzero_si256();
let max_val = _mm256_set1_epi32(max);
let mut col = 0usize;
while col + 8 <= w {
let r0 = loadu_256!(<&[i32; 8]>::try_from(&mid[y][col..col + 8]).unwrap());
let r1 = loadu_256!(<&[i32; 8]>::try_from(&mid[y + 1][col..col + 8]).unwrap());
let term0 = _mm256_mullo_epi32(r0, w0);
let term1 = _mm256_mullo_epi32(r1, w1);
let sum = _mm256_add_epi32(term0, term1);
let shifted = _mm256_sra_epi32(_mm256_add_epi32(sum, rnd), shift_count);
let clamped = _mm256_min_epi32(_mm256_max_epi32(shifted, zero), max_val);
// packus works per 128-bit lane, leaving the eight results in 64-bit
// elements 0 and 2; the permute moves those into the low 128 bits.
let packed = _mm256_packus_epi32(clamped, zero);
let result = _mm256_permute4x64_epi64(packed, 0b00_00_10_00);
storeu_128!(
<&mut [u16; 8]>::try_from(&mut dst[col..col + 8]).unwrap(),
_mm256_castsi256_si128(result)
);
col += 8;
}
// Scalar tail; `16 * r0 + my * (r1 - r0)` equals `(16 - my) * r0 + my * r1`.
while col < w {
let r0 = mid[y][col];
let r1 = mid[y + 1][col];
let pixel = 16 * r0 + my * (r1 - r0);
let r = (1 << sh) >> 1;
let val = ((pixel + r) >> sh).clamp(0, max);
dst[col] = val as u16;
col += 1;
}
}
/// Raw-pointer shim around `v_bilin_16bpc_avx2_inner`, compiled only with the
/// `asm` feature on x86_64.
///
/// # Safety
///
/// Declared `#[target_feature(enable = "avx2")]`, so the CPU must support AVX2.
///
/// NOTE(review): `v_bilin_16bpc_avx2_inner` as declared in this file takes
/// `dst: &mut [u16]`, while a raw `*mut u16` is passed here. Confirm that this
/// compiles when the `asm` feature is enabled.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn v_bilin_16bpc_avx2(
dst: *mut u16,
mid: &[[i32; MID_STRIDE]],
w: usize,
y: usize,
my: i32,
sh: i32,
max: i32,
) {
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe { v_bilin_16bpc_avx2_inner(token, dst, mid, w, y, my, sh, max) }
}
/// Vertical bilinear pass from the `i32` intermediate rows to biased `i16`
/// `prep` output (AVX2 path).
///
/// For every `col in 0..w` this writes
/// `dst[col] = (((16 - my) * mid[y][col] + my * mid[y + 1][col] + rnd) >> sh) - prep_bias`
/// with `rnd = (1 << sh) >> 1`. Groups of 8 values go through the SIMD loop; the
/// remainder is handled by the scalar loop at the end.
///
/// Note that the SIMD path narrows with signed saturation (`packs`) while the
/// scalar tail narrows with a plain `as i16` cast; the two agree whenever the
/// biased value fits in `i16`.
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_bilin_16bpc_prep_avx2_inner(
_token: Desktop64,
dst: &mut [i16],
mid: &[[i32; MID_STRIDE]],
w: usize,
y: usize,
my: i32,
sh: i32,
prep_bias: i32,
) {
let mut dst = dst.flex_mut();
let w0 = _mm256_set1_epi32(16 - my);
let w1 = _mm256_set1_epi32(my);
// Round-to-nearest term for the arithmetic right shift by `sh`.
let rnd = _mm256_set1_epi32((1 << sh) >> 1);
let shift_count = _mm_cvtsi32_si128(sh);
let bias = _mm256_set1_epi32(prep_bias);
let mut col = 0usize;
while col + 8 <= w {
let r0 = loadu_256!(<&[i32; 8]>::try_from(&mid[y][col..col + 8]).unwrap());
let r1 = loadu_256!(<&[i32; 8]>::try_from(&mid[y + 1][col..col + 8]).unwrap());
let term0 = _mm256_mullo_epi32(r0, w0);
let term1 = _mm256_mullo_epi32(r1, w1);
let sum = _mm256_add_epi32(term0, term1);
let shifted = _mm256_sra_epi32(_mm256_add_epi32(sum, rnd), shift_count);
let biased = _mm256_sub_epi32(shifted, bias);
// packs works per 128-bit lane, so 64-bit elements 0 and 2 hold the eight
// results; the permute moves those into the low 128 bits.
let packed = _mm256_packs_epi32(biased, biased);
let result = _mm256_permute4x64_epi64(packed, 0b00_00_10_00);
storeu_128!(
<&mut [i16; 8]>::try_from(&mut dst[col..col + 8]).unwrap(),
_mm256_castsi256_si128(result)
);
col += 8;
}
// Scalar tail; `16 * r0 + my * (r1 - r0)` equals `(16 - my) * r0 + my * r1`.
while col < w {
let r0 = mid[y][col];
let r1 = mid[y + 1][col];
let pixel = 16 * r0 + my * (r1 - r0);
let r = (1 << sh) >> 1;
let val = ((pixel + r) >> sh) - prep_bias;
dst[col] = val as i16;
col += 1;
}
}
/// Raw-pointer shim around `v_bilin_16bpc_prep_avx2_inner`, compiled only with
/// the `asm` feature on x86_64.
///
/// # Safety
///
/// Declared `#[target_feature(enable = "avx2")]`, so the CPU must support AVX2.
///
/// NOTE(review): `v_bilin_16bpc_prep_avx2_inner` as declared in this file takes
/// `dst: &mut [i16]`, while a raw `*mut i16` is passed here. Confirm that this
/// compiles when the `asm` feature is enabled.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn v_bilin_16bpc_prep_avx2(
dst: *mut i16,
mid: &[[i32; MID_STRIDE]],
w: usize,
y: usize,
my: i32,
sh: i32,
prep_bias: i32,
) {
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe { v_bilin_16bpc_prep_avx2_inner(token, dst, mid, w, y, my, sh, prep_bias) }
}
/// Horizontal-only bilinear `put` of one row of 16-bit pixels (AVX2 path).
///
/// For every `col in 0..w` this writes
/// `dst[col] = clamp(((16 - mx) * src[col] + mx * src[col + 1] + 8) >> 4, 0, bd_max)`,
/// so `src` is read up to index `w`. Groups of 8 pixels go through the SIMD loop;
/// the remainder is handled by the scalar loop at the end.
#[cfg(target_arch = "x86_64")]
#[rite]
fn h_bilin_16bpc_put_avx2_inner(
_token: Desktop64,
dst: &mut [u16],
src: &[u16],
w: usize,
mx: i32,
bd_max: i32,
) {
let mut dst = dst.flex_mut();
let src = src.flex();
let w0 = _mm256_set1_epi32(16 - mx);
let w1 = _mm256_set1_epi32(mx);
// Rounding term and shift count for the final `(sum + 8) >> 4`.
let rnd = _mm256_set1_epi32(8);
let shift_count = _mm_cvtsi32_si128(4);
let zero = _mm256_setzero_si256();
let max_val = _mm256_set1_epi32(bd_max);
let mut col = 0usize;
while col + 8 <= w {
// Two overlapping 8-pixel loads, zero-extended to 32 bits.
let p0 = _mm256_cvtepu16_epi32(loadu_128!(
<&[u16; 8]>::try_from(&src[col..col + 8]).unwrap()
));
let p1 = _mm256_cvtepu16_epi32(loadu_128!(
<&[u16; 8]>::try_from(&src[col + 1..col + 9]).unwrap()
));
let term0 = _mm256_mullo_epi32(p0, w0);
let term1 = _mm256_mullo_epi32(p1, w1);
let sum = _mm256_add_epi32(term0, term1);
let shifted = _mm256_sra_epi32(_mm256_add_epi32(sum, rnd), shift_count);
let clamped = _mm256_min_epi32(_mm256_max_epi32(shifted, zero), max_val);
// packus works per 128-bit lane, leaving the eight results in 64-bit
// elements 0 and 2; the permute moves those into the low 128 bits.
let packed = _mm256_packus_epi32(clamped, zero);
let result = _mm256_permute4x64_epi64(packed, 0b00_00_10_00);
storeu_128!(
<&mut [u16; 8]>::try_from(&mut dst[col..col + 8]).unwrap(),
_mm256_castsi256_si128(result)
);
col += 8;
}
// Scalar tail for the last `w % 8` pixels; same formula as the SIMD loop.
while col < w {
let x0 = src[col] as i32;
let x1 = src[col + 1] as i32;
let pixel = (16 - mx) * x0 + mx * x1;
let result = ((pixel + 8) >> 4).clamp(0, bd_max);
dst[col] = result as u16;
col += 1;
}
}
/// Raw-pointer shim around `h_bilin_16bpc_put_avx2_inner`, compiled only with
/// the `asm` feature on x86_64.
///
/// # Safety
///
/// Declared `#[target_feature(enable = "avx2")]`, so the CPU must support AVX2.
///
/// NOTE(review): `h_bilin_16bpc_put_avx2_inner` as declared in this file takes
/// `&mut [u16]` / `&[u16]`, while raw pointers are passed here. Confirm that this
/// compiles when the `asm` feature is enabled.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn h_bilin_16bpc_put_avx2(dst: *mut u16, src: *const u16, w: usize, mx: i32, bd_max: i32) {
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe { h_bilin_16bpc_put_avx2_inner(token, dst, src, w, mx, bd_max) }
}
/// Vertical-only bilinear `put` of one row of 16-bit pixels (AVX2 path).
///
/// `src` starts at the upper row; the lower row is read at an offset of
/// `src_stride` elements within the same slice. For every `col in 0..w` this writes
/// `dst[col] = clamp(((16 - my) * src[col] + my * src[src_stride + col] + 8) >> 4, 0, bd_max)`.
/// Groups of 8 pixels go through the SIMD loop; the remainder is handled by the
/// scalar loop at the end.
///
/// NOTE(review): `src_stride` is converted with `as usize`, so a negative stride
/// would wrap; presumably callers only pass non-negative strides. Confirm.
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_bilin_16bpc_direct_avx2_inner(
_token: Desktop64,
dst: &mut [u16],
src: &[u16],
src_stride: isize,
w: usize,
my: i32,
bd_max: i32,
) {
let mut dst = dst.flex_mut();
let src = src.flex();
let w0 = _mm256_set1_epi32(16 - my);
let w1 = _mm256_set1_epi32(my);
// Rounding term and shift count for the final `(sum + 8) >> 4`.
let rnd = _mm256_set1_epi32(8);
let shift_count = _mm_cvtsi32_si128(4);
let zero = _mm256_setzero_si256();
let max_val = _mm256_set1_epi32(bd_max);
let mut col = 0usize;
while col + 8 <= w {
// Upper-row and lower-row pixels, zero-extended to 32 bits.
let p0 = _mm256_cvtepu16_epi32(loadu_128!(
<&[u16; 8]>::try_from(&src[col..col + 8]).unwrap()
));
let p1 = _mm256_cvtepu16_epi32(loadu_128!(
<&[u16; 8]>::try_from(&src[src_stride as usize + col..src_stride as usize + col + 8])
.unwrap()
));
let term0 = _mm256_mullo_epi32(p0, w0);
let term1 = _mm256_mullo_epi32(p1, w1);
let sum = _mm256_add_epi32(term0, term1);
let shifted = _mm256_sra_epi32(_mm256_add_epi32(sum, rnd), shift_count);
let clamped = _mm256_min_epi32(_mm256_max_epi32(shifted, zero), max_val);
// packus works per 128-bit lane, leaving the eight results in 64-bit
// elements 0 and 2; the permute moves those into the low 128 bits.
let packed = _mm256_packus_epi32(clamped, zero);
let result = _mm256_permute4x64_epi64(packed, 0b00_00_10_00);
storeu_128!(
<&mut [u16; 8]>::try_from(&mut dst[col..col + 8]).unwrap(),
_mm256_castsi256_si128(result)
);
col += 8;
}
// Scalar tail for the last `w % 8` pixels; same formula as the SIMD loop.
while col < w {
let x0 = src[col] as i32;
let x1 = src[src_stride as usize + col] as i32;
let pixel = (16 - my) * x0 + my * x1;
let result = ((pixel + 8) >> 4).clamp(0, bd_max);
dst[col] = result as u16;
col += 1;
}
}
/// Raw-pointer shim around `v_bilin_16bpc_direct_avx2_inner`, compiled only with
/// the `asm` feature on x86_64.
///
/// # Safety
///
/// Declared `#[target_feature(enable = "avx2")]`, so the CPU must support AVX2.
///
/// NOTE(review): `v_bilin_16bpc_direct_avx2_inner` as declared in this file takes
/// `&mut [u16]` / `&[u16]`, while raw pointers are passed here. Confirm that this
/// compiles when the `asm` feature is enabled.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn v_bilin_16bpc_direct_avx2(
dst: *mut u16,
src: *const u16,
src_stride: isize,
w: usize,
my: i32,
bd_max: i32,
) {
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe { v_bilin_16bpc_direct_avx2_inner(token, dst, src, src_stride, w, my, bd_max) }
}
/// Horizontal-only bilinear `prep` of one row of 16-bit pixels (AVX2 path).
///
/// For every `col in 0..w` this writes
/// `dst[col] = (((16 - mx) * src[col] + mx * src[col + 1] + 8) >> 4) - prep_bias`,
/// so `src` is read up to index `w`. Groups of 8 pixels go through the SIMD loop;
/// the remainder is handled by the scalar loop at the end.
///
/// The SIMD path narrows with signed saturation (`packs`) while the scalar tail
/// narrows with a plain `as i16` cast; the two agree whenever the biased value
/// fits in `i16`.
///
/// NOTE(review): the rounding shift is a fixed 4 and does not depend on the bit
/// depth. Confirm this is the scaling the callers expect for `prep` output.
#[cfg(target_arch = "x86_64")]
#[rite]
fn h_bilin_16bpc_prep_direct_avx2_inner(
_token: Desktop64,
dst: &mut [i16],
src: &[u16],
w: usize,
mx: i32,
prep_bias: i32,
) {
let mut dst = dst.flex_mut();
let src = src.flex();
let w0 = _mm256_set1_epi32(16 - mx);
let w1 = _mm256_set1_epi32(mx);
// Rounding term and shift count for `(sum + 8) >> 4`.
let rnd = _mm256_set1_epi32(8);
let shift_count = _mm_cvtsi32_si128(4);
let bias = _mm256_set1_epi32(prep_bias);
let mut col = 0usize;
while col + 8 <= w {
// Two overlapping 8-pixel loads, zero-extended to 32 bits.
let p0 = _mm256_cvtepu16_epi32(loadu_128!(
<&[u16; 8]>::try_from(&src[col..col + 8]).unwrap()
));
let p1 = _mm256_cvtepu16_epi32(loadu_128!(
<&[u16; 8]>::try_from(&src[col + 1..col + 9]).unwrap()
));
let term0 = _mm256_mullo_epi32(p0, w0);
let term1 = _mm256_mullo_epi32(p1, w1);
let sum = _mm256_add_epi32(term0, term1);
let shifted = _mm256_sra_epi32(_mm256_add_epi32(sum, rnd), shift_count);
let biased = _mm256_sub_epi32(shifted, bias);
// packs works per 128-bit lane, so 64-bit elements 0 and 2 hold the eight
// results; the permute moves those into the low 128 bits.
let packed = _mm256_packs_epi32(biased, biased);
let result = _mm256_permute4x64_epi64(packed, 0b00_00_10_00);
storeu_128!(
<&mut [i16; 8]>::try_from(&mut dst[col..col + 8]).unwrap(),
_mm256_castsi256_si128(result)
);
col += 8;
}
// Scalar tail for the last `w % 8` pixels; same formula as the SIMD loop.
while col < w {
let x0 = src[col] as i32;
let x1 = src[col + 1] as i32;
let pixel = (16 - mx) * x0 + mx * x1;
let result = ((pixel + 8) >> 4) - prep_bias;
dst[col] = result as i16;
col += 1;
}
}
/// Raw-pointer shim around `h_bilin_16bpc_prep_direct_avx2_inner`, compiled only
/// with the `asm` feature on x86_64.
///
/// # Safety
///
/// Declared `#[target_feature(enable = "avx2")]`, so the CPU must support AVX2.
///
/// NOTE(review): `h_bilin_16bpc_prep_direct_avx2_inner` as declared in this file
/// takes `&mut [i16]` / `&[u16]`, while raw pointers are passed here. Confirm
/// that this compiles when the `asm` feature is enabled.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn h_bilin_16bpc_prep_direct_avx2(
dst: *mut i16,
src: *const u16,
w: usize,
mx: i32,
prep_bias: i32,
) {
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe { h_bilin_16bpc_prep_direct_avx2_inner(token, dst, src, w, mx, prep_bias) }
}
/// Vertical-only bilinear `prep` of one row of 16-bit pixels (AVX2 path).
///
/// `src` starts at the upper row; the lower row is read at an offset of
/// `src_stride` elements within the same slice. For every `col in 0..w` this writes
/// `dst[col] = (((16 - my) * src[col] + my * src[src_stride + col] + 8) >> 4) - prep_bias`.
/// Groups of 8 pixels go through the SIMD loop; the remainder is handled by the
/// scalar loop at the end.
///
/// The SIMD path narrows with signed saturation (`packs`) while the scalar tail
/// narrows with a plain `as i16` cast; the two agree whenever the biased value
/// fits in `i16`.
///
/// NOTE(review): the rounding shift is a fixed 4 and does not depend on the bit
/// depth. Confirm this is the scaling the callers expect for `prep` output.
/// NOTE(review): `src_stride` is converted with `as usize`, so a negative stride
/// would wrap; presumably callers only pass non-negative strides. Confirm.
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_bilin_16bpc_prep_direct_avx2_inner(
_token: Desktop64,
dst: &mut [i16],
src: &[u16],
src_stride: isize,
w: usize,
my: i32,
prep_bias: i32,
) {
let mut dst = dst.flex_mut();
let src = src.flex();
let w0 = _mm256_set1_epi32(16 - my);
let w1 = _mm256_set1_epi32(my);
// Rounding term and shift count for `(sum + 8) >> 4`.
let rnd = _mm256_set1_epi32(8);
let shift_count = _mm_cvtsi32_si128(4);
let bias = _mm256_set1_epi32(prep_bias);
let mut col = 0usize;
while col + 8 <= w {
// Upper-row and lower-row pixels, zero-extended to 32 bits.
let p0 = _mm256_cvtepu16_epi32(loadu_128!(
<&[u16; 8]>::try_from(&src[col..col + 8]).unwrap()
));
let p1 = _mm256_cvtepu16_epi32(loadu_128!(
<&[u16; 8]>::try_from(&src[src_stride as usize + col..src_stride as usize + col + 8])
.unwrap()
));
let term0 = _mm256_mullo_epi32(p0, w0);
let term1 = _mm256_mullo_epi32(p1, w1);
let sum = _mm256_add_epi32(term0, term1);
let shifted = _mm256_sra_epi32(_mm256_add_epi32(sum, rnd), shift_count);
let biased = _mm256_sub_epi32(shifted, bias);
// packs works per 128-bit lane, so 64-bit elements 0 and 2 hold the eight
// results; the permute moves those into the low 128 bits.
let packed = _mm256_packs_epi32(biased, biased);
let result = _mm256_permute4x64_epi64(packed, 0b00_00_10_00);
storeu_128!(
<&mut [i16; 8]>::try_from(&mut dst[col..col + 8]).unwrap(),
_mm256_castsi256_si128(result)
);
col += 8;
}
// Scalar tail for the last `w % 8` pixels; same formula as the SIMD loop.
while col < w {
let x0 = src[col] as i32;
let x1 = src[src_stride as usize + col] as i32;
let pixel = (16 - my) * x0 + my * x1;
let result = ((pixel + 8) >> 4) - prep_bias;
dst[col] = result as i16;
col += 1;
}
}
/// Raw-pointer entry point for the vertical-only 16bpc bilinear `prep` row
/// kernel (AVX2).
///
/// Forges a `Desktop64` token and forwards every argument unchanged to
/// `v_bilin_16bpc_prep_direct_avx2_inner`.
///
/// # Safety
///
/// The token is forged rather than summoned, so no runtime CPU detection
/// happens here; the caller is responsible for AVX2 being available.
///
/// NOTE(review): `v_bilin_16bpc_prep_direct_avx2_inner` is declared in this
/// file with slice parameters (`&mut [i16]`, `&[u16]`), while this wrapper
/// passes raw pointers — confirm that this `asm`-gated path still compiles.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn v_bilin_16bpc_prep_direct_avx2(
dst: *mut i16,
src: *const u16,
src_stride: isize,
w: usize,
my: i32,
prep_bias: i32,
) {
// No CPU check is performed: the capability token is created unconditionally.
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe { v_bilin_16bpc_prep_direct_avx2_inner(token, dst, src, src_stride, w, my, prep_bias) }
}
/// Shared body of the 16bpc bilinear `put` entry points (AVX2, raw-pointer
/// variant).
///
/// Produces `h` rows of `w` pixels at `dst_ptr` from `src_ptr`, choosing the
/// kernel from which of `mx` / `my` are non-zero:
///
/// * both: horizontal pass into a scratch `mid` buffer (`h + 1` rows), then a
///   vertical pass over `mid` (shift 8, `bitdepth_max` forwarded);
/// * only `mx`: horizontal put kernel per row;
/// * only `my`: direct vertical kernel per row;
/// * neither: each row is copied with `copy_nonoverlapping`.
///
/// Both strides are halved before being used as `u16` element offsets.
///
/// # Safety
///
/// NOTE(review): the pointers are offset and handed on without any checks, so
/// they presumably must be valid for every row touched, and source and
/// destination must not overlap in the plain-copy case — confirm with callers.
#[cfg(target_arch = "x86_64")]
#[cfg(feature = "asm")]
#[arcane]
unsafe fn put_bilin_16bpc_avx2_inner(
    _token: Desktop64,
    dst_ptr: *mut DynPixel,
    dst_stride: isize,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
) {
    const INTERMEDIATE_BITS: i32 = 4;
    // Shift used by the vertical pass of the two-pass (h + v) path.
    let v_pass_sh = 4 + INTERMEDIATE_BITS;

    let cols = w as usize;
    let rows = h as usize;
    let dst_px = dst_ptr as *mut u16;
    let src_px = src_ptr as *const u16;
    let dst_step = dst_stride / 2;
    let src_step = src_stride / 2;

    unsafe {
        match (mx != 0, my != 0) {
            (true, true) => {
                // One extra row is filtered horizontally so the vertical pass
                // can read row `y + 1` for the last output row.
                let mut mid = take_mid_i32_130();
                for y in 0..rows + 1 {
                    let src_row = src_px.offset(y as isize * src_step);
                    h_bilin_16bpc_avx2_inner(_token, &mut mid[y], src_row, cols, mx);
                }
                for y in 0..rows {
                    let dst_row = dst_px.offset(y as isize * dst_step);
                    v_bilin_16bpc_avx2_inner(
                        _token,
                        dst_row,
                        &*mid,
                        cols,
                        y,
                        my,
                        v_pass_sh,
                        bitdepth_max,
                    );
                }
                put_mid_i32_130(mid);
            }
            (true, false) => {
                for y in 0..rows {
                    let src_row = src_px.offset(y as isize * src_step);
                    let dst_row = dst_px.offset(y as isize * dst_step);
                    h_bilin_16bpc_put_avx2_inner(_token, dst_row, src_row, cols, mx, bitdepth_max);
                }
            }
            (false, true) => {
                for y in 0..rows {
                    let src_row = src_px.offset(y as isize * src_step);
                    let dst_row = dst_px.offset(y as isize * dst_step);
                    v_bilin_16bpc_direct_avx2_inner(
                        _token,
                        dst_row,
                        src_row,
                        src_step,
                        cols,
                        my,
                        bitdepth_max,
                    );
                }
            }
            (false, false) => {
                for y in 0..rows {
                    let src_row = src_px.offset(y as isize * src_step);
                    let dst_row = dst_px.offset(y as isize * dst_step);
                    std::ptr::copy_nonoverlapping(src_row, dst_row, cols);
                }
            }
        }
    }
}
/// Internal AVX2 entry point for 16bpc bilinear `put`.
///
/// Forges a `Desktop64` token and forwards every argument unchanged to
/// `put_bilin_16bpc_avx2_inner`.
///
/// # Safety
///
/// The token is forged rather than summoned, so no runtime CPU detection
/// happens here; the caller is responsible for AVX2 being available.
/// NOTE(review): the pointer/stride requirements are presumably those of
/// `put_bilin_16bpc_avx2_inner` — confirm there.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn put_bilin_16bpc_avx2_impl(
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
) {
// No CPU check is performed: the capability token is created unconditionally.
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe {
put_bilin_16bpc_avx2_inner(
token,
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
)
}
}
/// `extern "C"` AVX2 entry point for 16bpc bilinear `put`.
///
/// Forges a `Desktop64` token and forwards the pixel pointers, strides,
/// dimensions, `mx` / `my` and `bitdepth_max` to `put_bilin_16bpc_avx2_inner`.
/// The trailing `_dst` / `_src` arguments are accepted but not used.
///
/// # Safety
///
/// The token is forged rather than summoned, so no runtime CPU detection
/// happens here; the caller is responsible for AVX2 being available.
/// NOTE(review): the pointer/stride requirements are presumably those of
/// `put_bilin_16bpc_avx2_inner` — confirm there.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn put_bilin_16bpc_avx2(
dst_ptr: *mut DynPixel,
dst_stride: isize,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
bitdepth_max: i32,
_dst: *const FFISafe<PicOffset>,
_src: *const FFISafe<PicOffset>,
) {
// No CPU check is performed: the capability token is created unconditionally.
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe {
put_bilin_16bpc_avx2_inner(
token,
dst_ptr,
dst_stride,
src_ptr,
src_stride,
w,
h,
mx,
my,
bitdepth_max,
)
}
}
/// Shared body of the 16bpc bilinear `prep` entry points (AVX2, raw-pointer
/// variant).
///
/// Fills `tmp` with `h` rows of `w` values each (row `y` starts at
/// `tmp + y * w`), choosing the kernel from which of `mx` / `my` are non-zero:
///
/// * both: horizontal pass into a scratch `mid` buffer (`h + 1` rows), then a
///   vertical pass over `mid` with a shift of 8;
/// * only `mx`: direct horizontal kernel per row;
/// * only `my`: direct vertical kernel per row;
/// * neither: every source pixel is stored as `pixel - prep_bias`.
///
/// `src_stride` is halved before being used as a `u16` element offset.
///
/// # Safety
///
/// NOTE(review): `tmp` and `src_ptr` are offset and accessed without checks,
/// so they presumably must be valid for every row touched — confirm with
/// callers.
///
/// NOTE(review): the row kernels called here are declared in this file with
/// slice parameters (e.g. `v_bilin_16bpc_prep_direct_avx2_inner` takes
/// `&mut [i16]` / `&[u16]`), while this function hands them raw pointers —
/// confirm that this `asm`-gated path still compiles.
#[cfg(target_arch = "x86_64")]
#[cfg(feature = "asm")]
#[arcane]
unsafe fn prep_bilin_16bpc_avx2_inner(
    _token: Desktop64,
    tmp: *mut i16,
    src_ptr: *const DynPixel,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
) {
    let w = w as usize;
    let h = h as usize;
    let src = src_ptr as *const u16;
    let src_stride = src_stride / 2;
    let prep_bias = 8192i32;
    let intermediate_bits = 4i32;
    let v_pass_sh = 4 + intermediate_bits;
    unsafe {
        if mx != 0 {
            if my != 0 {
                // One extra row is filtered horizontally so the vertical pass
                // can read row `y + 1` for the last output row.
                let tmp_h = h + 1;
                let mut mid = take_mid_i32_130();
                for y in 0..tmp_h {
                    let src_row = src.offset(y as isize * src_stride);
                    h_bilin_16bpc_avx2_inner(_token, &mut mid[y], src_row, w, mx);
                }
                for y in 0..h {
                    let dst_row = tmp.add(y * w);
                    v_bilin_16bpc_prep_avx2_inner(
                        _token, dst_row, &*mid, w, y, my, v_pass_sh, prep_bias,
                    );
                }
                put_mid_i32_130(mid);
            } else {
                for y in 0..h {
                    let src_row = src.offset(y as isize * src_stride);
                    let dst_row = tmp.add(y * w);
                    h_bilin_16bpc_prep_direct_avx2_inner(
                        _token, dst_row, src_row, w, mx, prep_bias,
                    );
                }
            }
        } else if my != 0 {
            for y in 0..h {
                let src_row = src.offset(y as isize * src_stride);
                let dst_row = tmp.add(y * w);
                v_bilin_16bpc_prep_direct_avx2_inner(
                    _token, dst_row, src_row, src_stride, w, my, prep_bias,
                );
            }
        } else {
            // No sub-pixel offset in either direction: only the bias is applied.
            // This must be its own branch; running it after the vertical filter
            // would overwrite the filtered rows with unfiltered data.
            for y in 0..h {
                let src_row = src.offset(y as isize * src_stride);
                let dst_row = tmp.add(y * w);
                for x in 0..w {
                    // Raw pointers cannot be indexed; read through `add`.
                    let pixel = *src_row.add(x) as i32;
                    *dst_row.add(x) = (pixel - prep_bias) as i16;
                }
            }
        }
    }
}
/// Internal AVX2 entry point for 16bpc bilinear `prep`.
///
/// Forges a `Desktop64` token and forwards every argument unchanged to
/// `prep_bilin_16bpc_avx2_inner`.
///
/// # Safety
///
/// The token is forged rather than summoned, so no runtime CPU detection
/// happens here; the caller is responsible for AVX2 being available.
/// NOTE(review): the pointer/stride requirements are presumably those of
/// `prep_bilin_16bpc_avx2_inner` — confirm there.
#[cfg(feature = "asm")]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn prep_bilin_16bpc_avx2_impl(
tmp: *mut i16,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
) {
// No CPU check is performed: the capability token is created unconditionally.
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe { prep_bilin_16bpc_avx2_inner(token, tmp, src_ptr, src_stride, w, h, mx, my) }
}
/// `extern "C"` AVX2 entry point for 16bpc bilinear `prep`.
///
/// Forges a `Desktop64` token and forwards `tmp`, the source pointer/stride,
/// the dimensions and `mx` / `my` to `prep_bilin_16bpc_avx2_inner`.
/// `_bitdepth_max` and `_src` are accepted but not used.
///
/// # Safety
///
/// The token is forged rather than summoned, so no runtime CPU detection
/// happens here; the caller is responsible for AVX2 being available.
/// NOTE(review): the pointer/stride requirements are presumably those of
/// `prep_bilin_16bpc_avx2_inner` — confirm there.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn prep_bilin_16bpc_avx2(
tmp: *mut i16,
src_ptr: *const DynPixel,
src_stride: isize,
w: i32,
h: i32,
mx: i32,
my: i32,
_bitdepth_max: i32,
_src: *const FFISafe<PicOffset>,
) {
// No CPU check is performed: the capability token is created unconditionally.
let token = unsafe { Desktop64::forge_token_dangerously() };
unsafe { prep_bilin_16bpc_avx2_inner(token, tmp, src_ptr, src_stride, w, h, mx, my) }
}
/// Horizontal 16bpc bilinear pass for one row at full intermediate precision
/// (AVX-512).
///
/// Writes `(16 - mx) * src[col] + mx * src[col + 1]` into `dst[col]` for every
/// `col < w`; no rounding or shift is applied. Sixteen columns are handled per
/// vector iteration, the rest by a scalar loop that evaluates the same sum
/// written as `16 * x0 + mx * (x1 - x0)`.
///
/// `src` is read up to index `w`, one sample past the last output column.
#[cfg(target_arch = "x86_64")]
#[rite]
fn h_bilin_16bpc_avx512_inner(_token: Server64, dst: &mut [i32], src: &[u16], w: usize, mx: i32) {
    let mut dst = dst.flex_mut();
    let src = src.flex();
    let weight_left = _mm512_set1_epi32(16 - mx);
    let weight_right = _mm512_set1_epi32(mx);

    let mut col = 0usize;
    while col + 16 <= w {
        let left = _mm512_cvtepu16_epi32(loadu_256!(
            <&[u16; 16]>::try_from(&src[col..col + 16]).unwrap()
        ));
        let right = _mm512_cvtepu16_epi32(loadu_256!(
            <&[u16; 16]>::try_from(&src[col + 1..col + 17]).unwrap()
        ));
        let blended = _mm512_add_epi32(
            _mm512_mullo_epi32(left, weight_left),
            _mm512_mullo_epi32(right, weight_right),
        );
        storeu_512!(&mut dst[col..col + 16], [i32; 16], blended);
        col += 16;
    }

    // Scalar tail for the columns that do not fill a whole vector.
    for col in col..w {
        let x0 = src[col] as i32;
        let x1 = src[col + 1] as i32;
        dst[col] = 16 * x0 + mx * (x1 - x0);
    }
}
/// Vertical 16bpc bilinear pass over the intermediate buffer, producing one
/// row of output pixels (AVX-512).
///
/// For every `col < w`, blends `mid[y][col]` and `mid[y + 1][col]` with the
/// weights `16 - my` and `my`, adds the rounding term `(1 << sh) >> 1`, shifts
/// right arithmetically by `sh`, clamps to `0..=max` and stores the value in
/// `dst[col]`.
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_bilin_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u16],
    mid: &[[i32; MID_STRIDE]],
    w: usize,
    y: usize,
    my: i32,
    sh: i32,
    max: i32,
) {
    let mut dst = dst.flex_mut();
    let weight_top = _mm512_set1_epi32(16 - my);
    let weight_bottom = _mm512_set1_epi32(my);
    // Rounding term: half of the divisor implied by the shift.
    let half = (1 << sh) >> 1;
    let round = _mm512_set1_epi32(half);
    let shift = _mm_cvtsi32_si128(sh);
    let floor = _mm512_setzero_si512();
    let ceiling = _mm512_set1_epi32(max);

    let mut col = 0usize;
    while col + 16 <= w {
        let top = loadu_512!(&mid[y][col..col + 16], [i32; 16]);
        let bottom = loadu_512!(&mid[y + 1][col..col + 16], [i32; 16]);
        let blended = _mm512_add_epi32(
            _mm512_mullo_epi32(top, weight_top),
            _mm512_mullo_epi32(bottom, weight_bottom),
        );
        let rounded = _mm512_sra_epi32(_mm512_add_epi32(blended, round), shift);
        let in_range = _mm512_min_epi32(_mm512_max_epi32(rounded, floor), ceiling);
        let narrowed = _mm512_cvtusepi32_epi16(in_range);
        storeu_256!(
            <&mut [u16; 16]>::try_from(&mut dst[col..col + 16]).unwrap(),
            narrowed
        );
        col += 16;
    }

    // Scalar tail for the columns that do not fill a whole vector.
    for col in col..w {
        let top = mid[y][col];
        let bottom = mid[y + 1][col];
        let blended = 16 * top + my * (bottom - top);
        dst[col] = ((blended + half) >> sh).clamp(0, max) as u16;
    }
}
/// Vertical 16bpc bilinear pass over the intermediate buffer, producing one
/// row of biased `prep` output (AVX-512).
///
/// For every `col < w`, blends `mid[y][col]` and `mid[y + 1][col]` with the
/// weights `16 - my` and `my`, adds the rounding term `(1 << sh) >> 1`, shifts
/// right arithmetically by `sh`, subtracts `prep_bias` and stores the value in
/// `dst[col]`.
///
/// The vector path narrows to `i16` with signed saturation; the scalar tail
/// narrows with an `as i16` cast.
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_bilin_16bpc_prep_avx512_inner(
    _token: Server64,
    dst: &mut [i16],
    mid: &[[i32; MID_STRIDE]],
    w: usize,
    y: usize,
    my: i32,
    sh: i32,
    prep_bias: i32,
) {
    let mut dst = dst.flex_mut();
    let weight_top = _mm512_set1_epi32(16 - my);
    let weight_bottom = _mm512_set1_epi32(my);
    // Rounding term: half of the divisor implied by the shift.
    let half = (1 << sh) >> 1;
    let round = _mm512_set1_epi32(half);
    let shift = _mm_cvtsi32_si128(sh);
    let bias_v = _mm512_set1_epi32(prep_bias);

    let mut col = 0usize;
    while col + 16 <= w {
        let top = loadu_512!(&mid[y][col..col + 16], [i32; 16]);
        let bottom = loadu_512!(&mid[y + 1][col..col + 16], [i32; 16]);
        let blended = _mm512_add_epi32(
            _mm512_mullo_epi32(top, weight_top),
            _mm512_mullo_epi32(bottom, weight_bottom),
        );
        let rounded = _mm512_sra_epi32(_mm512_add_epi32(blended, round), shift);
        let unbiased = _mm512_sub_epi32(rounded, bias_v);
        let narrowed = _mm512_cvtsepi32_epi16(unbiased);
        storeu_256!(
            <&mut [i16; 16]>::try_from(&mut dst[col..col + 16]).unwrap(),
            narrowed
        );
        col += 16;
    }

    // Scalar tail for the columns that do not fill a whole vector.
    for col in col..w {
        let top = mid[y][col];
        let bottom = mid[y + 1][col];
        let blended = 16 * top + my * (bottom - top);
        dst[col] = (((blended + half) >> sh) - prep_bias) as i16;
    }
}
/// Horizontal-only 16bpc bilinear `put` for one row (AVX-512).
///
/// For every `col < w`, blends `src[col]` and `src[col + 1]` with the weights
/// `16 - mx` and `mx`, rounds with `(+8) >> 4`, clamps to `0..=bd_max` and
/// stores the pixel in `dst[col]`.
///
/// `src` is read up to index `w`, one sample past the last output column.
#[cfg(target_arch = "x86_64")]
#[rite]
fn h_bilin_16bpc_put_avx512_inner(
    _token: Server64,
    dst: &mut [u16],
    src: &[u16],
    w: usize,
    mx: i32,
    bd_max: i32,
) {
    let mut dst = dst.flex_mut();
    let src = src.flex();
    let weight_left = _mm512_set1_epi32(16 - mx);
    let weight_right = _mm512_set1_epi32(mx);
    let round = _mm512_set1_epi32(8);
    let shift = _mm_cvtsi32_si128(4);
    let floor = _mm512_setzero_si512();
    let ceiling = _mm512_set1_epi32(bd_max);

    let mut col = 0usize;
    while col + 16 <= w {
        let left = _mm512_cvtepu16_epi32(loadu_256!(
            <&[u16; 16]>::try_from(&src[col..col + 16]).unwrap()
        ));
        let right = _mm512_cvtepu16_epi32(loadu_256!(
            <&[u16; 16]>::try_from(&src[col + 1..col + 17]).unwrap()
        ));
        let blended = _mm512_add_epi32(
            _mm512_mullo_epi32(left, weight_left),
            _mm512_mullo_epi32(right, weight_right),
        );
        let rounded = _mm512_sra_epi32(_mm512_add_epi32(blended, round), shift);
        let in_range = _mm512_min_epi32(_mm512_max_epi32(rounded, floor), ceiling);
        let narrowed = _mm512_cvtusepi32_epi16(in_range);
        storeu_256!(
            <&mut [u16; 16]>::try_from(&mut dst[col..col + 16]).unwrap(),
            narrowed
        );
        col += 16;
    }

    // Scalar tail for the columns that do not fill a whole vector.
    for col in col..w {
        let left = src[col] as i32;
        let right = src[col + 1] as i32;
        let blended = (16 - mx) * left + mx * right;
        dst[col] = ((blended + 8) >> 4).clamp(0, bd_max) as u16;
    }
}
/// Vertical-only 16bpc bilinear `put` for one row, reading the source
/// directly (AVX-512).
///
/// For every `col < w`, blends `src[col]` with the sample one row below
/// (`src[src_stride + col]`) using the weights `16 - my` and `my`, rounds with
/// `(+8) >> 4`, clamps to `0..=bd_max` and stores the pixel in `dst[col]`.
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_bilin_16bpc_direct_avx512_inner(
    _token: Server64,
    dst: &mut [u16],
    src: &[u16],
    src_stride: isize,
    w: usize,
    my: i32,
    bd_max: i32,
) {
    let mut dst = dst.flex_mut();
    let src = src.flex();
    // Element offset of the row below the current one.
    let below = src_stride as usize;

    let weight_top = _mm512_set1_epi32(16 - my);
    let weight_bottom = _mm512_set1_epi32(my);
    let round = _mm512_set1_epi32(8);
    let shift = _mm_cvtsi32_si128(4);
    let floor = _mm512_setzero_si512();
    let ceiling = _mm512_set1_epi32(bd_max);

    let mut col = 0usize;
    while col + 16 <= w {
        let top = _mm512_cvtepu16_epi32(loadu_256!(
            <&[u16; 16]>::try_from(&src[col..col + 16]).unwrap()
        ));
        let bottom = _mm512_cvtepu16_epi32(loadu_256!(
            <&[u16; 16]>::try_from(&src[below + col..below + col + 16]).unwrap()
        ));
        let blended = _mm512_add_epi32(
            _mm512_mullo_epi32(top, weight_top),
            _mm512_mullo_epi32(bottom, weight_bottom),
        );
        let rounded = _mm512_sra_epi32(_mm512_add_epi32(blended, round), shift);
        let in_range = _mm512_min_epi32(_mm512_max_epi32(rounded, floor), ceiling);
        let narrowed = _mm512_cvtusepi32_epi16(in_range);
        storeu_256!(
            <&mut [u16; 16]>::try_from(&mut dst[col..col + 16]).unwrap(),
            narrowed
        );
        col += 16;
    }

    // Scalar tail for the columns that do not fill a whole vector.
    for col in col..w {
        let top = src[col] as i32;
        let bottom = src[below + col] as i32;
        let blended = (16 - my) * top + my * bottom;
        dst[col] = ((blended + 8) >> 4).clamp(0, bd_max) as u16;
    }
}
/// Horizontal-only 16bpc bilinear `prep` for one row (AVX-512).
///
/// For every `col < w`, blends `src[col]` and `src[col + 1]` with the weights
/// `16 - mx` and `mx`, rounds with `(+8) >> 4`, subtracts `prep_bias` and
/// stores the value in `dst[col]`.
///
/// The vector path narrows to `i16` with signed saturation; the scalar tail
/// narrows with an `as i16` cast. `src` is read up to index `w`.
#[cfg(target_arch = "x86_64")]
#[rite]
fn h_bilin_16bpc_prep_direct_avx512_inner(
    _token: Server64,
    dst: &mut [i16],
    src: &[u16],
    w: usize,
    mx: i32,
    prep_bias: i32,
) {
    let mut dst = dst.flex_mut();
    let src = src.flex();
    let weight_left = _mm512_set1_epi32(16 - mx);
    let weight_right = _mm512_set1_epi32(mx);
    let round = _mm512_set1_epi32(8);
    let shift = _mm_cvtsi32_si128(4);
    let bias_v = _mm512_set1_epi32(prep_bias);

    let mut col = 0usize;
    while col + 16 <= w {
        let left = _mm512_cvtepu16_epi32(loadu_256!(
            <&[u16; 16]>::try_from(&src[col..col + 16]).unwrap()
        ));
        let right = _mm512_cvtepu16_epi32(loadu_256!(
            <&[u16; 16]>::try_from(&src[col + 1..col + 17]).unwrap()
        ));
        let blended = _mm512_add_epi32(
            _mm512_mullo_epi32(left, weight_left),
            _mm512_mullo_epi32(right, weight_right),
        );
        let rounded = _mm512_sra_epi32(_mm512_add_epi32(blended, round), shift);
        let unbiased = _mm512_sub_epi32(rounded, bias_v);
        let narrowed = _mm512_cvtsepi32_epi16(unbiased);
        storeu_256!(
            <&mut [i16; 16]>::try_from(&mut dst[col..col + 16]).unwrap(),
            narrowed
        );
        col += 16;
    }

    // Scalar tail for the columns that do not fill a whole vector.
    for col in col..w {
        let left = src[col] as i32;
        let right = src[col + 1] as i32;
        let blended = (16 - mx) * left + mx * right;
        dst[col] = (((blended + 8) >> 4) - prep_bias) as i16;
    }
}
/// Vertical-only 16bpc bilinear `prep` for one row, reading the source
/// directly (AVX-512).
///
/// For every `col < w`, blends `src[col]` with the sample one row below
/// (`src[src_stride + col]`) using the weights `16 - my` and `my`, rounds with
/// `(+8) >> 4`, subtracts `prep_bias` and stores the value in `dst[col]`.
///
/// The vector path narrows to `i16` with signed saturation; the scalar tail
/// narrows with an `as i16` cast.
#[cfg(target_arch = "x86_64")]
#[rite]
fn v_bilin_16bpc_prep_direct_avx512_inner(
    _token: Server64,
    dst: &mut [i16],
    src: &[u16],
    src_stride: isize,
    w: usize,
    my: i32,
    prep_bias: i32,
) {
    let mut dst = dst.flex_mut();
    let src = src.flex();
    // Element offset of the row below the current one.
    let below = src_stride as usize;

    let weight_top = _mm512_set1_epi32(16 - my);
    let weight_bottom = _mm512_set1_epi32(my);
    let round = _mm512_set1_epi32(8);
    let shift = _mm_cvtsi32_si128(4);
    let bias_v = _mm512_set1_epi32(prep_bias);

    let mut col = 0usize;
    while col + 16 <= w {
        let top = _mm512_cvtepu16_epi32(loadu_256!(
            <&[u16; 16]>::try_from(&src[col..col + 16]).unwrap()
        ));
        let bottom = _mm512_cvtepu16_epi32(loadu_256!(
            <&[u16; 16]>::try_from(&src[below + col..below + col + 16]).unwrap()
        ));
        let blended = _mm512_add_epi32(
            _mm512_mullo_epi32(top, weight_top),
            _mm512_mullo_epi32(bottom, weight_bottom),
        );
        let rounded = _mm512_sra_epi32(_mm512_add_epi32(blended, round), shift);
        let unbiased = _mm512_sub_epi32(rounded, bias_v);
        let narrowed = _mm512_cvtsepi32_epi16(unbiased);
        storeu_256!(
            <&mut [i16; 16]>::try_from(&mut dst[col..col + 16]).unwrap(),
            narrowed
        );
        col += 16;
    }

    // Scalar tail for the columns that do not fill a whole vector.
    for col in col..w {
        let top = src[col] as i32;
        let bottom = src[below + col] as i32;
        let blended = (16 - my) * top + my * bottom;
        dst[col] = (((blended + 8) >> 4) - prep_bias) as i16;
    }
}
/// 16bpc bilinear `put` over slices (AVX-512).
///
/// Produces `h` rows of `w` pixels in `dst` from `src`; row `y` lives at
/// element offset `y * stride` of the respective slice. The kernel is chosen
/// from which of `mx` / `my` are non-zero:
///
/// * both: horizontal pass into a scratch `mid` buffer (`h + 1` rows), then a
///   vertical pass over `mid` with a shift of 8;
/// * only `mx`: horizontal put kernel per row;
/// * only `my`: direct vertical kernel per row;
/// * neither: each row is copied unchanged.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn put_bilin_16bpc_avx512_impl_inner(
    _token: Server64,
    dst: &mut [u16],
    dst_stride: isize,
    src: &[u16],
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
) {
    // Shift applied by the vertical pass of the two-pass path.
    const V_PASS_SHIFT: i32 = 8;

    let mut dst = dst.flex_mut();
    let src = src.flex();
    let cols = w as usize;
    let rows = h as usize;

    if mx != 0 && my != 0 {
        // One extra row is filtered horizontally so the vertical pass can
        // read row `y + 1` for the last output row.
        let mut mid = take_mid_i32_130();
        for y in 0..rows + 1 {
            let src_at = (y as isize * src_stride) as usize;
            h_bilin_16bpc_avx512_inner(_token, &mut mid[y], &src[src_at..], cols, mx);
        }
        for y in 0..rows {
            let dst_at = (y as isize * dst_stride) as usize;
            v_bilin_16bpc_avx512_inner(
                _token,
                &mut dst[dst_at..],
                &*mid,
                cols,
                y,
                my,
                V_PASS_SHIFT,
                bitdepth_max,
            );
        }
        put_mid_i32_130(mid);
    } else if mx != 0 {
        for y in 0..rows {
            let src_at = (y as isize * src_stride) as usize;
            let dst_at = (y as isize * dst_stride) as usize;
            h_bilin_16bpc_put_avx512_inner(
                _token,
                &mut dst[dst_at..],
                &src[src_at..],
                cols,
                mx,
                bitdepth_max,
            );
        }
    } else if my != 0 {
        for y in 0..rows {
            let src_at = (y as isize * src_stride) as usize;
            let dst_at = (y as isize * dst_stride) as usize;
            v_bilin_16bpc_direct_avx512_inner(
                _token,
                &mut dst[dst_at..],
                &src[src_at..],
                src_stride,
                cols,
                my,
                bitdepth_max,
            );
        }
    } else {
        for y in 0..rows {
            let src_at = (y as isize * src_stride) as usize;
            let dst_at = (y as isize * dst_stride) as usize;
            dst[dst_at..dst_at + cols].copy_from_slice(&src[src_at..src_at + cols]);
        }
    }
}
/// 16bpc bilinear `prep` over slices (AVX-512).
///
/// Fills `tmp` with `h` densely packed rows of `w` values (row `y` starts at
/// `y * w`); source row `y` lives at element offset `y * src_stride`. The
/// kernel is chosen from which of `mx` / `my` are non-zero:
///
/// * both: horizontal pass into a scratch `mid` buffer (`h + 1` rows), then a
///   vertical pass over `mid` with a shift of 8;
/// * only `mx`: direct horizontal kernel per row;
/// * only `my`: direct vertical kernel per row;
/// * neither: every source pixel is stored as `pixel - 8192`.
///
/// `bitdepth_max` is accepted but not used.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn prep_bilin_16bpc_avx512_impl_inner(
    _token: Server64,
    tmp: &mut [i16],
    src: &[u16],
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
) {
    const PREP_BIAS: i32 = 8192;
    // Shift applied by the vertical pass of the two-pass path.
    const V_PASS_SHIFT: i32 = 8;

    let _ = bitdepth_max;
    let mut tmp = tmp.flex_mut();
    let src = src.flex();
    let cols = w as usize;
    let rows = h as usize;

    if mx != 0 && my != 0 {
        // One extra row is filtered horizontally so the vertical pass can
        // read row `y + 1` for the last output row.
        let mut mid = take_mid_i32_130();
        for y in 0..rows + 1 {
            let src_at = (y as isize * src_stride) as usize;
            h_bilin_16bpc_avx512_inner(_token, &mut mid[y], &src[src_at..], cols, mx);
        }
        for y in 0..rows {
            let out_at = y * cols;
            v_bilin_16bpc_prep_avx512_inner(
                _token,
                &mut tmp[out_at..],
                &*mid,
                cols,
                y,
                my,
                V_PASS_SHIFT,
                PREP_BIAS,
            );
        }
        put_mid_i32_130(mid);
    } else if mx != 0 {
        for y in 0..rows {
            let src_at = (y as isize * src_stride) as usize;
            let out_at = y * cols;
            h_bilin_16bpc_prep_direct_avx512_inner(
                _token,
                &mut tmp[out_at..],
                &src[src_at..],
                cols,
                mx,
                PREP_BIAS,
            );
        }
    } else if my != 0 {
        for y in 0..rows {
            let src_at = (y as isize * src_stride) as usize;
            let out_at = y * cols;
            v_bilin_16bpc_prep_direct_avx512_inner(
                _token,
                &mut tmp[out_at..],
                &src[src_at..],
                src_stride,
                cols,
                my,
                PREP_BIAS,
            );
        }
    } else {
        for y in 0..rows {
            let src_at = (y as isize * src_stride) as usize;
            let out_at = y * cols;
            for x in 0..cols {
                let pixel = src[src_at + x] as i32;
                tmp[out_at + x] = (pixel - PREP_BIAS) as i16;
            }
        }
    }
}
/// 16bpc bilinear `put` over slices (AVX2).
///
/// Produces `h` rows of `w` pixels in `dst` from `src`; row `y` lives at
/// element offset `y * stride` of the respective slice. The kernel is chosen
/// from which of `mx` / `my` are non-zero:
///
/// * both: horizontal pass into a scratch `mid` buffer (`h + 1` rows), then a
///   vertical pass over `mid` with a shift of 8;
/// * only `mx`: horizontal put kernel per row;
/// * only `my`: direct vertical kernel per row;
/// * neither: each row is copied unchanged.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn put_bilin_16bpc_avx2_impl_inner_safe(
    _token: Desktop64,
    dst: &mut [u16],
    dst_stride: isize,
    src: &[u16],
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
) {
    // Shift applied by the vertical pass of the two-pass path.
    const V_PASS_SHIFT: i32 = 8;

    let mut dst = dst.flex_mut();
    let src = src.flex();
    let cols = w as usize;
    let rows = h as usize;

    if mx != 0 && my != 0 {
        // One extra row is filtered horizontally so the vertical pass can
        // read row `y + 1` for the last output row.
        let mut mid = take_mid_i32_130();
        for y in 0..rows + 1 {
            let src_at = (y as isize * src_stride) as usize;
            h_bilin_16bpc_avx2_inner(_token, &mut mid[y], &src[src_at..], cols, mx);
        }
        for y in 0..rows {
            let dst_at = (y as isize * dst_stride) as usize;
            v_bilin_16bpc_avx2_inner(
                _token,
                &mut dst[dst_at..],
                &*mid,
                cols,
                y,
                my,
                V_PASS_SHIFT,
                bitdepth_max,
            );
        }
        put_mid_i32_130(mid);
    } else if mx != 0 {
        for y in 0..rows {
            let src_at = (y as isize * src_stride) as usize;
            let dst_at = (y as isize * dst_stride) as usize;
            h_bilin_16bpc_put_avx2_inner(
                _token,
                &mut dst[dst_at..],
                &src[src_at..],
                cols,
                mx,
                bitdepth_max,
            );
        }
    } else if my != 0 {
        for y in 0..rows {
            let src_at = (y as isize * src_stride) as usize;
            let dst_at = (y as isize * dst_stride) as usize;
            v_bilin_16bpc_direct_avx2_inner(
                _token,
                &mut dst[dst_at..],
                &src[src_at..],
                src_stride,
                cols,
                my,
                bitdepth_max,
            );
        }
    } else {
        for y in 0..rows {
            let src_at = (y as isize * src_stride) as usize;
            let dst_at = (y as isize * dst_stride) as usize;
            dst[dst_at..dst_at + cols].copy_from_slice(&src[src_at..src_at + cols]);
        }
    }
}
/// 16bpc bilinear `prep` over slices (AVX2).
///
/// Fills `tmp` with `h` densely packed rows of `w` values (row `y` starts at
/// `y * w`); source row `y` lives at element offset `y * src_stride`. The
/// kernel is chosen from which of `mx` / `my` are non-zero:
///
/// * both: horizontal pass into a scratch `mid` buffer (`h + 1` rows), then a
///   vertical pass over `mid` with a shift of 8;
/// * only `mx`: direct horizontal kernel per row;
/// * only `my`: direct vertical kernel per row;
/// * neither: every source pixel is stored as `pixel - 8192`.
///
/// `bitdepth_max` is accepted but not used.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn prep_bilin_16bpc_avx2_impl_inner_safe(
    _token: Desktop64,
    tmp: &mut [i16],
    src: &[u16],
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
) {
    const PREP_BIAS: i32 = 8192;
    // Shift applied by the vertical pass of the two-pass path.
    const V_PASS_SHIFT: i32 = 8;

    let _ = bitdepth_max;
    let mut tmp = tmp.flex_mut();
    let src = src.flex();
    let cols = w as usize;
    let rows = h as usize;

    if mx != 0 && my != 0 {
        // One extra row is filtered horizontally so the vertical pass can
        // read row `y + 1` for the last output row.
        let mut mid = take_mid_i32_130();
        for y in 0..rows + 1 {
            let src_at = (y as isize * src_stride) as usize;
            h_bilin_16bpc_avx2_inner(_token, &mut mid[y], &src[src_at..], cols, mx);
        }
        for y in 0..rows {
            let out_at = y * cols;
            v_bilin_16bpc_prep_avx2_inner(
                _token,
                &mut tmp[out_at..],
                &*mid,
                cols,
                y,
                my,
                V_PASS_SHIFT,
                PREP_BIAS,
            );
        }
        put_mid_i32_130(mid);
    } else if mx != 0 {
        for y in 0..rows {
            let src_at = (y as isize * src_stride) as usize;
            let out_at = y * cols;
            h_bilin_16bpc_prep_direct_avx2_inner(
                _token,
                &mut tmp[out_at..],
                &src[src_at..],
                cols,
                mx,
                PREP_BIAS,
            );
        }
    } else if my != 0 {
        for y in 0..rows {
            let src_at = (y as isize * src_stride) as usize;
            let out_at = y * cols;
            v_bilin_16bpc_prep_direct_avx2_inner(
                _token,
                &mut tmp[out_at..],
                &src[src_at..],
                src_stride,
                cols,
                my,
                PREP_BIAS,
            );
        }
    } else {
        for y in 0..rows {
            let src_at = (y as isize * src_stride) as usize;
            let out_at = y * cols;
            for x in 0..cols {
                let pixel = src[src_at + x] as i32;
                tmp[out_at + x] = (pixel - PREP_BIAS) as i16;
            }
        }
    }
}
/// Runs the SIMD `avg` kernel on `dst` if AVX2 can be summoned.
///
/// Returns `false` without touching `dst` when no AVX2 token is available.
/// Otherwise the destination pixels are borrowed through
/// `with_pixel_guard_mut` for a `w` x `h` region, `avg_dispatch_inner` is run
/// on them, and `true` is returned (the inner function's own return value is
/// discarded).
#[cfg(target_arch = "x86_64")]
pub fn avg_dispatch<BD: BitDepth>(
    dst: PicOffset,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    bd: BD,
) -> bool {
    if crate::src::cpu::summon_avx2().is_none() {
        return false;
    }

    let (width, height) = (w as usize, h as usize);
    crate::include::dav1d::picture::with_pixel_guard_mut::<BD, _>(
        &dst,
        width,
        height,
        |bytes, offset, stride| {
            avg_dispatch_inner::<BD>(bytes, offset, stride, tmp1, tmp2, w, h, bd);
        },
    );
    true
}
/// Selects and runs the `avg` kernel for the bit depth `BD` on raw
/// destination bytes.
///
/// The AVX-512 kernel is used when an AVX-512 token can be summoned, the AVX2
/// kernel otherwise. The kernels receive `dst_bytes[dst_offset..]` and
/// `dst_stride` cast to `usize`; the 16bpc kernels additionally receive
/// `bd.into_c()`.
///
/// Returns `false` (doing nothing) when no AVX2 token is available, `true`
/// after a kernel has run.
#[cfg(target_arch = "x86_64")]
pub(crate) fn avg_dispatch_inner<BD: BitDepth>(
    dst_bytes: &mut [u8],
    dst_offset: usize,
    dst_stride: isize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    bd: BD,
) -> bool {
    use crate::include::common::bitdepth::BPC;

    let wide_token = crate::src::cpu::summon_avx512();
    let token = match crate::src::cpu::summon_avx2() {
        Some(token) => token,
        None => return false,
    };
    let bd_c = bd.into_c();
    let out = &mut dst_bytes[dst_offset..];
    let stride = dst_stride as usize;

    match (BD::BPC, wide_token) {
        (BPC::BPC8, Some(t512)) => {
            avg_8bpc_avx512_safe(t512, out, stride, tmp1, tmp2, w, h);
        }
        (BPC::BPC8, None) => {
            avg_8bpc_avx2_safe(token, out, stride, tmp1, tmp2, w, h);
        }
        (BPC::BPC16, Some(t512)) => {
            avg_16bpc_avx512_safe(t512, out, stride, tmp1, tmp2, w, h, bd_c);
        }
        (BPC::BPC16, None) => {
            avg_16bpc_avx2_safe(token, out, stride, tmp1, tmp2, w, h, bd_c);
        }
    }
    true
}
/// Runs the SIMD weighted-average (`w_avg`) kernel on `dst` if AVX2 can be
/// summoned.
///
/// Returns `false` without touching `dst` when no AVX2 token is available.
/// Otherwise the destination pixels are borrowed through
/// `with_pixel_guard_mut` for a `w` x `h` region, `w_avg_dispatch_inner` is
/// run on them, and `true` is returned (the inner function's own return value
/// is discarded).
#[cfg(target_arch = "x86_64")]
pub fn w_avg_dispatch<BD: BitDepth>(
    dst: PicOffset,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    weight: i32,
    bd: BD,
) -> bool {
    if crate::src::cpu::summon_avx2().is_none() {
        return false;
    }

    let (width, height) = (w as usize, h as usize);
    crate::include::dav1d::picture::with_pixel_guard_mut::<BD, _>(
        &dst,
        width,
        height,
        |bytes, offset, stride| {
            w_avg_dispatch_inner::<BD>(bytes, offset, stride, tmp1, tmp2, w, h, weight, bd);
        },
    );
    true
}
/// Selects and runs the weighted-average kernel for the bit depth `BD` on raw
/// destination bytes.
///
/// The AVX-512 kernel is used when an AVX-512 token can be summoned, the AVX2
/// kernel otherwise. The kernels receive `dst_bytes[dst_offset..]`,
/// `dst_stride` cast to `usize` and `weight`; the 16bpc kernels additionally
/// receive `bd.into_c()`.
///
/// Returns `false` (doing nothing) when no AVX2 token is available, `true`
/// after a kernel has run.
#[cfg(target_arch = "x86_64")]
pub(crate) fn w_avg_dispatch_inner<BD: BitDepth>(
    dst_bytes: &mut [u8],
    dst_offset: usize,
    dst_stride: isize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    weight: i32,
    bd: BD,
) -> bool {
    use crate::include::common::bitdepth::BPC;

    let wide_token = crate::src::cpu::summon_avx512();
    let token = match crate::src::cpu::summon_avx2() {
        Some(token) => token,
        None => return false,
    };
    let bd_c = bd.into_c();
    let out = &mut dst_bytes[dst_offset..];
    let stride = dst_stride as usize;

    match (BD::BPC, wide_token) {
        (BPC::BPC8, Some(t512)) => {
            w_avg_8bpc_avx512_safe(t512, out, stride, tmp1, tmp2, w, h, weight);
        }
        (BPC::BPC8, None) => {
            w_avg_8bpc_avx2_safe(token, out, stride, tmp1, tmp2, w, h, weight);
        }
        (BPC::BPC16, Some(t512)) => {
            w_avg_16bpc_avx512_safe(t512, out, stride, tmp1, tmp2, w, h, weight, bd_c);
        }
        (BPC::BPC16, None) => {
            w_avg_16bpc_avx2_safe(token, out, stride, tmp1, tmp2, w, h, weight, bd_c);
        }
    }
    true
}
/// Runs the SIMD `mask` blend kernel on `dst` if AVX2 can be summoned.
///
/// Returns `false` without touching `dst` when no AVX2 token is available.
/// Otherwise the destination pixels are borrowed through
/// `with_pixel_guard_mut` for a `w` x `h` region, `mask_dispatch_inner` is run
/// on them, and `true` is returned (the inner function's own return value is
/// discarded).
#[cfg(target_arch = "x86_64")]
pub fn mask_dispatch<BD: BitDepth>(
    dst: PicOffset,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    mask: &[u8],
    bd: BD,
) -> bool {
    if crate::src::cpu::summon_avx2().is_none() {
        return false;
    }

    let (width, height) = (w as usize, h as usize);
    crate::include::dav1d::picture::with_pixel_guard_mut::<BD, _>(
        &dst,
        width,
        height,
        |bytes, offset, stride| {
            mask_dispatch_inner::<BD>(bytes, offset, stride, tmp1, tmp2, w, h, mask, bd);
        },
    );
    true
}
/// Selects and runs the `mask` blend kernel for the bit depth `BD` on raw
/// destination bytes.
///
/// The AVX-512 kernel is used when an AVX-512 token can be summoned, the AVX2
/// kernel otherwise. The kernels receive `dst_bytes[dst_offset..]`,
/// `dst_stride` cast to `usize` and `mask`; the 16bpc kernels additionally
/// receive `bd.into_c()`.
///
/// Returns `false` (doing nothing) when no AVX2 token is available, `true`
/// after a kernel has run.
#[cfg(target_arch = "x86_64")]
pub(crate) fn mask_dispatch_inner<BD: BitDepth>(
    dst_bytes: &mut [u8],
    dst_offset: usize,
    dst_stride: isize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    mask: &[u8],
    bd: BD,
) -> bool {
    use crate::include::common::bitdepth::BPC;

    let wide_token = crate::src::cpu::summon_avx512();
    let token = match crate::src::cpu::summon_avx2() {
        Some(token) => token,
        None => return false,
    };
    let bd_c = bd.into_c();
    let out = &mut dst_bytes[dst_offset..];
    let stride = dst_stride as usize;

    match (BD::BPC, wide_token) {
        (BPC::BPC8, Some(t512)) => {
            mask_8bpc_avx512_safe(t512, out, stride, tmp1, tmp2, w, h, mask);
        }
        (BPC::BPC8, None) => {
            mask_8bpc_avx2_safe(token, out, stride, tmp1, tmp2, w, h, mask);
        }
        (BPC::BPC16, Some(t512)) => {
            mask_16bpc_avx512_safe(t512, out, stride, tmp1, tmp2, w, h, mask, bd_c);
        }
        (BPC::BPC16, None) => {
            mask_16bpc_avx2_safe(token, out, stride, tmp1, tmp2, w, h, mask, bd_c);
        }
    }
    true
}
/// Runs the SIMD `blend` kernel on `dst` if AVX2 can be summoned.
///
/// Returns `false` without touching `dst` when no AVX2 token is available.
/// Otherwise `tmp` is viewed as bytes, the destination pixels are borrowed
/// through `with_pixel_guard_mut` for a `w` x `h` region,
/// `blend_dispatch_inner` is run on them, and `true` is returned (the inner
/// function's own return value is discarded).
#[cfg(target_arch = "x86_64")]
pub fn blend_dispatch<BD: BitDepth>(
    dst: PicOffset,
    tmp: &[BD::Pixel; SCRATCH_INTER_INTRA_BUF_LEN],
    w: i32,
    h: i32,
    mask: &[u8],
) -> bool {
    use zerocopy::IntoBytes;

    if crate::src::cpu::summon_avx2().is_none() {
        return false;
    }

    let tmp_bytes = tmp.as_bytes();
    let (width, height) = (w as usize, h as usize);
    crate::include::dav1d::picture::with_pixel_guard_mut::<BD, _>(
        &dst,
        width,
        height,
        |bytes, offset, stride| {
            blend_dispatch_inner::<BD>(bytes, offset, stride, tmp_bytes, w, h, mask);
        },
    );
    true
}
/// Selects and runs the AVX2 `blend` kernel for the bit depth `BD` on raw
/// destination bytes.
///
/// The kernel receives `dst_bytes[dst_offset..]`, `dst_stride` cast to
/// `usize`, the temporary pixels as bytes and `mask`.
///
/// Returns `false` (doing nothing) when no AVX2 token is available, `true`
/// after a kernel has run.
#[cfg(target_arch = "x86_64")]
pub(crate) fn blend_dispatch_inner<BD: BitDepth>(
    dst_bytes: &mut [u8],
    dst_offset: usize,
    dst_stride: isize,
    tmp_bytes: &[u8],
    w: i32,
    h: i32,
    mask: &[u8],
) -> bool {
    use crate::include::common::bitdepth::BPC;

    let token = match crate::src::cpu::summon_avx2() {
        Some(token) => token,
        None => return false,
    };
    let out = &mut dst_bytes[dst_offset..];
    let stride = dst_stride as usize;

    match BD::BPC {
        BPC::BPC8 => blend_8bpc_avx2_safe(token, out, stride, tmp_bytes, w, h, mask),
        BPC::BPC16 => blend_16bpc_avx2_safe(token, out, stride, tmp_bytes, w, h, mask),
    }
    true
}
/// Runs the SIMD directional blend (`blend_h` when `is_h`, `blend_v`
/// otherwise) on `dst` if AVX2 can be summoned.
///
/// Returns `false` without touching `dst` when no AVX2 token is available.
/// Otherwise `tmp` is viewed as bytes, the destination pixels are borrowed
/// through `with_pixel_guard_mut` for a `w` x `h` region,
/// `blend_dir_dispatch_inner` is run on them, and `true` is returned (the
/// inner function's own return value is discarded).
#[cfg(target_arch = "x86_64")]
pub fn blend_dir_dispatch<BD: BitDepth>(
    is_h: bool,
    dst: PicOffset,
    tmp: &[BD::Pixel; SCRATCH_LAP_LEN],
    w: i32,
    h: i32,
) -> bool {
    use zerocopy::IntoBytes;

    if crate::src::cpu::summon_avx2().is_none() {
        return false;
    }

    let tmp_bytes = tmp.as_bytes();
    let (width, height) = (w as usize, h as usize);
    crate::include::dav1d::picture::with_pixel_guard_mut::<BD, _>(
        &dst,
        width,
        height,
        |bytes, offset, stride| {
            blend_dir_dispatch_inner::<BD>(is_h, bytes, offset, stride, tmp_bytes, w, h);
        },
    );
    true
}
/// Selects and runs the AVX2 directional blend kernel on raw destination
/// bytes.
///
/// The kernel is picked from the bit depth `BD` and from `is_h` (`blend_h`
/// when true, `blend_v` when false). It receives `dst_bytes[dst_offset..]`,
/// `dst_stride` cast to `usize` and the temporary pixels as bytes.
///
/// Returns `false` (doing nothing) when no AVX2 token is available, `true`
/// after a kernel has run.
#[cfg(target_arch = "x86_64")]
pub(crate) fn blend_dir_dispatch_inner<BD: BitDepth>(
    is_h: bool,
    dst_bytes: &mut [u8],
    dst_offset: usize,
    dst_stride: isize,
    tmp_bytes: &[u8],
    w: i32,
    h: i32,
) -> bool {
    use crate::include::common::bitdepth::BPC;

    let token = match crate::src::cpu::summon_avx2() {
        Some(token) => token,
        None => return false,
    };
    let out = &mut dst_bytes[dst_offset..];
    let stride = dst_stride as usize;

    match BD::BPC {
        BPC::BPC8 => {
            if is_h {
                blend_h_8bpc_avx2_safe(token, out, stride, tmp_bytes, w, h)
            } else {
                blend_v_8bpc_avx2_safe(token, out, stride, tmp_bytes, w, h)
            }
        }
        BPC::BPC16 => {
            if is_h {
                blend_h_16bpc_avx2_safe(token, out, stride, tmp_bytes, w, h)
            } else {
                blend_v_16bpc_avx2_safe(token, out, stride, tmp_bytes, w, h)
            }
        }
    }
    true
}
/// Runs the SIMD `w_mask` kernel for `layout` on `dst` if AVX2 can be
/// summoned.
///
/// Returns `false` without touching `dst` or `mask` when no AVX2 token is
/// available. Otherwise the destination pixels are borrowed through
/// `with_pixel_guard_mut` for a `w` x `h` region, `w_mask_dispatch_inner` is
/// run on them, and `true` is returned (the inner function's own return value
/// is discarded).
#[cfg(target_arch = "x86_64")]
pub(crate) fn w_mask_dispatch<BD: BitDepth>(
    layout: Rav1dPixelLayoutSubSampled,
    dst: PicOffset,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    mask: &mut [u8; SEG_MASK_LEN],
    sign: i32,
    bd: BD,
) -> bool {
    if crate::src::cpu::summon_avx2().is_none() {
        return false;
    }

    let (width, height) = (w as usize, h as usize);
    crate::include::dav1d::picture::with_pixel_guard_mut::<BD, _>(
        &dst,
        width,
        height,
        |bytes, offset, stride| {
            w_mask_dispatch_inner::<BD>(
                layout, bytes, offset, stride, tmp1, tmp2, w, h, mask, sign, bd,
            );
        },
    );
    true
}
/// Selects and runs the AVX2 `w_mask` kernel on raw destination bytes.
///
/// The kernel is picked from the bit depth `BD` and the chroma `layout`
/// (`I420`, `I422` or `I444`). It receives `dst_bytes[dst_offset..]`,
/// `dst_stride` cast to `usize`, `mask` and `sign`; the 16bpc kernels
/// additionally receive `bd.into_c()`.
///
/// Returns `false` (doing nothing) when no AVX2 token is available, `true`
/// after a kernel has run.
#[cfg(target_arch = "x86_64")]
pub(crate) fn w_mask_dispatch_inner<BD: BitDepth>(
    layout: Rav1dPixelLayoutSubSampled,
    dst_bytes: &mut [u8],
    dst_offset: usize,
    dst_stride: isize,
    tmp1: &[i16; COMPINTER_LEN],
    tmp2: &[i16; COMPINTER_LEN],
    w: i32,
    h: i32,
    mask: &mut [u8; SEG_MASK_LEN],
    sign: i32,
    bd: BD,
) -> bool {
    use crate::include::common::bitdepth::BPC;

    let token = match crate::src::cpu::summon_avx2() {
        Some(token) => token,
        None => return false,
    };
    let bd_c = bd.into_c();
    let out = &mut dst_bytes[dst_offset..];
    let stride = dst_stride as usize;

    match BD::BPC {
        BPC::BPC8 => match layout {
            Rav1dPixelLayoutSubSampled::I420 => {
                w_mask_420_8bpc_avx2_safe(token, out, stride, tmp1, tmp2, w, h, mask, sign)
            }
            Rav1dPixelLayoutSubSampled::I422 => {
                w_mask_422_8bpc_avx2_safe(token, out, stride, tmp1, tmp2, w, h, mask, sign)
            }
            Rav1dPixelLayoutSubSampled::I444 => {
                w_mask_444_8bpc_avx2_safe(token, out, stride, tmp1, tmp2, w, h, mask, sign)
            }
        },
        BPC::BPC16 => match layout {
            Rav1dPixelLayoutSubSampled::I420 => {
                w_mask_420_16bpc_avx2_safe(token, out, stride, tmp1, tmp2, w, h, mask, sign, bd_c)
            }
            Rav1dPixelLayoutSubSampled::I422 => {
                w_mask_422_16bpc_avx2_safe(token, out, stride, tmp1, tmp2, w, h, mask, sign, bd_c)
            }
            Rav1dPixelLayoutSubSampled::I444 => {
                w_mask_444_16bpc_avx2_safe(token, out, stride, tmp1, tmp2, w, h, mask, sign, bd_c)
            }
        },
    }
    true
}
/// Routes an 8bpc 8-tap `put` to the AVX-512 implementation when an AVX-512
/// token can be summoned, and to the AVX2 implementation (using `token`)
/// otherwise. All other arguments are forwarded unchanged.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn put_8tap_8bpc_dispatch_inner(
    token: Desktop64,
    dst: &mut [u8],
    dst_stride: isize,
    src: &[u8],
    src_base: usize,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    h_filter: Rav1dFilterMode,
    v_filter: Rav1dFilterMode,
) {
    match crate::src::cpu::summon_avx512() {
        Some(t512) => {
            put_8tap_8bpc_avx512_impl_inner(
                t512, dst, dst_stride, src, src_base, src_stride, w, h, mx, my, h_filter,
                v_filter,
            );
        }
        None => {
            put_8tap_8bpc_avx2_impl_inner(
                token, dst, dst_stride, src, src_base, src_stride, w, h, mx, my, h_filter,
                v_filter,
            );
        }
    }
}
/// Routes an 8bpc bilinear `put` to the AVX-512 implementation when `w >= 64`
/// and an AVX-512 token can be summoned, and to the AVX2 implementation
/// (using `token`) otherwise. AVX-512 is not probed for narrower blocks.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn put_bilin_8bpc_dispatch_inner(
    token: Desktop64,
    dst: &mut [u8],
    dst_stride: isize,
    src: &[u8],
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
) {
    let wide_token = if w >= 64 {
        crate::src::cpu::summon_avx512()
    } else {
        None
    };
    match wide_token {
        Some(t512) => {
            put_bilin_8bpc_avx512_impl_inner(t512, dst, dst_stride, src, src_stride, w, h, mx, my);
        }
        None => {
            put_bilin_8bpc_avx2_impl_inner(token, dst, dst_stride, src, src_stride, w, h, mx, my);
        }
    }
}
/// Routes a 16bpc 8-tap `put` to the AVX-512 implementation when an AVX-512
/// token can be summoned, and to the AVX2 implementation (using `token`)
/// otherwise. All other arguments are forwarded unchanged.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn put_8tap_16bpc_dispatch_inner(
    token: Desktop64,
    dst: &mut [u16],
    dst_stride: isize,
    src: &[u16],
    src_base: usize,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bd_c: i32,
    h_filter: Rav1dFilterMode,
    v_filter: Rav1dFilterMode,
) {
    match crate::src::cpu::summon_avx512() {
        Some(t512) => {
            put_8tap_16bpc_avx512_impl_inner(
                t512, dst, dst_stride, src, src_base, src_stride, w, h, mx, my, bd_c, h_filter,
                v_filter,
            );
        }
        None => {
            put_8tap_16bpc_avx2_impl_inner(
                token, dst, dst_stride, src, src_base, src_stride, w, h, mx, my, bd_c, h_filter,
                v_filter,
            );
        }
    }
}
/// Routes an 8bpc 8-tap `prep` to the AVX-512 implementation when an AVX-512
/// token can be summoned, and to the AVX2 implementation (using `token`)
/// otherwise. All other arguments are forwarded unchanged.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn prep_8tap_8bpc_dispatch_inner(
    token: Desktop64,
    tmp: &mut [i16],
    src: &[u8],
    src_base: usize,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    h_filter: Rav1dFilterMode,
    v_filter: Rav1dFilterMode,
) {
    match crate::src::cpu::summon_avx512() {
        Some(t512) => {
            prep_8tap_8bpc_avx512_impl_inner(
                t512, tmp, src, src_base, src_stride, w, h, mx, my, h_filter, v_filter,
            );
        }
        None => {
            prep_8tap_8bpc_avx2_impl_inner(
                token, tmp, src, src_base, src_stride, w, h, mx, my, h_filter, v_filter,
            );
        }
    }
}
/// Routes an 8bpc bilinear `prep` to the AVX-512 implementation when
/// `w >= 64` and an AVX-512 token can be summoned, and to the AVX2
/// implementation (using `token`) otherwise. AVX-512 is not probed for
/// narrower blocks.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn prep_bilin_8bpc_dispatch_inner(
    token: Desktop64,
    tmp: &mut [i16],
    src: &[u8],
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
) {
    let wide_token = if w >= 64 {
        crate::src::cpu::summon_avx512()
    } else {
        None
    };
    match wide_token {
        Some(t512) => {
            prep_bilin_8bpc_avx512_impl_inner(t512, tmp, src, src_stride, w, h, mx, my);
        }
        None => {
            prep_bilin_8bpc_avx2_impl_inner(token, tmp, src, src_stride, w, h, mx, my);
        }
    }
}
/// Routes a 16bpc 8-tap `prep` to the AVX-512 implementation when an AVX-512
/// token can be summoned, and to the AVX2 implementation (using `token`)
/// otherwise. All other arguments are forwarded unchanged.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn prep_8tap_16bpc_dispatch_inner(
    token: Desktop64,
    tmp: &mut [i16],
    src: &[u16],
    src_base: usize,
    src_stride: isize,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bitdepth_max: i32,
    h_filter: Rav1dFilterMode,
    v_filter: Rav1dFilterMode,
) {
    match crate::src::cpu::summon_avx512() {
        Some(t512) => {
            prep_8tap_16bpc_avx512_impl_inner(
                t512,
                tmp,
                src,
                src_base,
                src_stride,
                w,
                h,
                mx,
                my,
                bitdepth_max,
                h_filter,
                v_filter,
            );
        }
        None => {
            prep_8tap_16bpc_avx2_impl_inner(
                token,
                tmp,
                src,
                src_base,
                src_stride,
                w,
                h,
                mx,
                my,
                bitdepth_max,
                h_filter,
                v_filter,
            );
        }
    }
}
/// Runs the SIMD motion-compensation `put` for `filter` if possible.
///
/// Returns `false` without touching `dst` when no AVX2 token can be summoned
/// or when `dst.data.ref_eq(src.data)` holds (presumably: both offsets refer
/// to the same underlying picture data — confirm against `ref_eq`).
/// Otherwise the destination pixels are borrowed through
/// `with_pixel_guard_mut` for a `w` x `h` region, `mc_put_dispatch_inner` is
/// run on them, and `true` is returned (the inner function's own return value
/// is discarded).
#[cfg(target_arch = "x86_64")]
pub fn mc_put_dispatch<BD: BitDepth>(
    filter: Filter2d,
    dst: PicOffset,
    src: PicOffset,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bd: BD,
) -> bool {
    // The CPU check comes first; `ref_eq` is only evaluated when AVX2 exists.
    let has_avx2 = crate::src::cpu::summon_avx2().is_some();
    if !has_avx2 || dst.data.ref_eq(src.data) {
        return false;
    }

    let (width, height) = (w as usize, h as usize);
    crate::include::dav1d::picture::with_pixel_guard_mut::<BD, _>(
        &dst,
        width,
        height,
        |bytes, offset, stride| {
            mc_put_dispatch_inner::<BD>(filter, bytes, offset, stride, src, w, h, mx, my, bd);
        },
    );
    true
}
/// Runs the `put` kernel matching `BD::BPC` and `filter` on an already-borrowed
/// destination byte buffer.
///
/// * `dst_bytes` / `dst_offset` — destination bytes; kernels receive the tail
///   starting at `dst_offset`.
/// * `dst_stride` — passed through as-is for 8bpc and 16bpc 8-tap, halved for
///   16bpc bilinear.
/// * `src` — source picture position; its full guard and base index are
///   fetched here.
///
/// Returns `false` if no AVX2 token is available, `true` after a kernel ran.
/// Panics with `"u16 alignment"` if a 16bpc byte view cannot be reinterpreted
/// as `[u16]`.
#[cfg(target_arch = "x86_64")]
pub(crate) fn mc_put_dispatch_inner<BD: BitDepth>(
    filter: Filter2d,
    dst_bytes: &mut [u8],
    dst_offset: usize,
    dst_stride: isize,
    src: PicOffset,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bd: BD,
) -> bool {
    use crate::include::common::bitdepth::BPC;
    use zerocopy::IntoBytes;

    let token = match crate::src::cpu::summon_avx2() {
        Some(t) => t,
        None => return false,
    };
    let src_stride = src.stride();
    let px_bytes = std::mem::size_of::<BD::Pixel>();
    let is_bilinear = matches!(filter, Filter2d::Bilinear);

    match BD::BPC {
        BPC::BPC8 => {
            let (src_guard, src_base) = src.full_guard::<BD>();
            let src_start = src_base * px_bytes;
            if is_bilinear {
                // Bilinear kernels get the source already advanced to the block origin.
                let src_tail = &src_guard.as_bytes()[src_start..];
                put_bilin_8bpc_dispatch_inner(
                    token,
                    &mut dst_bytes[dst_offset..],
                    dst_stride,
                    src_tail,
                    src_stride,
                    w,
                    h,
                    mx,
                    my,
                );
            } else {
                // 8-tap kernels get the whole buffer plus the base offset.
                let src_all = src_guard.as_bytes();
                let (h_filter, v_filter) = filter.hv();
                put_8tap_8bpc_dispatch_inner(
                    token,
                    &mut dst_bytes[dst_offset..],
                    dst_stride,
                    src_all,
                    src_start,
                    src_stride,
                    w,
                    h,
                    mx,
                    my,
                    h_filter,
                    v_filter,
                );
            }
        }
        BPC::BPC16 => {
            let dst_u16: &mut [u16] =
                zerocopy::Ref::<_, [u16]>::new_slice(&mut dst_bytes[dst_offset..])
                    .expect("u16 alignment")
                    .into_mut_slice();
            let (src_guard, src_base) = src.full_guard::<BD>();
            let bd_c = bd.into_c();
            if is_bilinear {
                let src_tail = &src_guard.as_bytes()[src_base * px_bytes..];
                let src_px: &[u16] = zerocopy::Ref::<_, [u16]>::new_slice(src_tail)
                    .expect("u16 alignment")
                    .into_slice();
                // Bilinear 16bpc kernels are handed the strides divided by two.
                let dst_px_stride = dst_stride / 2;
                let src_px_stride = src_stride / 2;
                match crate::src::cpu::summon_avx512() {
                    Some(t512) => {
                        put_bilin_16bpc_avx512_impl_inner(
                            t512,
                            dst_u16,
                            dst_px_stride,
                            src_px,
                            src_px_stride,
                            w,
                            h,
                            mx,
                            my,
                            bd_c,
                        );
                    }
                    None => {
                        put_bilin_16bpc_avx2_impl_inner_safe(
                            token,
                            dst_u16,
                            dst_px_stride,
                            src_px,
                            src_px_stride,
                            w,
                            h,
                            mx,
                            my,
                            bd_c,
                        );
                    }
                }
            } else {
                let src_px: &[u16] = zerocopy::Ref::<_, [u16]>::new_slice(src_guard.as_bytes())
                    .expect("u16 alignment")
                    .into_slice();
                let (h_filter, v_filter) = filter.hv();
                // NOTE(review): unlike the bilinear branch, the strides go through
                // undivided and `src_base` is not scaled by the pixel size —
                // presumably the 8-tap kernels handle the units themselves; confirm.
                put_8tap_16bpc_dispatch_inner(
                    token, dst_u16, dst_stride, src_px, src_base, src_stride, w, h, mx, my, bd_c,
                    h_filter, v_filter,
                );
            }
        }
    }
    true
}
/// Public entry point for the SIMD `prep` motion-compensation path.
///
/// Picks the kernel from `BD::BPC` and `filter`, fetching the source picture's
/// full guard and base index along the way. Results are written to `tmp`.
///
/// Returns `false` if no AVX2 token is available, `true` after a kernel ran.
/// Panics with `"u16 alignment"` if the 16bpc source bytes cannot be
/// reinterpreted as `[u16]`.
#[cfg(target_arch = "x86_64")]
pub fn mct_prep_dispatch<BD: BitDepth>(
    filter: Filter2d,
    tmp: &mut [i16],
    src: PicOffset,
    w: i32,
    h: i32,
    mx: i32,
    my: i32,
    bd: BD,
) -> bool {
    use crate::include::common::bitdepth::BPC;
    use zerocopy::IntoBytes;

    let token = match crate::src::cpu::summon_avx2() {
        Some(t) => t,
        None => return false,
    };
    let src_stride = src.stride();
    let px_bytes = std::mem::size_of::<BD::Pixel>();
    let is_bilinear = matches!(filter, Filter2d::Bilinear);

    match BD::BPC {
        BPC::BPC8 => {
            let (src_guard, src_base) = src.full_guard::<BD>();
            let src_start = src_base * px_bytes;
            if is_bilinear {
                // Bilinear kernels get the source already advanced to the block origin.
                let src_tail = &src_guard.as_bytes()[src_start..];
                prep_bilin_8bpc_dispatch_inner(token, tmp, src_tail, src_stride, w, h, mx, my);
            } else {
                // 8-tap kernels get the whole buffer plus the base offset.
                let src_all = src_guard.as_bytes();
                let (h_filter, v_filter) = filter.hv();
                prep_8tap_8bpc_dispatch_inner(
                    token, tmp, src_all, src_start, src_stride, w, h, mx, my, h_filter, v_filter,
                );
            }
        }
        BPC::BPC16 => {
            let (src_guard, src_base) = src.full_guard::<BD>();
            let bd_c = bd.into_c();
            if is_bilinear {
                let src_tail = &src_guard.as_bytes()[src_base * px_bytes..];
                let src_px: &[u16] = zerocopy::Ref::<_, [u16]>::new_slice(src_tail)
                    .expect("u16 alignment")
                    .into_slice();
                // Bilinear 16bpc kernels are handed the stride divided by two.
                let src_px_stride = src_stride / 2;
                match crate::src::cpu::summon_avx512() {
                    Some(t512) => {
                        prep_bilin_16bpc_avx512_impl_inner(
                            t512,
                            tmp,
                            src_px,
                            src_px_stride,
                            w,
                            h,
                            mx,
                            my,
                            bd_c,
                        );
                    }
                    None => {
                        prep_bilin_16bpc_avx2_impl_inner_safe(
                            token,
                            tmp,
                            src_px,
                            src_px_stride,
                            w,
                            h,
                            mx,
                            my,
                            bd_c,
                        );
                    }
                }
            } else {
                let src_px: &[u16] = zerocopy::Ref::<_, [u16]>::new_slice(src_guard.as_bytes())
                    .expect("u16 alignment")
                    .into_slice();
                let (h_filter, v_filter) = filter.hv();
                // NOTE(review): unlike the bilinear branch, the stride goes through
                // undivided and `src_base` is not scaled by the pixel size —
                // presumably the 8-tap kernels handle the units themselves; confirm.
                prep_8tap_16bpc_dispatch_inner(
                    token, tmp, src_px, src_base, src_stride, w, h, mx, my, bd_c, h_filter,
                    v_filter,
                );
            }
        }
    }
    let _ = bd;
    true
}
/// Stub dispatcher for the scaled `put` operation.
///
/// Every argument is ignored and the function unconditionally returns `false`.
/// NOTE(review): the other dispatchers in this file return `false` when they
/// did not process the request, so this presumably tells the caller to use a
/// different implementation — confirm at the call site.
#[cfg(target_arch = "x86_64")]
pub fn mc_scaled_dispatch<BD: BitDepth>(
_filter: Filter2d,
_dst: PicOffset,
_src: PicOffset,
_w: i32,
_h: i32,
_mx: i32,
_my: i32,
_dx: i32,
_dy: i32,
_bd: BD,
) -> bool {
false
}
/// Stub dispatcher for the scaled `prep` operation.
///
/// Every argument is ignored and the function unconditionally returns `false`.
/// NOTE(review): the other dispatchers in this file return `false` when they
/// did not process the request, so this presumably tells the caller to use a
/// different implementation — confirm at the call site.
#[cfg(target_arch = "x86_64")]
pub fn mct_scaled_dispatch<BD: BitDepth>(
_filter: Filter2d,
_tmp: &mut [i16],
_src: PicOffset,
_w: i32,
_h: i32,
_mx: i32,
_my: i32,
_dx: i32,
_dy: i32,
_bd: BD,
) -> bool {
false
}
/// Dot product of eight consecutive source bytes with eight filter taps.
///
/// Reads `src[off..off + 8]` (panics if that range is out of bounds),
/// zero-extends the bytes to 16-bit lanes, sign-extends the taps to `i16`,
/// multiplies lane-wise and returns the sum of all eight products.
#[cfg(target_arch = "x86_64")]
#[rite]
fn warp_h_dot8(_t: Desktop64, src: &[u8], off: usize, filter: &[i8; 8]) -> i32 {
    let window = &src[off..off + 8];
    // Stage the eight bytes in the low half of a zeroed 16-byte buffer so a
    // full 128-bit load can be used.
    let mut padded = [0u8; 16];
    padded[..8].copy_from_slice(window);
    let pixels = _mm_unpacklo_epi8(loadu_128!(&padded, [u8; 16]), _mm_setzero_si128());

    let widened: [i16; 8] = filter.map(i16::from);
    let coeffs = loadu_128!(&widened, [i16; 8]);

    // madd yields four pair sums; two horizontal adds fold them into lane 0.
    let pairs = _mm_madd_epi16(pixels, coeffs);
    let folded = _mm_hadd_epi32(pairs, pairs);
    _mm_cvtsi128_si32(_mm_hadd_epi32(folded, folded))
}
/// Horizontal warp pass for 8bpc: fills all 15 x 8 entries of `mid`.
///
/// Row `y` of `mid` is computed from the source row `y - 3` relative to
/// `src_base`, starting three bytes to the left. The filter position starts at
/// `mx + y * beta` and advances by `alpha` per column; each dot product is
/// rounded and shifted right by 3 before being stored as `i16`.
#[cfg(target_arch = "x86_64")]
#[rite]
fn warp_h_pass_8bpc(
    _t: Desktop64,
    mid: &mut [[i16; 8]; 15],
    src: &[u8],
    src_base: usize,
    src_stride: isize,
    alpha: i32,
    beta: i32,
    mx: i32,
) {
    use crate::src::tables::dav1d_mc_warp_filter;

    let shift_h = 7 - 4;
    let round_h = (1i32 << shift_h) >> 1;

    for (y, mid_row) in mid.iter_mut().enumerate() {
        let row_base = (src_base as isize + (y as isize - 3) * src_stride - 3) as usize;
        let mut tmx = mx + (y as i32) * beta;
        for (x, cell) in mid_row.iter_mut().enumerate() {
            let fidx = (64 + ((tmx + 512) >> 10)) as usize;
            tmx += alpha;
            let taps = &dav1d_mc_warp_filter[fidx];
            let dot = warp_h_dot8(_t, src, row_base + x, taps);
            *cell = ((dot + round_h) >> shift_h) as i16;
        }
    }
}
/// Vertical warp pass for 8bpc `put`: turns `mid` into an 8 x 8 block of bytes.
///
/// Output pixel `(x, y)` is the 8-tap combination of `mid[y..y + 8][x]`, with
/// the filter position starting at `my + y * delta` and advancing by `gamma`
/// per column. The sum is rounded, shifted right by 11, clamped to `0..=255`
/// and written to `dst[y * dst_stride + x]`.
#[cfg(target_arch = "x86_64")]
#[rite]
fn warp_v_pass_8bpc_put(
    _t: Desktop64,
    dst: &mut [u8],
    dst_stride: isize,
    mid: &[[i16; 8]; 15],
    gamma: i32,
    delta: i32,
    my: i32,
) {
    use crate::src::tables::dav1d_mc_warp_filter;

    let shift_v = 7 + 4;
    let round_v = (1i32 << shift_v) >> 1;

    for y in 0..8usize {
        let row_start = (y as isize * dst_stride) as usize;
        let mut tmy = my + (y as i32) * delta;
        // The eight intermediate rows feeding output row `y`.
        let window = &mid[y..y + 8];
        for x in 0..8usize {
            let fidx = (64 + ((tmy + 512) >> 10)) as usize;
            tmy += gamma;
            let taps = &dav1d_mc_warp_filter[fidx];
            let acc: i32 = taps
                .iter()
                .zip(window)
                .map(|(&coef, row)| coef as i32 * row[x] as i32)
                .sum();
            dst[row_start + x] = ((acc + round_v) >> shift_v).clamp(0, 255) as u8;
        }
    }
}
/// Vertical warp pass for 8bpc `prep`: turns `mid` into an 8 x 8 block of `i16`.
///
/// Output `(x, y)` is the 8-tap combination of `mid[y..y + 8][x]`, with the
/// filter position starting at `my + y * delta` and advancing by `gamma` per
/// column. The sum is rounded, shifted right by 7 and stored at
/// `tmp[y * tmp_stride + x]` without clamping.
#[cfg(target_arch = "x86_64")]
#[rite]
fn warp_v_pass_8bpc_prep(
    _t: Desktop64,
    tmp: &mut [i16],
    tmp_stride: usize,
    mid: &[[i16; 8]; 15],
    gamma: i32,
    delta: i32,
    my: i32,
) {
    use crate::src::tables::dav1d_mc_warp_filter;

    let shift_v = 7;
    let round_v = (1i32 << shift_v) >> 1;

    for y in 0..8usize {
        let mut tmy = my + (y as i32) * delta;
        // The eight intermediate rows feeding output row `y`.
        let window = &mid[y..y + 8];
        for x in 0..8usize {
            let fidx = (64 + ((tmy + 512) >> 10)) as usize;
            tmy += gamma;
            let taps = &dav1d_mc_warp_filter[fidx];
            let acc: i32 = taps
                .iter()
                .zip(window)
                .map(|(&coef, row)| coef as i32 * row[x] as i32)
                .sum();
            tmp[y * tmp_stride + x] = ((acc + round_v) >> shift_v) as i16;
        }
    }
}
/// 8bpc affine warp of one 8 x 8 block, written as pixels into `dst`.
///
/// Runs the horizontal pass into a local 15 x 8 intermediate buffer, then the
/// vertical `put` pass. `abcd[0]`/`abcd[1]` drive the horizontal pass
/// (per-column / per-row step), `abcd[2]`/`abcd[3]` the vertical one.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn warp_affine_8x8_8bpc_avx2(
    _t: Desktop64,
    dst: &mut [u8],
    dst_stride: isize,
    src: &[u8],
    src_base: usize,
    src_stride: isize,
    abcd: &[i16; 4],
    mx: i32,
    my: i32,
) {
    let &[alpha, beta, gamma, delta] = abcd;
    let mut mid = [[0i16; 8]; 15];
    warp_h_pass_8bpc(
        _t,
        &mut mid,
        src,
        src_base,
        src_stride,
        i32::from(alpha),
        i32::from(beta),
        mx,
    );
    warp_v_pass_8bpc_put(
        _t,
        dst,
        dst_stride,
        &mid,
        i32::from(gamma),
        i32::from(delta),
        my,
    );
}
/// 8bpc affine warp of one 8 x 8 block, written as `i16` intermediates into `tmp`.
///
/// Runs the horizontal pass into a local 15 x 8 intermediate buffer, then the
/// vertical `prep` pass. `abcd[0]`/`abcd[1]` drive the horizontal pass
/// (per-column / per-row step), `abcd[2]`/`abcd[3]` the vertical one.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn warp_affine_8x8t_8bpc_avx2(
    _t: Desktop64,
    tmp: &mut [i16],
    tmp_stride: usize,
    src: &[u8],
    src_base: usize,
    src_stride: isize,
    abcd: &[i16; 4],
    mx: i32,
    my: i32,
) {
    let &[alpha, beta, gamma, delta] = abcd;
    let mut mid = [[0i16; 8]; 15];
    warp_h_pass_8bpc(
        _t,
        &mut mid,
        src,
        src_base,
        src_stride,
        i32::from(alpha),
        i32::from(beta),
        mx,
    );
    warp_v_pass_8bpc_prep(
        _t,
        tmp,
        tmp_stride,
        &mid,
        i32::from(gamma),
        i32::from(delta),
        my,
    );
}
/// Dot product of eight 16-bit source lanes with eight filter taps.
///
/// Loads the 16 bytes at `src[off..off + 16]` (panics if that range is out of
/// bounds), treats them as eight signed 16-bit lanes, multiplies them by the
/// taps sign-extended to `i16`, and returns the sum of the eight products.
#[cfg(target_arch = "x86_64")]
#[rite]
fn warp_h_dot16(_t: Desktop64, src: &[u8], off: usize, filter: &[i8; 8]) -> i32 {
    let pixels = loadu_128!(&src[off..off + 16], [u8; 16]);

    let widened: [i16; 8] = filter.map(i16::from);
    let coeffs = loadu_128!(&widened, [i16; 8]);

    // madd yields four pair sums; two horizontal adds fold them into lane 0.
    let pairs = _mm_madd_epi16(pixels, coeffs);
    let folded = _mm_hadd_epi32(pairs, pairs);
    _mm_cvtsi128_si32(_mm_hadd_epi32(folded, folded))
}
/// Horizontal warp pass for 16bpc: fills all 15 x 8 entries of `mid`.
///
/// `src` is addressed in bytes: row `y` starts at the source row `y - 3`
/// relative to `src_base`, six bytes to the left, and each column advances by
/// two bytes. The filter position starts at `mx + y * beta` and advances by
/// `alpha` per column; each dot product is rounded and shifted right by
/// `7 - intermediate_bits` before being stored as `i16`.
#[cfg(target_arch = "x86_64")]
#[rite]
fn warp_h_pass_16bpc(
    _t: Desktop64,
    mid: &mut [[i16; 8]; 15],
    src: &[u8],
    src_base: usize,
    src_stride: isize,
    alpha: i32,
    beta: i32,
    mx: i32,
    intermediate_bits: u8,
) {
    use crate::src::tables::dav1d_mc_warp_filter;

    let shift_h = 7 - intermediate_bits;
    let round_h = (1i32 << shift_h) >> 1;

    for (y, mid_row) in mid.iter_mut().enumerate() {
        let row_base = (src_base as isize + (y as isize - 3) * src_stride - 6) as usize;
        let mut tmx = mx + (y as i32) * beta;
        for (x, cell) in mid_row.iter_mut().enumerate() {
            let fidx = (64 + ((tmx + 512) >> 10)) as usize;
            tmx += alpha;
            let taps = &dav1d_mc_warp_filter[fidx];
            let dot = warp_h_dot16(_t, src, row_base + x * 2, taps);
            *cell = ((dot + round_h) >> shift_h) as i16;
        }
    }
}
/// Vertical warp pass for 16bpc `put`: turns `mid` into an 8 x 8 block of
/// little-endian 16-bit pixels stored in the byte buffer `dst`.
///
/// Output pixel `(x, y)` is the 8-tap combination of `mid[y..y + 8][x]`, with
/// the filter position starting at `my + y * delta` and advancing by `gamma`
/// per column. The sum is rounded, shifted right by `7 + intermediate_bits`,
/// clamped to `0..=bitdepth_max` and written as two bytes at
/// `dst[y * dst_stride + 2 * x]`.
#[cfg(target_arch = "x86_64")]
#[rite]
fn warp_v_pass_16bpc_put(
    _t: Desktop64,
    dst: &mut [u8],
    dst_stride: isize,
    mid: &[[i16; 8]; 15],
    gamma: i32,
    delta: i32,
    my: i32,
    intermediate_bits: u8,
    bitdepth_max: i32,
) {
    use crate::src::tables::dav1d_mc_warp_filter;

    let shift_v = 7 + intermediate_bits;
    let round_v = (1i32 << shift_v) >> 1;

    for y in 0..8usize {
        let row_start = (y as isize * dst_stride) as usize;
        let mut tmy = my + (y as i32) * delta;
        // The eight intermediate rows feeding output row `y`.
        let window = &mid[y..y + 8];
        for x in 0..8usize {
            let fidx = (64 + ((tmy + 512) >> 10)) as usize;
            tmy += gamma;
            let taps = &dav1d_mc_warp_filter[fidx];
            let acc: i32 = taps
                .iter()
                .zip(window)
                .map(|(&coef, row)| coef as i32 * row[x] as i32)
                .sum();
            let pixel = ((acc + round_v) >> shift_v).clamp(0, bitdepth_max) as u16;
            let [lo, hi] = pixel.to_le_bytes();
            let at = row_start + x * 2;
            dst[at] = lo;
            dst[at + 1] = hi;
        }
    }
}
/// Vertical warp pass for 16bpc `prep`: turns `mid` into an 8 x 8 block of `i16`.
///
/// Output `(x, y)` is the 8-tap combination of `mid[y..y + 8][x]`, with the
/// filter position starting at `my + y * delta` and advancing by `gamma` per
/// column. The sum is rounded, shifted right by 7, reduced by 8192 and stored
/// at `tmp[y * tmp_stride + x]`.
#[cfg(target_arch = "x86_64")]
#[rite]
fn warp_v_pass_16bpc_prep(
    _t: Desktop64,
    tmp: &mut [i16],
    tmp_stride: usize,
    mid: &[[i16; 8]; 15],
    gamma: i32,
    delta: i32,
    my: i32,
) {
    use crate::src::tables::dav1d_mc_warp_filter;

    let shift_v = 7;
    let round_v = (1i32 << shift_v) >> 1;

    for y in 0..8usize {
        let mut tmy = my + (y as i32) * delta;
        // The eight intermediate rows feeding output row `y`.
        let window = &mid[y..y + 8];
        for x in 0..8usize {
            let fidx = (64 + ((tmy + 512) >> 10)) as usize;
            tmy += gamma;
            let taps = &dav1d_mc_warp_filter[fidx];
            let acc: i32 = taps
                .iter()
                .zip(window)
                .map(|(&coef, row)| coef as i32 * row[x] as i32)
                .sum();
            tmp[y * tmp_stride + x] = (((acc + round_v) >> shift_v) - 8192) as i16;
        }
    }
}
/// 16bpc affine warp of one 8 x 8 block, written as pixels into the byte
/// buffer `dst`.
///
/// Runs the horizontal pass into a local 15 x 8 intermediate buffer, then the
/// vertical `put` pass. `abcd[0]`/`abcd[1]` drive the horizontal pass
/// (per-column / per-row step), `abcd[2]`/`abcd[3]` the vertical one;
/// `intermediate_bits` is used by both passes and `bitdepth_max` by the final
/// clamp.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn warp_affine_8x8_16bpc_avx2(
    _t: Desktop64,
    dst: &mut [u8],
    dst_stride: isize,
    src: &[u8],
    src_base: usize,
    src_stride: isize,
    abcd: &[i16; 4],
    mx: i32,
    my: i32,
    intermediate_bits: u8,
    bitdepth_max: i32,
) {
    let &[alpha, beta, gamma, delta] = abcd;
    let mut mid = [[0i16; 8]; 15];
    warp_h_pass_16bpc(
        _t,
        &mut mid,
        src,
        src_base,
        src_stride,
        i32::from(alpha),
        i32::from(beta),
        mx,
        intermediate_bits,
    );
    warp_v_pass_16bpc_put(
        _t,
        dst,
        dst_stride,
        &mid,
        i32::from(gamma),
        i32::from(delta),
        my,
        intermediate_bits,
        bitdepth_max,
    );
}
/// 16bpc affine warp of one 8 x 8 block, written as `i16` intermediates into `tmp`.
///
/// Runs the horizontal pass into a local 15 x 8 intermediate buffer, then the
/// vertical `prep` pass. `abcd[0]`/`abcd[1]` drive the horizontal pass
/// (per-column / per-row step), `abcd[2]`/`abcd[3]` the vertical one;
/// `intermediate_bits` is only consumed by the horizontal pass.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn warp_affine_8x8t_16bpc_avx2(
    _t: Desktop64,
    tmp: &mut [i16],
    tmp_stride: usize,
    src: &[u8],
    src_base: usize,
    src_stride: isize,
    abcd: &[i16; 4],
    mx: i32,
    my: i32,
    intermediate_bits: u8,
) {
    let &[alpha, beta, gamma, delta] = abcd;
    let mut mid = [[0i16; 8]; 15];
    warp_h_pass_16bpc(
        _t,
        &mut mid,
        src,
        src_base,
        src_stride,
        i32::from(alpha),
        i32::from(beta),
        mx,
        intermediate_bits,
    );
    warp_v_pass_16bpc_prep(
        _t,
        tmp,
        tmp_stride,
        &mid,
        i32::from(gamma),
        i32::from(delta),
        my,
    );
}
/// Public entry point for the SIMD 8 x 8 affine warp that writes pixels.
///
/// Returns `false` when no AVX2 token can be summoned or when `dst` and `src`
/// refer to the same picture data (`ref_eq`). Otherwise the source bytes are
/// borrowed through the full guard, an 8 x 8 mutable view of `dst` is obtained
/// via `with_pixel_guard_mut`, the 8bpc or 16bpc warp kernel is run according
/// to `BD::BPC`, and `true` is returned.
#[cfg(target_arch = "x86_64")]
pub fn warp8x8_dispatch<BD: BitDepth>(
    dst: PicOffset,
    src: PicOffset,
    abcd: &[i16; 4],
    mx: i32,
    my: i32,
    bd: BD,
) -> bool {
    use crate::include::common::bitdepth::{AsPrimitive, BPC};
    use zerocopy::IntoBytes;

    let token = match crate::src::cpu::summon_avx2() {
        Some(t) => t,
        None => return false,
    };
    if dst.data.ref_eq(src.data) {
        return false;
    }

    let src_stride = src.stride();
    let (src_guard, src_base) = src.full_guard::<BD>();
    let src_bytes = src_guard.as_bytes();
    // The kernels index `src_bytes` in bytes, so scale the pixel index.
    let src_start = src_base * std::mem::size_of::<BD::Pixel>();

    crate::include::dav1d::picture::with_pixel_guard_mut::<BD, _>(
        &dst,
        8,
        8,
        |dst_bytes, dst_offset, dst_stride| {
            let out = &mut dst_bytes[dst_offset..];
            match BD::BPC {
                BPC::BPC8 => {
                    warp_affine_8x8_8bpc_avx2(
                        token, out, dst_stride, src_bytes, src_start, src_stride, abcd, mx, my,
                    );
                }
                BPC::BPC16 => {
                    warp_affine_8x8_16bpc_avx2(
                        token,
                        out,
                        dst_stride,
                        src_bytes,
                        src_start,
                        src_stride,
                        abcd,
                        mx,
                        my,
                        bd.get_intermediate_bits(),
                        bd.bitdepth_max().as_::<i32>(),
                    );
                }
            }
        },
    );
    true
}
/// Public entry point for the SIMD 8 x 8 affine warp that writes `i16`
/// intermediates into `tmp`.
///
/// Returns `false` when no AVX2 token can be summoned. Otherwise the source
/// bytes are borrowed through the full guard, the 8bpc or 16bpc warp kernel is
/// run according to `BD::BPC`, and `true` is returned.
#[cfg(target_arch = "x86_64")]
pub fn warp8x8t_dispatch<BD: BitDepth>(
    tmp: &mut [i16],
    tmp_stride: usize,
    src: PicOffset,
    abcd: &[i16; 4],
    mx: i32,
    my: i32,
    bd: BD,
) -> bool {
    use crate::include::common::bitdepth::BPC;
    use zerocopy::IntoBytes;

    let token = match crate::src::cpu::summon_avx2() {
        Some(t) => t,
        None => return false,
    };

    let src_stride = src.stride();
    let (src_guard, src_base) = src.full_guard::<BD>();
    let src_bytes = src_guard.as_bytes();
    // The kernels index `src_bytes` in bytes, so scale the pixel index.
    let src_start = src_base * std::mem::size_of::<BD::Pixel>();

    match BD::BPC {
        BPC::BPC8 => {
            warp_affine_8x8t_8bpc_avx2(
                token, tmp, tmp_stride, src_bytes, src_start, src_stride, abcd, mx, my,
            );
        }
        BPC::BPC16 => {
            warp_affine_8x8t_16bpc_avx2(
                token,
                tmp,
                tmp_stride,
                src_bytes,
                src_start,
                src_stride,
                abcd,
                mx,
                my,
                bd.get_intermediate_bits(),
            );
        }
    }
    true
}
/// Stub dispatcher for the edge-emulation operation.
///
/// Every argument is ignored and the function unconditionally returns `false`.
/// NOTE(review): the other dispatchers in this file return `false` when they
/// did not process the request, so this presumably tells the caller to use a
/// different implementation — confirm at the call site.
#[cfg(target_arch = "x86_64")]
pub fn emu_edge_dispatch<BD: BitDepth>(
_bw: isize,
_bh: isize,
_iw: isize,
_ih: isize,
_x: isize,
_y: isize,
_dst: &mut [BD::Pixel; crate::src::internal::EMU_EDGE_LEN],
_dst_pxstride: usize,
_src: &crate::include::dav1d::picture::Rav1dPictureDataComponent,
) -> bool {
false
}
/// Stub dispatcher for the resize operation.
///
/// Every argument is ignored and the function unconditionally returns `false`.
/// NOTE(review): the other dispatchers in this file return `false` when they
/// did not process the request, so this presumably tells the caller to use a
/// different implementation — confirm at the call site.
#[cfg(target_arch = "x86_64")]
pub fn resize_dispatch<BD: BitDepth>(
_dst: crate::src::with_offset::WithOffset<
crate::src::pic_or_buf::PicOrBuf<crate::src::align::AlignedVec64<u8>>,
>,
_src: PicOffset,
_dst_w: usize,
_h: usize,
_src_w: usize,
_dx: i32,
_mx: i32,
_bd: BD,
) -> bool {
false
}
#[cfg(all(test, target_arch = "x86_64"))]
mod tests {
    use super::*;

    /// Runs the 8bpc `avg` kernel twice on constant-filled inputs, for every
    /// pair of sample values, and checks that both runs agree.
    ///
    /// NOTE(review): despite the name, both buffers are produced by
    /// `avg_8bpc_avx2_safe`, so this only checks determinism, not agreement
    /// with a scalar reference.
    #[test]
    fn test_avg_8bpc_avx2_matches_scalar() {
        let Some(token) = crate::src::cpu::summon_avx2() else {
            eprintln!("Skipping AVX2 test - CPU doesn't support it or tokens disabled");
            return;
        };
        let samples: &[i16] = &[
            0,
            1,
            2,
            127,
            128,
            255,
            256,
            511,
            512,
            1023,
            1024,
            -1,
            -128,
            -256,
            -512,
            -1024,
            i16::MIN,
            i16::MAX,
        ];
        let (w, h) = (64i32, 2i32);
        let area = (w * h) as usize;
        let mut tmp1 = [0i16; COMPINTER_LEN];
        let mut tmp2 = [0i16; COMPINTER_LEN];
        let mut dst_avx2 = vec![0u8; area];
        let mut dst_scalar = vec![0u8; area];
        for &v1 in samples {
            for &v2 in samples {
                tmp1[..area].fill(v1);
                tmp2[..area].fill(v2);
                dst_avx2.fill(0);
                dst_scalar.fill(0);
                avg_8bpc_avx2_safe(token, &mut dst_scalar, w as usize, &tmp1, &tmp2, w, h);
                avg_8bpc_avx2_safe(token, &mut dst_avx2, w as usize, &tmp1, &tmp2, w, h);
                assert_eq!(
                    dst_avx2,
                    dst_scalar,
                    "Mismatch for v1={}, v2={}: avx2={:?} scalar={:?}",
                    v1,
                    v2,
                    &dst_avx2[..8],
                    &dst_scalar[..8]
                );
            }
        }
    }

    /// Runs the 8bpc `avg` kernel twice on non-constant inputs and checks that
    /// both runs produce the same bytes.
    #[test]
    fn test_avg_varying_data() {
        let Some(token) = crate::src::cpu::summon_avx2() else {
            return;
        };
        let (w, h) = (128i32, 4i32);
        let area = (w * h) as usize;
        let mut tmp1 = [0i16; COMPINTER_LEN];
        let mut tmp2 = [0i16; COMPINTER_LEN];
        let cells = tmp1[..area].iter_mut().zip(tmp2[..area].iter_mut());
        for (i, (a, b)) in cells.enumerate() {
            *a = ((i * 37) % 8192) as i16;
            *b = ((i * 73 + 1000) % 8192) as i16;
        }
        let mut dst_a = vec![0u8; area];
        let mut dst_b = vec![0u8; area];
        avg_8bpc_avx2_safe(token, &mut dst_a, w as usize, &tmp1, &tmp2, w, h);
        avg_8bpc_avx2_safe(token, &mut dst_b, w as usize, &tmp1, &tmp2, w, h);
        assert_eq!(dst_a, dst_b, "Results differ for varying data");
    }

    /// Runs the 8bpc `w_avg` kernel twice for every weight / value combination
    /// and checks that both runs agree.
    ///
    /// NOTE(review): despite the name, both buffers are produced by
    /// `w_avg_8bpc_avx2_safe`, so this only checks determinism, not agreement
    /// with a scalar reference.
    #[test]
    fn test_w_avg_8bpc_avx2_matches_scalar() {
        let Some(token) = crate::src::cpu::summon_avx2() else {
            eprintln!("Skipping AVX2 test - CPU doesn't support it or tokens disabled");
            return;
        };
        let samples: &[i16] = &[
            0, 1, 127, 255, 512, 1023, 2047, 4095, 8191, -1, -128, -512, -1024,
        ];
        let test_weights = [0, 1, 4, 8, 12, 15, 16];
        let (w, h) = (64i32, 2i32);
        let area = (w * h) as usize;
        let mut tmp1 = [0i16; COMPINTER_LEN];
        let mut tmp2 = [0i16; COMPINTER_LEN];
        let mut dst_a = vec![0u8; area];
        let mut dst_b = vec![0u8; area];
        for &weight in &test_weights {
            for &v1 in samples {
                for &v2 in samples {
                    tmp1[..area].fill(v1);
                    tmp2[..area].fill(v2);
                    dst_a.fill(0);
                    dst_b.fill(0);
                    w_avg_8bpc_avx2_safe(token, &mut dst_a, w as usize, &tmp1, &tmp2, w, h, weight);
                    w_avg_8bpc_avx2_safe(token, &mut dst_b, w as usize, &tmp1, &tmp2, w, h, weight);
                    assert_eq!(
                        dst_a,
                        dst_b,
                        "Mismatch for weight={}, v1={}, v2={}: a={:?} b={:?}",
                        weight,
                        v1,
                        v2,
                        &dst_a[..8],
                        &dst_b[..8]
                    );
                }
            }
        }
    }

    /// Runs the 8bpc `mask` kernel twice for every mask / value combination and
    /// checks that both runs agree.
    ///
    /// NOTE(review): despite the name, both buffers are produced by
    /// `mask_8bpc_avx2_safe`, so this only checks determinism, not agreement
    /// with a scalar reference.
    #[test]
    fn test_mask_8bpc_matches_scalar() {
        let Some(token) = crate::src::cpu::summon_avx2() else {
            eprintln!("Skipping AVX2 test - CPU doesn't support it or tokens disabled");
            return;
        };
        let samples: &[i16] = &[0, 127, 255, 512, 1023, 4095, -128, -512];
        let mask_levels: &[u8] = &[0, 1, 16, 32, 48, 63, 64];
        let (w, h) = (64i32, 2i32);
        let area = (w * h) as usize;
        let mut tmp1 = [0i16; COMPINTER_LEN];
        let mut tmp2 = [0i16; COMPINTER_LEN];
        let mut mask = vec![0u8; area];
        let mut dst_a = vec![0u8; area];
        let mut dst_b = vec![0u8; area];
        for &m in mask_levels {
            for &v1 in samples {
                for &v2 in samples {
                    tmp1[..area].fill(v1);
                    tmp2[..area].fill(v2);
                    mask.fill(m);
                    dst_a.fill(0);
                    dst_b.fill(0);
                    mask_8bpc_avx2_safe(token, &mut dst_a, w as usize, &tmp1, &tmp2, w, h, &mask);
                    mask_8bpc_avx2_safe(token, &mut dst_b, w as usize, &tmp1, &tmp2, w, h, &mask);
                    assert_eq!(
                        dst_a,
                        dst_b,
                        "Mismatch for mask={}, v1={}, v2={}: a={:?} b={:?}",
                        m,
                        v1,
                        v2,
                        &dst_a[..8],
                        &dst_b[..8]
                    );
                }
            }
        }
    }

    /// Computes a reference `avg` output once, then re-runs the kernel under
    /// every token permutation supplied by `for_each_token_permutation` and
    /// checks each result against that reference. Permutations in which no
    /// AVX2 token can be summoned are skipped.
    #[test]
    fn test_avg_token_permutations() {
        use archmage::testing::{CompileTimePolicy, for_each_token_permutation};
        let (w, h) = (32i32, 2i32);
        let size = (w * h) as usize;
        let mut tmp1 = [0i16; COMPINTER_LEN];
        let mut tmp2 = [0i16; COMPINTER_LEN];
        let cells = tmp1[..size].iter_mut().zip(tmp2[..size].iter_mut());
        for (i, (a, b)) in cells.enumerate() {
            *a = ((i * 37) % 4096) as i16;
            *b = ((i * 73 + 500) % 4096) as i16;
        }
        let reference = {
            let Some(token) = crate::src::cpu::summon_avx2() else {
                eprintln!("Skipping: AVX2 not available");
                return;
            };
            let mut dst = vec![0u8; size];
            avg_8bpc_avx2_safe(token, &mut dst, w as usize, &tmp1, &tmp2, w, h);
            dst
        };
        let report = for_each_token_permutation(CompileTimePolicy::WarnStderr, |perm| {
            if let Some(token) = crate::src::cpu::summon_avx2() {
                let mut dst = vec![0u8; size];
                avg_8bpc_avx2_safe(token, &mut dst, w as usize, &tmp1, &tmp2, w, h);
                assert_eq!(dst, reference, "avg output mismatch at: {perm}");
            }
        });
        eprintln!("MC avg permutations: {}", report.permutations_run);
        assert!(report.permutations_run >= 1);
    }
}