#![allow(deprecated)] #![cfg_attr(not(feature = "unchecked"), forbid(unsafe_code))]
#![cfg_attr(feature = "unchecked", deny(unsafe_code))]
#![allow(unused)]
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
#[cfg(target_arch = "x86_64")]
use archmage::X64V4xToken;
use archmage::{Desktop64, Server64, SimdToken, arcane};
use std::ffi::c_int;
#[allow(non_camel_case_types)]
type ptrdiff_t = isize;
#[cfg(target_arch = "x86_64")]
use super::partial_simd;
#[cfg(target_arch = "x86_64")]
use crate::src::safe_simd::pixel_access::{
Flex, loadu_128, loadu_256, loadu_512, storeu_128, storeu_256, storeu_512,
};
use crate::include::common::bitdepth::DynPixel;
use crate::include::dav1d::picture::PicOffset;
use crate::src::ffi_safe::FFISafe;
/// Fills a `width` x `height` block of `dst` with the constant byte 128 (256-bit path).
///
/// Row `y` starts at index `dst_base + y * stride`, evaluated in `isize` and
/// cast back to `usize`. Each row is covered with 32-byte stores, then
/// 16-byte stores, then single-byte writes for whatever is left.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_128_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let splat = _mm256_set1_epi8(128u8 as i8);
    let base = dst_base as isize;
    for y in 0..height {
        let start = (base + y as isize * stride) as usize;
        let line = &mut dst[start..][..width];
        let mut col = 0;
        // Widest stores first.
        while col + 32 <= width {
            storeu_256!((&mut line[col..col + 32]), [u8; 32], splat);
            col += 32;
        }
        while col + 16 <= width {
            storeu_128!(&mut line[col..col + 16], [u8; 16], _mm256_castsi256_si128(splat));
            col += 16;
        }
        // Scalar tail for widths that are not a multiple of 16.
        for i in col..width {
            line[i] = 128;
        }
    }
}
/// C-ABI entry point that forwards to [`ipred_dc_128_8bpc_inner`].
///
/// Only `dst_ptr`, `stride`, `width` and `height` are used; every other
/// parameter is accepted and ignored.
///
/// # Safety
///
/// NOTE(review): contract inferred from the body — confirm against callers.
/// `dst_ptr` must be valid for reads and writes of the number of bytes returned
/// by `compute_ipred_buf_len(stride, width, height)`, and the running CPU must
/// provide the features represented by `Desktop64`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_dc_128_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    _topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let (w, h) = (width as usize, height as usize);
    let row_stride = stride as isize;
    // SAFETY: relies on the caller only dispatching here on a CPU that has the
    // features `Desktop64` stands for (this fn itself is gated on AVX2).
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let dst_len = compute_ipred_buf_len(row_stride, w, h);
    // SAFETY: relies on the caller guaranteeing `dst_ptr` is valid for `dst_len` bytes.
    let pixels = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    ipred_dc_128_8bpc_inner(token, pixels, 0, row_stride, w, h);
}
/// Fills a `width` x `height` block of `dst` with the constant byte 128 (512-bit path).
///
/// Row `y` starts at index `dst_base + y * stride`, evaluated in `isize` and
/// cast back to `usize`. Each row is covered with 64-byte stores, then
/// 32-byte, then 16-byte, then single-byte writes.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_128_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let splat_512 = _mm512_set1_epi8(128u8 as i8);
    let splat_256 = _mm256_set1_epi8(128u8 as i8);
    let base = dst_base as isize;
    for y in 0..height {
        let start = (base + y as isize * stride) as usize;
        let line = &mut dst[start..][..width];
        let mut col = 0;
        while col + 64 <= width {
            storeu_512!((&mut line[col..col + 64]), [u8; 64], splat_512);
            col += 64;
        }
        while col + 32 <= width {
            storeu_256!((&mut line[col..col + 32]), [u8; 32], splat_256);
            col += 32;
        }
        while col + 16 <= width {
            storeu_128!(&mut line[col..col + 16], [u8; 16], _mm256_castsi256_si128(splat_256));
            col += 16;
        }
        // Scalar tail for widths that are not a multiple of 16.
        for i in col..width {
            line[i] = 128;
        }
    }
}
/// Copies the `width` bytes at `topleft[tl_off + 1..]` into every one of the
/// `height` rows of `dst` (512-bit capable path).
///
/// Row `y` starts at index `dst_base + y * stride`, evaluated in `isize`.
/// Widths 4, 8, 16, 32 and 64 load the source once into a register and store
/// it per row; any other width falls back to a slice copy per row.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_v_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let src = tl_off + 1;
    let row_start = |y: usize| (dst_base as isize + y as isize * stride) as usize;
    match width {
        4 => {
            let top_val = _mm_cvtsi32_si128(i32::from_ne_bytes(
                topleft[src..src + 4].try_into().unwrap(),
            ));
            // The register contents never change, so extract the bytes once.
            let quad = _mm_cvtsi128_si32(top_val).to_ne_bytes();
            for y in 0..height {
                let at = row_start(y);
                dst[at..at + 4].copy_from_slice(&quad);
            }
        }
        8 => {
            let top_val = partial_simd::mm_loadl_epi64::<[u8; 8]>(
                (&topleft[src..src + 8]).try_into().unwrap(),
            );
            for y in 0..height {
                let at = row_start(y);
                partial_simd::mm_storel_epi64::<[u8; 8]>(
                    (&mut dst[at..at + 8]).try_into().unwrap(),
                    top_val,
                );
            }
        }
        16 => {
            let top_val = loadu_128!((&topleft[src..src + 16]), [u8; 16]);
            for y in 0..height {
                let at = row_start(y);
                storeu_128!((&mut dst[at..at + 16]), [u8; 16], top_val);
            }
        }
        32 => {
            let top_val = loadu_256!((&topleft[src..src + 32]), [u8; 32]);
            for y in 0..height {
                let at = row_start(y);
                storeu_256!((&mut dst[at..at + 32]), [u8; 32], top_val);
            }
        }
        64 => {
            let top_val = loadu_512!((&topleft[src..src + 64]), [u8; 64]);
            for y in 0..height {
                let at = row_start(y);
                storeu_512!((&mut dst[at..at + 64]), [u8; 64], top_val);
            }
        }
        _ => {
            // Generic width: plain slice copy per row.
            for y in 0..height {
                let at = row_start(y);
                dst[at..at + width].copy_from_slice(&topleft[src..src + width]);
            }
        }
    }
}
/// Copies the `width` bytes at `topleft[tl_off + 1..]` into every one of the
/// `height` rows of `dst` (256-bit path).
///
/// Row `y` starts at index `dst_base + y * stride`, evaluated in `isize`.
/// Widths 4, 8, 16 and 32 use a single register; width 64 uses two 32-byte
/// registers; any other width falls back to a slice copy per row.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_v_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let src = tl_off + 1;
    let row_start = |y: usize| (dst_base as isize + y as isize * stride) as usize;
    match width {
        4 => {
            let top_val = _mm_cvtsi32_si128(i32::from_ne_bytes(
                topleft[src..src + 4].try_into().unwrap(),
            ));
            // The register contents never change, so extract the bytes once.
            let quad = _mm_cvtsi128_si32(top_val).to_ne_bytes();
            for y in 0..height {
                let at = row_start(y);
                dst[at..at + 4].copy_from_slice(&quad);
            }
        }
        8 => {
            let top_val = partial_simd::mm_loadl_epi64::<[u8; 8]>(
                (&topleft[src..src + 8]).try_into().unwrap(),
            );
            for y in 0..height {
                let at = row_start(y);
                partial_simd::mm_storel_epi64::<[u8; 8]>(
                    (&mut dst[at..at + 8]).try_into().unwrap(),
                    top_val,
                );
            }
        }
        16 => {
            let top_val = loadu_128!((&topleft[src..src + 16]), [u8; 16]);
            for y in 0..height {
                let at = row_start(y);
                storeu_128!((&mut dst[at..at + 16]), [u8; 16], top_val);
            }
        }
        32 => {
            let top_val = loadu_256!((&topleft[src..src + 32]), [u8; 32]);
            for y in 0..height {
                let at = row_start(y);
                storeu_256!((&mut dst[at..at + 32]), [u8; 32], top_val);
            }
        }
        64 => {
            // Two 32-byte halves.
            let lower = loadu_256!((&topleft[src..src + 32]), [u8; 32]);
            let upper = loadu_256!((&topleft[src + 32..src + 64]), [u8; 32]);
            for y in 0..height {
                let at = row_start(y);
                storeu_256!((&mut dst[at..at + 32]), [u8; 32], lower);
                storeu_256!((&mut dst[at + 32..at + 64]), [u8; 32], upper);
            }
        }
        _ => {
            // Generic width: plain slice copy per row.
            for y in 0..height {
                let at = row_start(y);
                dst[at..at + width].copy_from_slice(&topleft[src..src + width]);
            }
        }
    }
}
/// C-ABI entry point that forwards to [`ipred_v_8bpc_inner`].
///
/// Uses `dst_ptr`, `stride`, `topleft`, `width` and `height`; every other
/// parameter is accepted and ignored.
///
/// # Safety
///
/// NOTE(review): contract inferred from the body — confirm against callers.
/// `dst_ptr` must be valid for the number of bytes returned by
/// `compute_ipred_buf_len(stride, width, height)`, `topleft` must satisfy
/// whatever `compute_topleft_slice` requires, and the running CPU must provide
/// the features represented by `Desktop64`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_v_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let (w, h) = (width as usize, height as usize);
    let row_stride = stride as isize;
    // SAFETY: relies on the caller only dispatching here on a CPU that has the
    // features `Desktop64` stands for (this fn itself is gated on AVX2).
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let dst_len = compute_ipred_buf_len(row_stride, w, h);
    // SAFETY: relies on the caller guaranteeing `dst_ptr` is valid for `dst_len` bytes.
    let pixels = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    let (edge, edge_off) = compute_topleft_slice(topleft as *const u8, w, h);
    ipred_v_8bpc_inner(token, pixels, 0, row_stride, edge, edge_off, w, h);
}
/// Fills each of the `height` rows of `dst` with a single byte taken from
/// `topleft[tl_off - y - 1]` for row `y` (256-bit path).
///
/// Row `y` starts at index `dst_base + y * stride`, evaluated in `isize`.
/// Each row is covered with 32-byte stores, then 16-byte stores, then
/// single-byte writes.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_h_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let base = dst_base as isize;
    for y in 0..height {
        let start = (base + y as isize * stride) as usize;
        let line = &mut dst[start..][..width];
        // One source byte per row, broadcast across the register.
        let px = topleft[tl_off - y - 1];
        let splat = _mm256_set1_epi8(px as i8);
        let mut col = 0;
        while col + 32 <= width {
            storeu_256!((&mut line[col..col + 32]), [u8; 32], splat);
            col += 32;
        }
        while col + 16 <= width {
            storeu_128!(&mut line[col..col + 16], [u8; 16], _mm256_castsi256_si128(splat));
            col += 16;
        }
        for i in col..width {
            line[i] = px;
        }
    }
}
/// C-ABI entry point that forwards to [`ipred_h_8bpc_inner`].
///
/// Uses `dst_ptr`, `stride`, `topleft`, `width` and `height`; every other
/// parameter is accepted and ignored.
///
/// # Safety
///
/// NOTE(review): contract inferred from the body — confirm against callers.
/// `dst_ptr` must be valid for the number of bytes returned by
/// `compute_ipred_buf_len(stride, width, height)`, `topleft` must satisfy
/// whatever `compute_topleft_slice` requires, and the running CPU must provide
/// the features represented by `Desktop64`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_h_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let (w, h) = (width as usize, height as usize);
    let row_stride = stride as isize;
    // SAFETY: relies on the caller only dispatching here on a CPU that has the
    // features `Desktop64` stands for (this fn itself is gated on AVX2).
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let dst_len = compute_ipred_buf_len(row_stride, w, h);
    // SAFETY: relies on the caller guaranteeing `dst_ptr` is valid for `dst_len` bytes.
    let pixels = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    let (edge, edge_off) = compute_topleft_slice(topleft as *const u8, w, h);
    ipred_h_8bpc_inner(token, pixels, 0, row_stride, edge, edge_off, w, h);
}
/// Fills each of the `height` rows of `dst` with a single byte taken from
/// `topleft[tl_off - y - 1]` for row `y` (512-bit path).
///
/// Row `y` starts at index `dst_base + y * stride`, evaluated in `isize`.
/// Each row is covered with 64-byte stores, then 32-byte, then 16-byte, then
/// single-byte writes.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_h_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let base = dst_base as isize;
    for y in 0..height {
        let start = (base + y as isize * stride) as usize;
        let line = &mut dst[start..][..width];
        // One source byte per row, broadcast at both register widths.
        let px = topleft[tl_off - y - 1];
        let splat_512 = _mm512_set1_epi8(px as i8);
        let splat_256 = _mm256_set1_epi8(px as i8);
        let mut col = 0;
        while col + 64 <= width {
            storeu_512!((&mut line[col..col + 64]), [u8; 64], splat_512);
            col += 64;
        }
        while col + 32 <= width {
            storeu_256!((&mut line[col..col + 32]), [u8; 32], splat_256);
            col += 32;
        }
        while col + 16 <= width {
            storeu_128!(&mut line[col..col + 16], [u8; 16], _mm256_castsi256_si128(splat_256));
            col += 16;
        }
        for i in col..width {
            line[i] = px;
        }
    }
}
/// Fills a `width` x `height` block of `dst` with the rounded average of
/// `topleft[tl_off + 1 ..= tl_off + width]` and
/// `topleft[tl_off - height ..= tl_off - 1]` (512-bit path).
///
/// The average is `(sum + count / 2) / count` with `count = width + height`,
/// truncated to `u8`. Row `y` starts at index `dst_base + y * stride`,
/// evaluated in `isize`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    // Accumulate the entries above `tl_off` first, then the ones below it.
    let above = (0..width).fold(0u32, |acc, x| acc + topleft[tl_off + 1 + x] as u32);
    let sum = (0..height).fold(above, |acc, y| acc + topleft[tl_off - y - 1] as u32);
    let count = (width + height) as u32;
    let dc = ((sum + (count >> 1)) / count) as u8;
    let splat_512 = _mm512_set1_epi8(dc as i8);
    let splat_256 = _mm256_set1_epi8(dc as i8);
    let base = dst_base as isize;
    for y in 0..height {
        let start = (base + y as isize * stride) as usize;
        let line = &mut dst[start..][..width];
        let mut col = 0;
        while col + 64 <= width {
            storeu_512!((&mut line[col..col + 64]), [u8; 64], splat_512);
            col += 64;
        }
        while col + 32 <= width {
            storeu_256!((&mut line[col..col + 32]), [u8; 32], splat_256);
            col += 32;
        }
        while col + 16 <= width {
            storeu_128!(&mut line[col..col + 16], [u8; 16], _mm256_castsi256_si128(splat_256));
            col += 16;
        }
        for i in col..width {
            line[i] = dc;
        }
    }
}
/// Fills a `width` x `height` block of `dst` with the rounded average of
/// `topleft[tl_off + 1 ..= tl_off + width]` (512-bit path).
///
/// The average is `(sum + width / 2) / width`, truncated to `u8`. Row `y`
/// starts at index `dst_base + y * stride`, evaluated in `isize`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_top_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let sum = (0..width).fold(0u32, |acc, x| acc + topleft[tl_off + 1 + x] as u32);
    let count = width as u32;
    let dc = ((sum + (count >> 1)) / count) as u8;
    let splat_512 = _mm512_set1_epi8(dc as i8);
    let splat_256 = _mm256_set1_epi8(dc as i8);
    let base = dst_base as isize;
    for y in 0..height {
        let start = (base + y as isize * stride) as usize;
        let line = &mut dst[start..][..width];
        let mut col = 0;
        while col + 64 <= width {
            storeu_512!((&mut line[col..col + 64]), [u8; 64], splat_512);
            col += 64;
        }
        while col + 32 <= width {
            storeu_256!((&mut line[col..col + 32]), [u8; 32], splat_256);
            col += 32;
        }
        while col + 16 <= width {
            storeu_128!(&mut line[col..col + 16], [u8; 16], _mm256_castsi256_si128(splat_256));
            col += 16;
        }
        for i in col..width {
            line[i] = dc;
        }
    }
}
/// Fills a `width` x `height` block of `dst` with the rounded average of
/// `topleft[tl_off - height ..= tl_off - 1]` (512-bit path).
///
/// The average is `(sum + height / 2) / height`, truncated to `u8`. Row `y`
/// starts at index `dst_base + y * stride`, evaluated in `isize`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_left_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let sum = (0..height).fold(0u32, |acc, y| acc + topleft[tl_off - y - 1] as u32);
    let count = height as u32;
    let dc = ((sum + (count >> 1)) / count) as u8;
    let splat_512 = _mm512_set1_epi8(dc as i8);
    let splat_256 = _mm256_set1_epi8(dc as i8);
    let base = dst_base as isize;
    for y in 0..height {
        let start = (base + y as isize * stride) as usize;
        let line = &mut dst[start..][..width];
        let mut col = 0;
        while col + 64 <= width {
            storeu_512!((&mut line[col..col + 64]), [u8; 64], splat_512);
            col += 64;
        }
        while col + 32 <= width {
            storeu_256!((&mut line[col..col + 32]), [u8; 32], splat_256);
            col += 32;
        }
        while col + 16 <= width {
            storeu_128!(&mut line[col..col + 16], [u8; 16], _mm256_castsi256_si128(splat_256));
            col += 16;
        }
        for i in col..width {
            line[i] = dc;
        }
    }
}
/// Fills a `width` x `height` block of `dst` with the rounded average of
/// `topleft[tl_off + 1 ..= tl_off + width]` and
/// `topleft[tl_off - height ..= tl_off - 1]` (256-bit path).
///
/// The average is `(sum + count / 2) / count` with `count = width + height`,
/// truncated to `u8`. Row `y` starts at index `dst_base + y * stride`,
/// evaluated in `isize`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    // Accumulate the entries above `tl_off` first, then the ones below it.
    let above = (0..width).fold(0u32, |acc, x| acc + topleft[tl_off + 1 + x] as u32);
    let sum = (0..height).fold(above, |acc, y| acc + topleft[tl_off - y - 1] as u32);
    let count = (width + height) as u32;
    let dc = ((sum + (count >> 1)) / count) as u8;
    let splat = _mm256_set1_epi8(dc as i8);
    let base = dst_base as isize;
    for y in 0..height {
        let start = (base + y as isize * stride) as usize;
        let line = &mut dst[start..][..width];
        let mut col = 0;
        while col + 32 <= width {
            storeu_256!((&mut line[col..col + 32]), [u8; 32], splat);
            col += 32;
        }
        while col + 16 <= width {
            storeu_128!(&mut line[col..col + 16], [u8; 16], _mm256_castsi256_si128(splat));
            col += 16;
        }
        for i in col..width {
            line[i] = dc;
        }
    }
}
/// C-ABI entry point that forwards to [`ipred_dc_8bpc_inner`].
///
/// Uses `dst_ptr`, `stride`, `topleft`, `width` and `height`; every other
/// parameter is accepted and ignored.
///
/// # Safety
///
/// NOTE(review): contract inferred from the body — confirm against callers.
/// `dst_ptr` must be valid for the number of bytes returned by
/// `compute_ipred_buf_len(stride, width, height)`, `topleft` must satisfy
/// whatever `compute_topleft_slice` requires, and the running CPU must provide
/// the features represented by `Desktop64`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_dc_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let (w, h) = (width as usize, height as usize);
    let row_stride = stride as isize;
    // SAFETY: relies on the caller only dispatching here on a CPU that has the
    // features `Desktop64` stands for (this fn itself is gated on AVX2).
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let dst_len = compute_ipred_buf_len(row_stride, w, h);
    // SAFETY: relies on the caller guaranteeing `dst_ptr` is valid for `dst_len` bytes.
    let pixels = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    let (edge, edge_off) = compute_topleft_slice(topleft as *const u8, w, h);
    ipred_dc_8bpc_inner(token, pixels, 0, row_stride, edge, edge_off, w, h);
}
/// Fills a `width` x `height` block of `dst` with the rounded average of
/// `topleft[tl_off + 1 ..= tl_off + width]` (256-bit path).
///
/// The average is `(sum + width / 2) / width`, truncated to `u8`. Row `y`
/// starts at index `dst_base + y * stride`, evaluated in `isize`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_top_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let sum = (0..width).fold(0u32, |acc, x| acc + topleft[tl_off + 1 + x] as u32);
    let count = width as u32;
    let dc = ((sum + (count >> 1)) / count) as u8;
    let splat = _mm256_set1_epi8(dc as i8);
    let base = dst_base as isize;
    for y in 0..height {
        let start = (base + y as isize * stride) as usize;
        let line = &mut dst[start..][..width];
        let mut col = 0;
        while col + 32 <= width {
            storeu_256!((&mut line[col..col + 32]), [u8; 32], splat);
            col += 32;
        }
        while col + 16 <= width {
            storeu_128!(&mut line[col..col + 16], [u8; 16], _mm256_castsi256_si128(splat));
            col += 16;
        }
        for i in col..width {
            line[i] = dc;
        }
    }
}
/// C-ABI entry point that forwards to [`ipred_dc_top_8bpc_inner`].
///
/// Uses `dst_ptr`, `stride`, `topleft`, `width` and `height`; every other
/// parameter is accepted and ignored.
///
/// # Safety
///
/// NOTE(review): contract inferred from the body — confirm against callers.
/// `dst_ptr` must be valid for the number of bytes returned by
/// `compute_ipred_buf_len(stride, width, height)`, `topleft` must satisfy
/// whatever `compute_topleft_slice` requires, and the running CPU must provide
/// the features represented by `Desktop64`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_dc_top_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let (w, h) = (width as usize, height as usize);
    let row_stride = stride as isize;
    // SAFETY: relies on the caller only dispatching here on a CPU that has the
    // features `Desktop64` stands for (this fn itself is gated on AVX2).
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let dst_len = compute_ipred_buf_len(row_stride, w, h);
    // SAFETY: relies on the caller guaranteeing `dst_ptr` is valid for `dst_len` bytes.
    let pixels = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    let (edge, edge_off) = compute_topleft_slice(topleft as *const u8, w, h);
    ipred_dc_top_8bpc_inner(token, pixels, 0, row_stride, edge, edge_off, w, h);
}
/// Fills a `width` x `height` block of `dst` with the rounded average of
/// `topleft[tl_off - height ..= tl_off - 1]` (256-bit path).
///
/// The average is `(sum + height / 2) / height`, truncated to `u8`. Row `y`
/// starts at index `dst_base + y * stride`, evaluated in `isize`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_left_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let sum = (0..height).fold(0u32, |acc, y| acc + topleft[tl_off - y - 1] as u32);
    let count = height as u32;
    let dc = ((sum + (count >> 1)) / count) as u8;
    let splat = _mm256_set1_epi8(dc as i8);
    let base = dst_base as isize;
    for y in 0..height {
        let start = (base + y as isize * stride) as usize;
        let line = &mut dst[start..][..width];
        let mut col = 0;
        while col + 32 <= width {
            storeu_256!((&mut line[col..col + 32]), [u8; 32], splat);
            col += 32;
        }
        while col + 16 <= width {
            storeu_128!(&mut line[col..col + 16], [u8; 16], _mm256_castsi256_si128(splat));
            col += 16;
        }
        for i in col..width {
            line[i] = dc;
        }
    }
}
/// C-ABI entry point that forwards to [`ipred_dc_left_8bpc_inner`].
///
/// Uses `dst_ptr`, `stride`, `topleft`, `width` and `height`; every other
/// parameter is accepted and ignored.
///
/// # Safety
///
/// NOTE(review): contract inferred from the body — confirm against callers.
/// `dst_ptr` must be valid for the number of bytes returned by
/// `compute_ipred_buf_len(stride, width, height)`, `topleft` must satisfy
/// whatever `compute_topleft_slice` requires, and the running CPU must provide
/// the features represented by `Desktop64`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_dc_left_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let (w, h) = (width as usize, height as usize);
    let row_stride = stride as isize;
    // SAFETY: relies on the caller only dispatching here on a CPU that has the
    // features `Desktop64` stands for (this fn itself is gated on AVX2).
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let dst_len = compute_ipred_buf_len(row_stride, w, h);
    // SAFETY: relies on the caller guaranteeing `dst_ptr` is valid for `dst_len` bytes.
    let pixels = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    let (edge, edge_off) = compute_topleft_slice(topleft as *const u8, w, h);
    ipred_dc_left_8bpc_inner(token, pixels, 0, row_stride, edge, edge_off, w, h);
}
/// Paeth-style predictor over a `width` x `height` block, 16 pixels per
/// iteration using 512-bit registers of 32-bit lanes.
///
/// For each output pixel the three candidates are `topleft[tl_off - y - 1]`
/// (called left), `topleft[tl_off + 1 + x]` (top) and `topleft[tl_off]`
/// (top-left). With `base = left + top - topleft`, the candidate whose
/// absolute difference from `base` is smallest is written, preferring left,
/// then top, then top-left on ties. Row `y` starts at index
/// `dst_base + y * stride`, evaluated in `isize`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_paeth_8bpc_avx512_inner(
_token: Server64,
dst: &mut [u8],
dst_base: usize,
stride: isize,
topleft: &[u8],
tl_off: usize,
width: usize,
height: usize,
) {
let mut dst = dst.flex_mut();
let topleft = topleft.flex();
// The top-left candidate is the same for every pixel of the block.
let topleft_val = topleft[tl_off] as i32;
let topleft_vec = _mm512_set1_epi32(topleft_val);
for y in 0..height {
let row_off = (dst_base as isize + y as isize * stride) as usize;
// The left candidate is constant along a row.
let left_val = topleft[tl_off - y - 1] as i32;
let left_vec = _mm512_set1_epi32(left_val);
let mut x = 0;
while x + 16 <= width {
// Widen 16 top bytes to 32-bit lanes so the arithmetic below cannot wrap.
let top_bytes = loadu_128!(&topleft[tl_off + 1 + x..tl_off + 1 + x + 16], [u8; 16]);
let top = _mm512_cvtepu8_epi32(top_bytes);
let base = _mm512_sub_epi32(_mm512_add_epi32(left_vec, top), topleft_vec);
let ldiff = _mm512_abs_epi32(_mm512_sub_epi32(left_vec, base));
let tdiff = _mm512_abs_epi32(_mm512_sub_epi32(top, base));
let tldiff = _mm512_abs_epi32(_mm512_sub_epi32(topleft_vec, base));
// `a <= b` is expressed as the complement of the `a > b` lane mask.
let ld_le_td = !_mm512_cmpgt_epi32_mask(ldiff, tdiff);
let ld_le_tld = !_mm512_cmpgt_epi32_mask(ldiff, tldiff);
let td_le_tld = !_mm512_cmpgt_epi32_mask(tdiff, tldiff);
let use_left = ld_le_td & ld_le_tld;
let use_top = !use_left & td_le_tld;
// mask_blend takes the second operand where the mask bit is set:
// inner blend picks top over top-left, outer blend picks left over both.
let result = _mm512_mask_blend_epi32(
use_left,
_mm512_mask_blend_epi32(use_top, topleft_vec, top),
left_vec,
);
// Clamp at zero, then narrow each 32-bit lane to a byte with unsigned saturation.
let clamped = _mm512_max_epi32(result, _mm512_setzero_si512());
let result_u8: __m128i = _mm512_cvtusepi32_epi8(clamped);
storeu_128!(&mut dst[row_off + x..row_off + x + 16], [u8; 16], result_u8);
x += 16;
}
// Scalar tail: same selection rule for the columns the vector loop did not cover.
let row = &mut dst[row_off..][..width];
while x < width {
let top_val = topleft[tl_off + 1 + x] as i32;
let base = left_val + top_val - topleft_val;
let ldiff = (left_val - base).abs();
let tdiff = (top_val - base).abs();
let tldiff = (topleft_val - base).abs();
let result = if ldiff <= tdiff && ldiff <= tldiff {
left_val
} else if tdiff <= tldiff {
top_val
} else {
topleft_val
};
row[x] = result as u8;
x += 1;
}
}
}
/// Two-directional weighted blend over a `width` x `height` block, 16 pixels
/// per iteration using 512-bit registers of 32-bit lanes.
///
/// For pixel `(x, y)` the value written is
/// `(w_v * top + (256 - w_v) * bottom + w_h * left + (256 - w_h) * right + 256) >> 9`
/// where `top = topleft[tl_off + 1 + x]`, `left = topleft[tl_off - y - 1]`,
/// `right = topleft[tl_off + width]`, `bottom = topleft[tl_off - height]`,
/// `w_h = dav1d_sm_weights[width + x]` and `w_v = dav1d_sm_weights[height + y]`.
/// Row `y` starts at index `dst_base + y * stride`, evaluated in `isize`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let hor_weights = &dav1d_sm_weights[width..][..width];
    let ver_weights = &dav1d_sm_weights[height..][..height];
    let right = topleft[tl_off + width] as i32;
    let bottom = topleft[tl_off - height] as i32;
    let right_v = _mm512_set1_epi32(right);
    let bottom_v = _mm512_set1_epi32(bottom);
    let round_v = _mm512_set1_epi32(256);
    let scale_v = _mm512_set1_epi32(256);
    let zero_v = _mm512_setzero_si512();
    let base = dst_base as isize;
    for y in 0..height {
        let start = (base + y as isize * stride) as usize;
        let left = topleft[tl_off - y - 1] as i32;
        let left_v = _mm512_set1_epi32(left);
        let wv = ver_weights[y] as i32;
        let wv_v = _mm512_set1_epi32(wv);
        let wv_inv_v = _mm512_sub_epi32(scale_v, wv_v);
        let mut col = 0;
        while col + 16 <= width {
            let top_at = tl_off + 1 + col;
            let top_bytes = loadu_128!(&topleft[top_at..top_at + 16], [u8; 16]);
            let top_v = _mm512_cvtepu8_epi32(top_bytes);
            let wh_bytes = loadu_128!(&hor_weights[col..col + 16], [u8; 16]);
            let wh_v = _mm512_cvtepu8_epi32(wh_bytes);
            let wh_inv_v = _mm512_sub_epi32(scale_v, wh_v);
            // Vertical and horizontal contributions, summed before rounding.
            let vertical = _mm512_add_epi32(
                _mm512_mullo_epi32(wv_v, top_v),
                _mm512_mullo_epi32(wv_inv_v, bottom_v),
            );
            let horizontal = _mm512_add_epi32(
                _mm512_mullo_epi32(wh_v, left_v),
                _mm512_mullo_epi32(wh_inv_v, right_v),
            );
            let blended = _mm512_add_epi32(vertical, horizontal);
            let shifted = _mm512_srai_epi32::<9>(_mm512_add_epi32(blended, round_v));
            // Clamp at zero, then narrow to bytes with unsigned saturation.
            let non_negative = _mm512_max_epi32(shifted, zero_v);
            let out_bytes: __m128i = _mm512_cvtusepi32_epi8(non_negative);
            storeu_128!(&mut dst[start + col..start + col + 16], [u8; 16], out_bytes);
            col += 16;
        }
        // Scalar tail for the columns the vector loop did not cover.
        let line = &mut dst[start..][..width];
        for i in col..width {
            let top = topleft[tl_off + 1 + i] as i32;
            let wh = hor_weights[i] as i32;
            let blended = wv * top + (256 - wv) * bottom + wh * left + (256 - wh) * right;
            line[i] = ((blended + 256) >> 9) as u8;
        }
    }
}
/// Vertical weighted blend over a `width` x `height` block, 16 pixels per
/// iteration using 512-bit registers of 32-bit lanes.
///
/// For pixel `(x, y)` the value written is
/// `(w_v * top + (256 - w_v) * bottom + 128) >> 8` where
/// `top = topleft[tl_off + 1 + x]`, `bottom = topleft[tl_off - height]` and
/// `w_v = dav1d_sm_weights[height + y]`. Row `y` starts at index
/// `dst_base + y * stride`, evaluated in `isize`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_v_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let ver_weights = &dav1d_sm_weights[height..][..height];
    let bottom = topleft[tl_off - height] as i32;
    let bottom_v = _mm512_set1_epi32(bottom);
    let round_v = _mm512_set1_epi32(128);
    let scale_v = _mm512_set1_epi32(256);
    let zero_v = _mm512_setzero_si512();
    let base = dst_base as isize;
    for y in 0..height {
        let start = (base + y as isize * stride) as usize;
        let wv = ver_weights[y] as i32;
        let wv_v = _mm512_set1_epi32(wv);
        let wv_inv_v = _mm512_sub_epi32(scale_v, wv_v);
        let mut col = 0;
        while col + 16 <= width {
            let top_at = tl_off + 1 + col;
            let top_bytes = loadu_128!(&topleft[top_at..top_at + 16], [u8; 16]);
            let top_v = _mm512_cvtepu8_epi32(top_bytes);
            let blended = _mm512_add_epi32(
                _mm512_mullo_epi32(wv_v, top_v),
                _mm512_mullo_epi32(wv_inv_v, bottom_v),
            );
            let shifted = _mm512_srai_epi32::<8>(_mm512_add_epi32(blended, round_v));
            // Clamp at zero, then narrow to bytes with unsigned saturation.
            let non_negative = _mm512_max_epi32(shifted, zero_v);
            let out_bytes: __m128i = _mm512_cvtusepi32_epi8(non_negative);
            storeu_128!(&mut dst[start + col..start + col + 16], [u8; 16], out_bytes);
            col += 16;
        }
        // Scalar tail for the columns the vector loop did not cover.
        let line = &mut dst[start..][..width];
        for i in col..width {
            let top = topleft[tl_off + 1 + i] as i32;
            let blended = wv * top + (256 - wv) * bottom;
            line[i] = ((blended + 128) >> 8) as u8;
        }
    }
}
/// Horizontal weighted blend over a `width` x `height` block, 16 pixels per
/// iteration using 512-bit registers of 32-bit lanes.
///
/// For pixel `(x, y)` the value written is
/// `(w_h * left + (256 - w_h) * right + 128) >> 8` where
/// `left = topleft[tl_off - y - 1]`, `right = topleft[tl_off + width]` and
/// `w_h = dav1d_sm_weights[width + x]`. Row `y` starts at index
/// `dst_base + y * stride`, evaluated in `isize`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_h_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let hor_weights = &dav1d_sm_weights[width..][..width];
    let right = topleft[tl_off + width] as i32;
    let right_v = _mm512_set1_epi32(right);
    let round_v = _mm512_set1_epi32(128);
    let scale_v = _mm512_set1_epi32(256);
    let zero_v = _mm512_setzero_si512();
    let base = dst_base as isize;
    for y in 0..height {
        let start = (base + y as isize * stride) as usize;
        let left = topleft[tl_off - y - 1] as i32;
        let left_v = _mm512_set1_epi32(left);
        let mut col = 0;
        while col + 16 <= width {
            let wh_bytes = loadu_128!(&hor_weights[col..col + 16], [u8; 16]);
            let wh_v = _mm512_cvtepu8_epi32(wh_bytes);
            let wh_inv_v = _mm512_sub_epi32(scale_v, wh_v);
            let blended = _mm512_add_epi32(
                _mm512_mullo_epi32(wh_v, left_v),
                _mm512_mullo_epi32(wh_inv_v, right_v),
            );
            let shifted = _mm512_srai_epi32::<8>(_mm512_add_epi32(blended, round_v));
            // Clamp at zero, then narrow to bytes with unsigned saturation.
            let non_negative = _mm512_max_epi32(shifted, zero_v);
            let out_bytes: __m128i = _mm512_cvtusepi32_epi8(non_negative);
            storeu_128!(&mut dst[start + col..start + col + 16], [u8; 16], out_bytes);
            col += 16;
        }
        // Scalar tail for the columns the vector loop did not cover.
        let line = &mut dst[start..][..width];
        for i in col..width {
            let wh = hor_weights[i] as i32;
            let blended = wh * left + (256 - wh) * right;
            line[i] = ((blended + 128) >> 8) as u8;
        }
    }
}
/// Paeth-style predictor over a `width` x `height` block, 8 pixels per
/// iteration using 256-bit registers of 32-bit lanes.
///
/// For each output pixel the three candidates are `topleft[tl_off - y - 1]`
/// (called left), `topleft[tl_off + 1 + x]` (top) and `topleft[tl_off]`
/// (top-left). With `base = left + top - topleft`, the candidate whose
/// absolute difference from `base` is smallest is written, preferring left,
/// then top, then top-left on ties. Row `y` starts at index
/// `dst_base + y * stride`, evaluated in `isize`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_paeth_8bpc_inner(
_token: Desktop64,
dst: &mut [u8],
dst_base: usize,
stride: isize,
topleft: &[u8],
tl_off: usize,
width: usize,
height: usize,
) {
let mut dst = dst.flex_mut();
let topleft = topleft.flex();
// The top-left candidate is the same for every pixel of the block.
let topleft_val = topleft[tl_off] as i32;
let topleft_vec = _mm256_set1_epi32(topleft_val);
for y in 0..height {
let row_off = (dst_base as isize + y as isize * stride) as usize;
// The left candidate is constant along a row.
let left_val = topleft[tl_off - y - 1] as i32;
let left_vec = _mm256_set1_epi32(left_val);
let mut x = 0;
while x + 8 <= width {
// Load 8 top bytes and widen them to 32-bit lanes.
let top_bytes = partial_simd::mm_loadl_epi64::<[u8; 8]>(
(&topleft[tl_off + 1 + x..tl_off + 1 + x + 8])
.try_into()
.unwrap(),
);
let top_lo = _mm256_cvtepu8_epi32(top_bytes);
let base = _mm256_sub_epi32(_mm256_add_epi32(left_vec, top_lo), topleft_vec);
let ldiff = _mm256_abs_epi32(_mm256_sub_epi32(left_vec, base));
let tdiff = _mm256_abs_epi32(_mm256_sub_epi32(top_lo, base));
let tldiff = _mm256_abs_epi32(_mm256_sub_epi32(topleft_vec, base));
// `a <= b` is built as `(b > a) | (a == b)` since only gt/eq compares are used here.
let ld_le_td = _mm256_or_si256(
_mm256_cmpgt_epi32(tdiff, ldiff),
_mm256_cmpeq_epi32(ldiff, tdiff),
);
let ld_le_tld = _mm256_or_si256(
_mm256_cmpgt_epi32(tldiff, ldiff),
_mm256_cmpeq_epi32(ldiff, tldiff),
);
let td_le_tld = _mm256_or_si256(
_mm256_cmpgt_epi32(tldiff, tdiff),
_mm256_cmpeq_epi32(tdiff, tldiff),
);
let use_left = _mm256_and_si256(ld_le_td, ld_le_tld);
// andnot computes `!use_left & td_le_tld`.
let use_top = _mm256_andnot_si256(use_left, td_le_tld);
// blendv takes the second operand where the mask is set:
// inner blend picks top over top-left, outer blend picks left over both.
let result = _mm256_blendv_epi8(
_mm256_blendv_epi8(topleft_vec, top_lo, use_top),
left_vec,
use_left,
);
// Gather the low byte of each 32-bit lane into the first 4 bytes of each
// 128-bit half (index -1 zeroes the destination byte).
let packed = _mm256_shuffle_epi8(
result,
_mm256_setr_epi8(
0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
),
);
// Interleave the two halves' first dwords so the low 8 bytes hold all 8 pixels.
let lo = _mm256_castsi256_si128(packed);
let hi = _mm256_extracti128_si256::<1>(packed);
let combined = _mm_unpacklo_epi32(lo, hi);
partial_simd::mm_storel_epi64::<[u8; 8]>(
(&mut dst[row_off + x..row_off + x + 8]).try_into().unwrap(),
combined,
);
x += 8;
}
// Scalar tail: same selection rule for the columns the vector loop did not cover.
let row = &mut dst[row_off..][..width];
while x < width {
let top_val = topleft[tl_off + 1 + x] as i32;
let base = left_val + top_val - topleft_val;
let ldiff = (left_val - base).abs();
let tdiff = (top_val - base).abs();
let tldiff = (topleft_val - base).abs();
let result = if ldiff <= tdiff && ldiff <= tldiff {
left_val
} else if tdiff <= tldiff {
top_val
} else {
topleft_val
};
row[x] = result as u8;
x += 1;
}
}
}
/// C-ABI entry point that forwards to [`ipred_paeth_8bpc_inner`].
///
/// Uses `dst_ptr`, `stride`, `topleft`, `width` and `height`; every other
/// parameter is accepted and ignored.
///
/// # Safety
///
/// NOTE(review): contract inferred from the body — confirm against callers.
/// `dst_ptr` must be valid for the number of bytes returned by
/// `compute_ipred_buf_len(stride, width, height)`, `topleft` must satisfy
/// whatever `compute_topleft_slice` requires, and the running CPU must provide
/// the features represented by `Desktop64`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_paeth_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let (w, h) = (width as usize, height as usize);
    let row_stride = stride as isize;
    // SAFETY: relies on the caller only dispatching here on a CPU that has the
    // features `Desktop64` stands for (this fn itself is gated on AVX2).
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let dst_len = compute_ipred_buf_len(row_stride, w, h);
    // SAFETY: relies on the caller guaranteeing `dst_ptr` is valid for `dst_len` bytes.
    let pixels = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    let (edge, edge_off) = compute_topleft_slice(topleft as *const u8, w, h);
    ipred_paeth_8bpc_inner(token, pixels, 0, row_stride, edge, edge_off, w, h);
}
use crate::src::tables::dav1d_sm_weights;
/// Two-directional weighted blend over a `width` x `height` block, 8 pixels
/// per iteration using 256-bit registers of 32-bit lanes.
///
/// For pixel `(x, y)` the value written is
/// `(w_v * top + (256 - w_v) * bottom + w_h * left + (256 - w_h) * right + 256) >> 9`
/// where `top = topleft[tl_off + 1 + x]`, `left = topleft[tl_off - y - 1]`,
/// `right = topleft[tl_off + width]`, `bottom = topleft[tl_off - height]`,
/// `w_h = dav1d_sm_weights[width + x]` and `w_v = dav1d_sm_weights[height + y]`.
/// Row `y` starts at index `dst_base + y * stride`, evaluated in `isize`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let hor_weights = &dav1d_sm_weights[width..][..width];
    let ver_weights = &dav1d_sm_weights[height..][..height];
    let right = topleft[tl_off + width] as i32;
    let bottom = topleft[tl_off - height] as i32;
    let right_v = _mm256_set1_epi32(right);
    let bottom_v = _mm256_set1_epi32(bottom);
    let round_v = _mm256_set1_epi32(256);
    let scale_v = _mm256_set1_epi32(256);
    // Picks the low byte of every 32-bit lane into the first 4 bytes of each
    // 128-bit half; index -1 zeroes the destination byte.
    let pack_mask = _mm256_setr_epi8(
        0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1,
    );
    let base = dst_base as isize;
    for y in 0..height {
        let start = (base + y as isize * stride) as usize;
        let left = topleft[tl_off - y - 1] as i32;
        let left_v = _mm256_set1_epi32(left);
        let wv = ver_weights[y] as i32;
        let wv_v = _mm256_set1_epi32(wv);
        let wv_inv_v = _mm256_sub_epi32(scale_v, wv_v);
        let mut col = 0;
        while col + 8 <= width {
            let top_at = tl_off + 1 + col;
            let top_bytes = partial_simd::mm_loadl_epi64::<[u8; 8]>(
                (&topleft[top_at..top_at + 8]).try_into().unwrap(),
            );
            let top_v = _mm256_cvtepu8_epi32(top_bytes);
            let wh_bytes = partial_simd::mm_loadl_epi64::<[u8; 8]>(
                (&hor_weights[col..col + 8]).try_into().unwrap(),
            );
            let wh_v = _mm256_cvtepu8_epi32(wh_bytes);
            let wh_inv_v = _mm256_sub_epi32(scale_v, wh_v);
            // Vertical and horizontal contributions, summed before rounding.
            let vertical = _mm256_add_epi32(
                _mm256_mullo_epi32(wv_v, top_v),
                _mm256_mullo_epi32(wv_inv_v, bottom_v),
            );
            let horizontal = _mm256_add_epi32(
                _mm256_mullo_epi32(wh_v, left_v),
                _mm256_mullo_epi32(wh_inv_v, right_v),
            );
            let blended = _mm256_add_epi32(vertical, horizontal);
            let shifted = _mm256_srai_epi32::<9>(_mm256_add_epi32(blended, round_v));
            // Narrow to bytes and merge the two halves into the low 8 bytes.
            let packed = _mm256_shuffle_epi8(shifted, pack_mask);
            let lower = _mm256_castsi256_si128(packed);
            let upper = _mm256_extracti128_si256::<1>(packed);
            let merged = _mm_unpacklo_epi32(lower, upper);
            partial_simd::mm_storel_epi64::<[u8; 8]>(
                (&mut dst[start + col..start + col + 8]).try_into().unwrap(),
                merged,
            );
            col += 8;
        }
        // Scalar tail for the columns the vector loop did not cover.
        let line = &mut dst[start..][..width];
        for i in col..width {
            let top = topleft[tl_off + 1 + i] as i32;
            let wh = hor_weights[i] as i32;
            let blended = wv * top + (256 - wv) * bottom + wh * left + (256 - wh) * right;
            line[i] = ((blended + 256) >> 9) as u8;
        }
    }
}
/// C-ABI entry point that forwards to [`ipred_smooth_8bpc_inner`].
///
/// Uses `dst_ptr`, `stride`, `topleft`, `width` and `height`; every other
/// parameter is accepted and ignored.
///
/// # Safety
///
/// NOTE(review): contract inferred from the body — confirm against callers.
/// `dst_ptr` must be valid for the number of bytes returned by
/// `compute_ipred_buf_len(stride, width, height)`, `topleft` must satisfy
/// whatever `compute_topleft_slice` requires, and the running CPU must provide
/// the features represented by `Desktop64`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_smooth_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let (w, h) = (width as usize, height as usize);
    let row_stride = stride as isize;
    // SAFETY: relies on the caller only dispatching here on a CPU that has the
    // features `Desktop64` stands for (this fn itself is gated on AVX2).
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let dst_len = compute_ipred_buf_len(row_stride, w, h);
    // SAFETY: relies on the caller guaranteeing `dst_ptr` is valid for `dst_len` bytes.
    let pixels = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    let (edge, edge_off) = compute_topleft_slice(topleft as *const u8, w, h);
    ipred_smooth_8bpc_inner(token, pixels, 0, row_stride, edge, edge_off, w, h);
}
/// Vertical weighted blend over a `width` x `height` block, 8 pixels per
/// iteration using 256-bit registers of 32-bit lanes.
///
/// For pixel `(x, y)` the value written is
/// `(w_v * top + (256 - w_v) * bottom + 128) >> 8` where
/// `top = topleft[tl_off + 1 + x]`, `bottom = topleft[tl_off - height]` and
/// `w_v = dav1d_sm_weights[height + y]`. Row `y` starts at index
/// `dst_base + y * stride`, evaluated in `isize`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_v_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let ver_weights = &dav1d_sm_weights[height..][..height];
    let bottom = topleft[tl_off - height] as i32;
    let bottom_v = _mm256_set1_epi32(bottom);
    let round_v = _mm256_set1_epi32(128);
    let scale_v = _mm256_set1_epi32(256);
    // Picks the low byte of every 32-bit lane into the first 4 bytes of each
    // 128-bit half; index -1 zeroes the destination byte.
    let pack_mask = _mm256_setr_epi8(
        0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1,
    );
    let base = dst_base as isize;
    for y in 0..height {
        let start = (base + y as isize * stride) as usize;
        let wv = ver_weights[y] as i32;
        let wv_v = _mm256_set1_epi32(wv);
        let wv_inv_v = _mm256_sub_epi32(scale_v, wv_v);
        let mut col = 0;
        while col + 8 <= width {
            let top_at = tl_off + 1 + col;
            let top_bytes = partial_simd::mm_loadl_epi64::<[u8; 8]>(
                (&topleft[top_at..top_at + 8]).try_into().unwrap(),
            );
            let top_v = _mm256_cvtepu8_epi32(top_bytes);
            let blended = _mm256_add_epi32(
                _mm256_mullo_epi32(wv_v, top_v),
                _mm256_mullo_epi32(wv_inv_v, bottom_v),
            );
            let shifted = _mm256_srai_epi32::<8>(_mm256_add_epi32(blended, round_v));
            // Narrow to bytes and merge the two halves into the low 8 bytes.
            let packed = _mm256_shuffle_epi8(shifted, pack_mask);
            let lower = _mm256_castsi256_si128(packed);
            let upper = _mm256_extracti128_si256::<1>(packed);
            let merged = _mm_unpacklo_epi32(lower, upper);
            partial_simd::mm_storel_epi64::<[u8; 8]>(
                (&mut dst[start + col..start + col + 8]).try_into().unwrap(),
                merged,
            );
            col += 8;
        }
        // Scalar tail for the columns the vector loop did not cover.
        let line = &mut dst[start..][..width];
        for i in col..width {
            let top = topleft[tl_off + 1 + i] as i32;
            let blended = wv * top + (256 - wv) * bottom;
            line[i] = ((blended + 128) >> 8) as u8;
        }
    }
}
/// C-ABI entry point for 8bpc vertical SMOOTH intra prediction, compiled with AVX2 enabled.
///
/// Rebuilds slices from the raw destination and edge pointers and forwards to
/// [`ipred_smooth_v_8bpc_inner`] with a destination base offset of 0. The `_angle`,
/// `_max_width`, `_max_height`, `_bitdepth_max`, `_topleft_off` and `_dst` arguments are
/// ignored (presumably kept so every predictor shares one signature).
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of
///   `compute_ipred_buf_len(stride, width, height)` bytes and must not be aliased for the
///   duration of the call.
/// * `topleft` must satisfy the requirements of [`compute_topleft_slice`] for `width` and
///   `height`.
/// * The CPU must support AVX2 and whatever else the `Desktop64` token stands for; no
///   runtime detection happens here.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_smooth_v_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    // SAFETY: the caller guarantees the required CPU features are available (see `# Safety`).
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let (w, h) = (width as usize, height as usize);
    let buf_len = compute_ipred_buf_len(stride, w, h);
    // SAFETY: the caller guarantees `dst_ptr` is valid for reads and writes of `buf_len`
    // bytes and is not aliased while this function runs.
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    // SAFETY: the caller guarantees `topleft` meets `compute_topleft_slice`'s requirements
    // for a `w` x `h` block.
    let (tl_sl, tl_off) = unsafe { compute_topleft_slice(topleft as *const u8, w, h) };
    ipred_smooth_v_8bpc_inner(token, dst_sl, 0, stride, tl_sl, tl_off, w, h);
}
/// Horizontal SMOOTH intra prediction for 8-bit pixels.
///
/// Every output pixel blends the left edge pixel of its row with the right-most top edge
/// pixel: `(w * left[y] + (256 - w) * right + 128) >> 8`, where `w` is the weight for the
/// column, taken from `dav1d_sm_weights[width + x]`.
///
/// * `dst`, `dst_base`, `stride`: row `y` of the block starts at `dst_base + y * stride`.
/// * `topleft`, `tl_off`: edge buffer; `topleft[tl_off - 1 - y]` is the pixel left of row
///   `y` and `topleft[tl_off + width]` is the right-most top pixel.
/// * `width`, `height`: block dimensions in pixels.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_h_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let col_weights = &dav1d_sm_weights[width..][..width];
    let right = topleft[tl_off + width] as i32;
    let right_v = _mm256_set1_epi32(right);
    let half_v = _mm256_set1_epi32(128);
    let one_v = _mm256_set1_epi32(256);
    // Moves the low byte of each 32-bit lane into the first four bytes of its 128-bit half.
    let low_bytes = _mm256_setr_epi8(
        0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1,
    );
    // Columns below this bound are produced eight at a time; the rest one by one.
    let simd_cols = width - width % 8;
    for y in 0..height {
        let row_start = (dst_base as isize + y as isize * stride) as usize;
        let left = topleft[tl_off - y - 1] as i32;
        let left_v = _mm256_set1_epi32(left);
        for x in (0..simd_cols).step_by(8) {
            let weights8 = partial_simd::mm_loadl_epi64::<[u8; 8]>(
                (&col_weights[x..x + 8]).try_into().unwrap(),
            );
            let weight_v = _mm256_cvtepu8_epi32(weights8);
            let inv_weight_v = _mm256_sub_epi32(one_v, weight_v);
            let blended = _mm256_add_epi32(
                _mm256_mullo_epi32(weight_v, left_v),
                _mm256_mullo_epi32(inv_weight_v, right_v),
            );
            let shifted = _mm256_srai_epi32::<8>(_mm256_add_epi32(blended, half_v));
            let bytes = _mm256_shuffle_epi8(shifted, low_bytes);
            let lower = _mm256_castsi256_si128(bytes);
            let upper = _mm256_extracti128_si256::<1>(bytes);
            // Interleaving the two 4-byte groups yields the eight result bytes in order.
            let out8 = _mm_unpacklo_epi32(lower, upper);
            let at = row_start + x;
            partial_simd::mm_storel_epi64::<[u8; 8]>(
                (&mut dst[at..at + 8]).try_into().unwrap(),
                out8,
            );
        }
        let row = &mut dst[row_start..][..width];
        for x in simd_cols..width {
            let weight = col_weights[x] as i32;
            let blended = weight * left + (256 - weight) * right;
            row[x] = ((blended + 128) >> 8) as u8;
        }
    }
}
/// C-ABI entry point for 8bpc horizontal SMOOTH intra prediction, compiled with AVX2 enabled.
///
/// Rebuilds slices from the raw destination and edge pointers and forwards to
/// [`ipred_smooth_h_8bpc_inner`] with a destination base offset of 0. The `_angle`,
/// `_max_width`, `_max_height`, `_bitdepth_max`, `_topleft_off` and `_dst` arguments are
/// ignored (presumably kept so every predictor shares one signature).
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of
///   `compute_ipred_buf_len(stride, width, height)` bytes and must not be aliased for the
///   duration of the call.
/// * `topleft` must satisfy the requirements of [`compute_topleft_slice`] for `width` and
///   `height`.
/// * The CPU must support AVX2 and whatever else the `Desktop64` token stands for; no
///   runtime detection happens here.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_smooth_h_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    // SAFETY: the caller guarantees the required CPU features are available (see `# Safety`).
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let (w, h) = (width as usize, height as usize);
    let buf_len = compute_ipred_buf_len(stride, w, h);
    // SAFETY: the caller guarantees `dst_ptr` is valid for reads and writes of `buf_len`
    // bytes and is not aliased while this function runs.
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    // SAFETY: the caller guarantees `topleft` meets `compute_topleft_slice`'s requirements
    // for a `w` x `h` block.
    let (tl_sl, tl_off) = unsafe { compute_topleft_slice(topleft as *const u8, w, h) };
    ipred_smooth_h_8bpc_inner(token, dst_sl, 0, stride, tl_sl, tl_off, w, h);
}
use crate::src::tables::{FLT_INCR, dav1d_dr_intra_derivative, dav1d_filter_intra_taps, filter_fn};
/// Filter intra prediction for 8-bit pixels.
///
/// The block is produced in 4x2 tiles. Each tile is computed from seven neighbours: `p0`
/// (the pixel diagonally above-left), `p1..p4` (the four pixels above) and `p5`, `p6` (the
/// two pixels to the left). Neighbours come from the `topleft` edge buffer on the block
/// border and from already written `dst` pixels elsewhere. Every output is
/// `clamp((filter_fn(taps, p) + 8) >> 4, 0, 255)`.
///
/// * `dst`, `dst_base`, `stride`: row `y` of the block starts at `dst_base + y * stride`.
/// * `topleft`, `tl_off`, `topleft_off`: edge buffer; the corner pixel is read from
///   `topleft[tl_off + topleft_off]`, top pixels follow it and left pixels precede it.
/// * `width`: rounded down to a multiple of 4; `height`: rows are produced two at a time.
/// * `filt_idx`: masked with 511 and used to select a row of `dav1d_filter_intra_taps`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_filter_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
    filt_idx: i32,
    topleft_off: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let width = (width / 4) * 4;
    let filt_idx = (filt_idx as usize) & 511;
    let filter = &dav1d_filter_intra_taps[filt_idx];
    for y in (0..height).step_by(2) {
        let cur_tl_off = topleft_off - y;
        // Corner neighbour of the first tile in this row pair; later tiles reuse the
        // previous tile's `p4`.
        let mut corner = topleft[tl_off.wrapping_add(cur_tl_off)] as i32;
        let upper_off = (dst_base as isize + y as isize * stride) as usize;
        let lower_off = (dst_base as isize + (y + 1) as isize * stride) as usize;
        for x in (0..width).step_by(4) {
            let (p1, p2, p3, p4) = if y == 0 {
                // First row pair: the pixels above come from the edge buffer.
                let above = tl_off.wrapping_add(topleft_off + 1 + x);
                (
                    topleft[above] as i32,
                    topleft[above + 1] as i32,
                    topleft[above + 2] as i32,
                    topleft[above + 3] as i32,
                )
            } else {
                // Otherwise they are the row written just above this row pair.
                let above = (dst_base as isize + (y as isize - 1) * stride) as usize + x;
                (
                    dst[above] as i32,
                    dst[above + 1] as i32,
                    dst[above + 2] as i32,
                    dst[above + 3] as i32,
                )
            };
            let (p5, p6) = if x == 0 {
                // First tile of the row pair: the left pixels come from the edge buffer.
                let beside = tl_off.wrapping_add(cur_tl_off.wrapping_sub(1));
                (
                    topleft[beside] as i32,
                    topleft[beside.wrapping_sub(1)] as i32,
                )
            } else {
                (dst[upper_off + x - 1] as i32, dst[lower_off + x - 1] as i32)
            };
            let p = [corner, p1, p2, p3, p4, p5, p6];
            let taps = filter.as_slice();
            let mut tap_off = 0;
            // Upper row first, then lower row; each pixel uses the next tap group.
            for out_row in [upper_off, lower_off] {
                for col in 0..4 {
                    let acc = filter_fn(&taps[tap_off..], p);
                    let val = ((acc + 8) >> 4).clamp(0, 255) as u8;
                    dst[out_row + x + col] = val;
                    tap_off += FLT_INCR;
                }
            }
            corner = p4;
        }
    }
}
/// C-ABI entry point for 8bpc filter intra prediction, compiled with AVX2 enabled.
///
/// Rebuilds slices from the raw destination and edge pointers and forwards to
/// [`ipred_filter_8bpc_inner`] with a destination base offset of 0, passing `filt_idx` and
/// `topleft_off` through. The `_max_width`, `_max_height`, `_bitdepth_max` and `_dst`
/// arguments are ignored.
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of
///   `compute_ipred_buf_len(stride, width, height)` bytes and must not be aliased for the
///   duration of the call.
/// * `topleft` must satisfy the requirements of [`compute_topleft_slice`] for `width` and
///   `height`.
/// * The CPU must support AVX2 and whatever else the `Desktop64` token stands for; no
///   runtime detection happens here.
///
/// NOTE(review): the inner function indexes the edge at `tl_off + topleft_off`, where
/// `tl_off` already locates `topleft` inside the rebuilt slice. Confirm that callers do not
/// pass a `topleft` pointer that was already advanced by `topleft_off`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_filter_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    filt_idx: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    // SAFETY: the caller guarantees the required CPU features are available (see `# Safety`).
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let (w, h) = (width as usize, height as usize);
    let buf_len = compute_ipred_buf_len(stride, w, h);
    // SAFETY: the caller guarantees `dst_ptr` is valid for reads and writes of `buf_len`
    // bytes and is not aliased while this function runs.
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    // SAFETY: the caller guarantees `topleft` meets `compute_topleft_slice`'s requirements
    // for a `w` x `h` block.
    let (tl_sl, tl_off) = unsafe { compute_topleft_slice(topleft as *const u8, w, h) };
    ipred_filter_8bpc_inner(
        token,
        dst_sl,
        0,
        stride,
        tl_sl,
        tl_off,
        w,
        h,
        filt_idx as i32,
        topleft_off,
    );
}
/// Directional intra prediction from the top edge for 8-bit pixels (AVX2 path).
///
/// Each row `y` samples the (optionally upsampled or filtered) top edge at the fixed-point
/// position `(y + 1) * dx` and linearly interpolates between the two neighbouring edge
/// pixels with 6-bit fractional weights: `(t0 * (64 - frac) + t1 * frac + 32) >> 6`.
///
/// * `dst`, `dst_base`, `stride`: row `y` of the block starts at `dst_base + y * stride`.
/// * `topleft`, `tl_off`: edge buffer; the top edge starts at `topleft[tl_off + 1]`.
/// * `angle`: packed value. Bits 0..=8 are the angle, bit 9 is the `is_sm` flag and any
///   bit from 10 upward enables edge filtering/upsampling.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_z1_8bpc_inner(
_token: Desktop64,
dst: &mut [u8],
dst_base: usize,
stride: isize,
topleft: &[u8],
tl_off: usize,
width: usize,
height: usize,
angle: i32,
) {
let mut dst = dst.flex_mut();
let width_i = width as i32;
let height_i = height as i32;
// Unpack the flag bits before masking the angle itself down to 9 bits.
let is_sm = (angle >> 9) & 1 != 0;
let enable_intra_edge_filter = (angle >> 10) != 0;
let angle = angle & 511;
let mut dx = dav1d_dr_intra_derivative[(angle >> 1) as usize] as i32;
// Upsampling applies only to small blocks (w + h <= 16, or 8 when `is_sm`) at steep angles.
let upsample_above = enable_intra_edge_filter
&& (90 - angle) < 40
&& ((width_i + height_i) as usize) <= (16 >> is_sm as usize);
// Scratch storage for the processed edge; unused when the raw edge is read directly.
let mut top_out = [0u8; 64 + 64];
let (top, max_base_x, base_inc);
if upsample_above {
upsample_edge_8bpc(
&mut top_out,
width_i + height_i,
topleft,
tl_off + 1,
-1,
width_i + std::cmp::min(width_i, height_i),
);
// The upsampled edge has twice the resolution, so the step doubles as well.
dx <<= 1;
top = top_out.as_slice();
max_base_x = (2 * (width_i + height_i) - 2) as usize;
base_inc = 2usize;
} else {
let filter_strength = if enable_intra_edge_filter {
get_filter_strength_simple(width_i + height_i, 90 - angle, is_sm)
} else {
0
};
if filter_strength != 0 {
filter_edge_8bpc(
&mut top_out,
width_i + height_i,
0,
width_i + height_i,
topleft,
tl_off + 1,
-1,
width_i + std::cmp::min(width_i, height_i),
filter_strength,
);
top = top_out.as_slice();
max_base_x = (width_i + height_i - 1) as usize;
} else {
// No filtering: read straight from the caller's edge buffer.
top = &topleft[tl_off + 1..];
max_base_x = width + std::cmp::min(width, height) - 1;
}
base_inc = 1;
};
let top = top.flex();
let rounding = _mm256_set1_epi16(32);
for y in 0..height_i {
// Fixed-point edge position for this row: integer part in bits 6 and up, and an even
// 6-bit fraction (mask 0x3e).
let xpos = (y + 1) * dx;
let frac = (xpos & 0x3e) as i16;
let inv_frac = (64 - frac) as i16;
let frac_vec = _mm256_set1_epi16(frac);
let inv_frac_vec = _mm256_set1_epi16(inv_frac);
let row_off = (dst_base as isize + y as isize * stride) as usize;
let base0 = (xpos >> 6) as usize;
let mut x = 0usize;
// The vector path handles only the non-upsampled case, where consecutive pixels read
// consecutive edge bytes.
if base_inc == 1 {
// 16 pixels per iteration, only while every lane stays below `max_base_x`.
while x + 16 <= width && base0 + x + 16 < max_base_x {
let base = base0 + x;
let t0 = loadu_128!((&top[base..base + 16]), [u8; 16]);
let t1 = loadu_128!((&top[base + 1..base + 17]), [u8; 16]);
let t0_w = _mm256_cvtepu8_epi16(t0);
let t1_w = _mm256_cvtepu8_epi16(t1);
let prod0 = _mm256_mullo_epi16(t0_w, inv_frac_vec);
let prod1 = _mm256_mullo_epi16(t1_w, frac_vec);
let sum = _mm256_add_epi16(_mm256_add_epi16(prod0, prod1), rounding);
let result = _mm256_srai_epi16::<6>(sum);
// packus works per 128-bit half, so the two halves are recombined below to get the
// 16 bytes in order.
let packed = _mm256_packus_epi16(result, result);
let lo = _mm256_castsi256_si128(packed);
let hi = _mm256_extracti128_si256::<1>(packed);
let combined = _mm_unpacklo_epi64(lo, hi);
storeu_128!(
(&mut dst[row_off + x..row_off + x + 16]),
[u8; 16],
combined
);
x += 16;
}
}
// Scalar remainder (and the whole row when upsampling).
while x < width {
let base = base0 + base_inc * x;
if base < max_base_x {
let t0 = top[base] as i32;
let t1 = top[base + 1] as i32;
let v = t0 * inv_frac as i32 + t1 * frac as i32;
dst[row_off + x] = ((v + 32) >> 6) as u8;
} else {
// Past the end of the edge: the rest of the row repeats the last edge pixel.
let fill_val = top[max_base_x];
for xx in x..width {
dst[row_off + xx] = fill_val;
}
break;
}
x += 1;
}
}
}
/// C-ABI entry point for 8bpc directional (top-edge) intra prediction, compiled with AVX2
/// enabled.
///
/// Rebuilds slices from the raw destination and edge pointers and forwards to
/// [`ipred_z1_8bpc_inner`] with a destination base offset of 0, passing the packed `angle`
/// through. The `_max_width`, `_max_height`, `_bitdepth_max`, `_topleft_off` and `_dst`
/// arguments are ignored.
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of
///   `compute_ipred_buf_len(stride, width, height)` bytes and must not be aliased for the
///   duration of the call.
/// * `topleft` must satisfy the requirements of [`compute_topleft_slice`] for `width` and
///   `height`.
/// * The CPU must support AVX2 and whatever else the `Desktop64` token stands for; no
///   runtime detection happens here.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_z1_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    // SAFETY: the caller guarantees the required CPU features are available (see `# Safety`).
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let (w, h) = (width as usize, height as usize);
    let buf_len = compute_ipred_buf_len(stride, w, h);
    // SAFETY: the caller guarantees `dst_ptr` is valid for reads and writes of `buf_len`
    // bytes and is not aliased while this function runs.
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    // SAFETY: the caller guarantees `topleft` meets `compute_topleft_slice`'s requirements
    // for a `w` x `h` block.
    let (tl_sl, tl_off) = unsafe { compute_topleft_slice(topleft as *const u8, w, h) };
    ipred_z1_8bpc_inner(token, dst_sl, 0, stride, tl_sl, tl_off, w, h, angle as i32);
}
/// Directional intra prediction from the top edge for 8-bit pixels, using 512-bit
/// intrinsics.
///
/// Edge preparation is the same as in the 256-bit variant. When the edge is not upsampled,
/// the prepared edge is copied into a 128-byte table and each group of 32 output pixels is
/// produced with byte table lookups, clamping indices to the last valid edge position.
/// The upsampled case uses a scalar loop.
///
/// * `dst`, `dst_base`, `stride`: row `y` of the block starts at `dst_base + y * stride`.
/// * `topleft`, `tl_off`: edge buffer; the top edge starts at `topleft[tl_off + 1]`.
/// * `angle`: packed value. Bits 0..=8 are the angle, bit 9 is the `is_sm` flag and any
///   bit from 10 upward enables edge filtering/upsampling.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_z1_8bpc_v4x_inner(
_token: X64V4xToken,
dst: &mut [u8],
dst_base: usize,
stride: isize,
topleft: &[u8],
tl_off: usize,
width: usize,
height: usize,
angle: i32,
) {
let mut dst = dst.flex_mut();
let width_i = width as i32;
let height_i = height as i32;
// Unpack the flag bits before masking the angle itself down to 9 bits.
let is_sm = (angle >> 9) & 1 != 0;
let enable_intra_edge_filter = (angle >> 10) != 0;
let angle = angle & 511;
let mut dx = dav1d_dr_intra_derivative[(angle >> 1) as usize] as i32;
let upsample_above = enable_intra_edge_filter
&& (90 - angle) < 40
&& ((width_i + height_i) as usize) <= (16 >> is_sm as usize);
let mut top_out = [0u8; 64 + 64];
let (top, max_base_x, base_inc);
if upsample_above {
upsample_edge_8bpc(
&mut top_out,
width_i + height_i,
topleft,
tl_off + 1,
-1,
width_i + std::cmp::min(width_i, height_i),
);
// The upsampled edge has twice the resolution, so the step doubles as well.
dx <<= 1;
top = top_out.as_slice();
max_base_x = (2 * (width_i + height_i) - 2) as usize;
base_inc = 2usize;
} else {
let filter_strength = if enable_intra_edge_filter {
get_filter_strength_simple(width_i + height_i, 90 - angle, is_sm)
} else {
0
};
if filter_strength != 0 {
filter_edge_8bpc(
&mut top_out,
width_i + height_i,
0,
width_i + height_i,
topleft,
tl_off + 1,
-1,
width_i + std::cmp::min(width_i, height_i),
filter_strength,
);
top = top_out.as_slice();
max_base_x = (width_i + height_i - 1) as usize;
} else {
// No filtering: read straight from the caller's edge buffer.
top = &topleft[tl_off + 1..];
max_base_x = width + std::cmp::min(width, height) - 1;
}
base_inc = 1;
};
// Build a 128-byte lookup table: the valid edge pixels first, then the last valid pixel
// repeated to the end.
let edge_len = (max_base_x + 1).min(128);
let mut ebuf = [0u8; 128];
let top_f = top.flex();
for i in 0..edge_len {
ebuf[i] = top_f[i];
}
let fill_val = top_f[max_base_x.min(127)];
for b in ebuf.iter_mut().skip(edge_len) {
*b = fill_val;
}
let edge_lo = loadu_512!((&ebuf[0..64]), [u8; 64]);
let edge_hi = loadu_512!((&ebuf[64..128]), [u8; 64]);
// Largest index a lane may look up; everything beyond reads the last edge pixel.
let max_idx8 = _mm512_set1_epi8((max_base_x.min(127)) as i8);
let rounding = _mm512_set1_epi16(32);
// Per-lane offsets 0, 1, 2, ... 63.
let lane_off: [u8; 64] = core::array::from_fn(|i| i as u8);
let lane_off_v = loadu_512!((&lane_off), [u8; 64]);
let one8 = _mm512_set1_epi8(1);
for y in 0..height_i {
// Fixed-point edge position for this row: integer part in bits 6 and up, and an even
// 6-bit fraction (mask 0x3e).
let xpos = (y + 1) * dx;
let frac = (xpos & 0x3e) as i16;
let inv_frac = (64 - frac) as i16;
let frac_vec = _mm512_set1_epi16(frac);
let inv_frac_vec = _mm512_set1_epi16(inv_frac);
let row_off = (dst_base as isize + y as isize * stride) as usize;
let base0 = (xpos >> 6) as usize;
if base_inc == 1 {
let base0_v = _mm512_set1_epi8(base0.min(127) as i8);
let mut x = 0usize;
while x < width {
let xbase = _mm512_set1_epi8(x.min(127) as i8);
// Index = base0 + x + lane, built with saturating unsigned adds and then clamped to
// `max_idx8` so out-of-range lanes read the last edge pixel.
let idx0 = _mm512_adds_epu8(_mm512_adds_epu8(base0_v, xbase), lane_off_v);
let idx0 = _mm512_min_epu8(idx0, max_idx8);
let idx1 = _mm512_min_epu8(_mm512_adds_epu8(idx0, one8), max_idx8);
// Byte lookups into the 128-byte table held in `edge_lo` and `edge_hi`.
let t0 = _mm512_permutex2var_epi8(edge_lo, idx0, edge_hi);
let t1 = _mm512_permutex2var_epi8(edge_lo, idx1, edge_hi);
// Only the low 32 lanes are widened, so each iteration yields 32 pixels.
let t0_lo = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(t0));
let t1_lo = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(t1));
let p0 = _mm512_mullo_epi16(t0_lo, inv_frac_vec);
let p1 = _mm512_mullo_epi16(t1_lo, frac_vec);
let sblend = _mm512_add_epi16(_mm512_add_epi16(p0, p1), rounding);
let r = _mm512_srai_epi16::<6>(sblend);
let out32 = _mm512_cvtusepi16_epi8(r);
// Store through a temporary so a final partial group writes only `n` pixels.
let n = (width - x).min(32);
let mut tmp = [0u8; 32];
storeu_256!((&mut tmp), [u8; 32], out32);
dst[row_off + x..row_off + x + n].copy_from_slice(&tmp[..n]);
x += 32;
}
} else {
// Upsampled edge: scalar interpolation, stepping two edge positions per pixel.
let mut x = 0usize;
while x < width {
let base = base0 + base_inc * x;
if base < max_base_x {
let t0 = top_f[base] as i32;
let t1 = top_f[base + 1] as i32;
let v = t0 * inv_frac as i32 + t1 * frac as i32;
dst[row_off + x] = ((v + 32) >> 6) as u8;
} else {
// Past the end of the edge: the rest of the row repeats the last edge pixel.
let fv = top_f[max_base_x];
for xx in x..width {
dst[row_off + xx] = fv;
}
break;
}
x += 1;
}
}
}
}
/// Returns the edge filter strength (0 = no filtering, otherwise 1..=3) for a block.
///
/// * `wh`: sum of block width and height; selects the size class
///   (`<= 8`, `<= 16`, `<= 24`, `<= 32` or larger).
/// * `angle`: angle delta; within a size class larger values give stronger filtering.
/// * `is_sm`: selects the alternative threshold table.
#[inline]
fn get_filter_strength_simple(wh: i32, angle: i32, is_sm: bool) -> i32 {
    if is_sm {
        return if wh <= 8 {
            if angle >= 64 {
                2
            } else {
                i32::from(angle >= 40)
            }
        } else if wh <= 16 {
            if angle >= 48 {
                2
            } else {
                i32::from(angle >= 20)
            }
        } else if wh <= 24 {
            if angle >= 4 {
                3
            } else {
                0
            }
        } else {
            3
        };
    }

    if wh <= 8 {
        i32::from(angle >= 56)
    } else if wh <= 16 {
        i32::from(angle >= 40)
    } else if wh <= 24 {
        if angle >= 32 {
            3
        } else if angle >= 16 {
            2
        } else {
            i32::from(angle >= 8)
        }
    } else if wh <= 32 {
        if angle >= 32 {
            3
        } else if angle >= 4 {
            2
        } else {
            1
        }
    } else {
        3
    }
}
/// Writes `sz` edge pixels to `out`, smoothing those in `lim_from..lim_to` with a 5-tap
/// kernel and copying the others unchanged.
///
/// * `out`: destination; indices `0..sz` are written.
/// * `inp`, `in_off`: source edge; logical position `p` is read from `inp[in_off + p]`.
/// * `from`, `to`: every source position is clamped to `from..=to - 1` before the read.
/// * `strength`: kernel selector, 1..=3. It is only consulted when at least one pixel is
///   actually filtered.
///
/// Filtered pixels are `(sum(tap * coef) + 8) >> 4`; each kernel's coefficients sum to 16.
fn filter_edge_8bpc(
    out: &mut [u8],
    sz: i32,
    lim_from: i32,
    lim_to: i32,
    inp: &[u8],
    in_off: usize,
    from: i32,
    to: i32,
    strength: i32,
) {
    static KERNEL: [[u8; 5]; 3] = [[0, 4, 8, 4, 0], [0, 5, 6, 5, 0], [2, 4, 4, 4, 2]];

    // Source pixel at a logical position, clamped into the valid range.
    let tap = |pos: i32| inp[in_off.wrapping_add_signed(pos.clamp(from, to - 1) as isize)];

    // Boundaries of the three consecutive output ranges: copy, filter, copy.
    let copy_end = std::cmp::min(sz, lim_from).max(0);
    let filter_end = std::cmp::min(lim_to, sz).max(copy_end);

    for i in 0..copy_end {
        out[i as usize] = tap(i);
    }
    for i in copy_end..filter_end {
        let acc: i32 = (0..5i32)
            .map(|j| tap(i - 2 + j) as i32 * KERNEL[(strength - 1) as usize][j as usize] as i32)
            .sum();
        out[i as usize] = ((acc + 8) >> 4) as u8;
    }
    for i in filter_end..sz {
        out[i as usize] = tap(i);
    }
}
/// Doubles the resolution of an edge: writes `2 * hsz - 1` pixels to `out`.
///
/// Even output positions are copies of the source pixels; odd positions are interpolated
/// from four neighbouring source pixels with the kernel `[-1, 9, 9, -1]`, rounded with
/// `(sum + 8) >> 4` and clamped to `0..=255`.
///
/// * `inp`, `in_off`: source edge; logical position `p` is read from `inp[in_off + p]`.
/// * `from`, `to`: every source position is clamped to `from..=to - 1` before the read.
fn upsample_edge_8bpc(out: &mut [u8], hsz: i32, inp: &[u8], in_off: usize, from: i32, to: i32) {
    const TAPS: [i32; 4] = [-1, 9, 9, -1];

    // Source pixel at a logical position, clamped into the valid range.
    let sample = |pos: i32| inp[in_off.wrapping_add_signed(pos.clamp(from, to - 1) as isize)];

    let last = hsz - 1;
    for i in 0..last {
        out[(i * 2) as usize] = sample(i);
        let mut acc = 0i32;
        for (j, coef) in (0i32..).zip(TAPS) {
            acc += sample(i + j - 1) as i32 * coef;
        }
        out[(i * 2 + 1) as usize] = ((acc + 8) >> 4).clamp(0, 255) as u8;
    }
    // The final source pixel has no right-hand interpolated neighbour.
    out[(last * 2) as usize] = sample(last);
}
/// Directional intra prediction that reads both the top and the left edge, for 8-bit
/// pixels (AVX2 path).
///
/// Both edges are first copied (optionally upsampled or filtered) into one local buffer
/// with the corner pixel at index `edge_tl`: top pixels follow it, left pixels precede it.
/// For every row, the leading pixels whose top-edge position would be negative are
/// interpolated from the left edge; the remaining pixels are interpolated from the top
/// edge. Interpolation is `(a * (64 - frac) + b * frac + 32) >> 6`.
///
/// * `dst`, `dst_base`, `stride`: row `y` of the block starts at `dst_base + y * stride`.
/// * `topleft`, `tl_off`: edge buffer; `topleft[tl_off]` is the corner pixel.
/// * `angle`: packed value. Bits 0..=8 are the angle, bit 9 is the `is_sm` flag and any
///   bit from 10 upward enables edge filtering/upsampling.
/// * `max_width`, `max_height`: passed to the edge filter as limits on the filtered range.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_z2_8bpc_inner(
_token: Desktop64,
dst: &mut [u8],
dst_base: usize,
stride: isize,
topleft: &[u8],
tl_off: usize,
width: usize,
height: usize,
angle: i32,
max_width: i32,
max_height: i32,
) {
let mut dst = dst.flex_mut();
let width_i = width as i32;
let height_i = height as i32;
// Unpack the flag bits before masking the angle itself down to 9 bits.
let is_sm = (angle >> 9) & 1 != 0;
let enable_intra_edge_filter = (angle >> 10) != 0;
let angle = angle & 511;
// Separate step sizes for walking the left edge (`dy`) and the top edge (`dx`).
let mut dy = dav1d_dr_intra_derivative[((angle - 90) >> 1) as usize] as i32;
let mut dx = dav1d_dr_intra_derivative[((180 - angle) >> 1) as usize] as i32;
let upsample_left = enable_intra_edge_filter
&& (180 - angle) < 40
&& (width_i + height_i) <= (16 >> is_sm as i32);
let upsample_above = enable_intra_edge_filter
&& (angle - 90) < 40
&& (width_i + height_i) <= (16 >> is_sm as i32);
// Combined edge: left pixels below index 64, corner at 64, top pixels above it.
let mut edge = [0u8; 64 + 64 + 1];
let edge_tl = 64usize;
// Top edge: upsample, filter or plain copy.
if upsample_above {
upsample_edge_8bpc(
&mut edge[edge_tl..],
width_i + 1,
topleft,
tl_off,
0,
width_i + 1,
);
dx <<= 1;
} else {
let filter_strength = if enable_intra_edge_filter {
get_filter_strength_simple(width_i + height_i, angle - 90, is_sm)
} else {
0
};
if filter_strength != 0 {
filter_edge_8bpc(
&mut edge[edge_tl + 1..],
width_i,
0,
max_width,
topleft,
tl_off + 1,
-1,
width_i,
filter_strength,
);
} else {
edge[edge_tl + 1..edge_tl + 1 + width]
.copy_from_slice(&topleft[tl_off + 1..tl_off + 1 + width]);
}
}
// Left edge: upsample, filter or plain copy.
if upsample_left {
upsample_edge_8bpc(
&mut edge[edge_tl - height * 2..],
height_i + 1,
topleft,
tl_off.wrapping_sub(height),
0,
height_i + 1,
);
dy <<= 1;
} else {
let filter_strength = if enable_intra_edge_filter {
get_filter_strength_simple(width_i + height_i, 180 - angle, is_sm)
} else {
0
};
if filter_strength != 0 {
filter_edge_8bpc(
&mut edge[edge_tl - height..],
height_i,
height_i - max_height,
height_i,
topleft,
tl_off.wrapping_sub(height),
0,
height_i + 1,
filter_strength,
);
} else {
edge[edge_tl - height..edge_tl].copy_from_slice(&topleft[tl_off - height..tl_off]);
}
}
// The corner pixel is always taken unmodified from the caller's edge.
edge[edge_tl] = topleft[tl_off];
let edge = edge.as_slice().flex();
let base_inc_x = 1 + upsample_above as usize;
// Index of the first left-edge pixel (two below the corner when the left edge is upsampled).
let left = edge_tl - (1 + upsample_left as usize);
let rounding = _mm256_set1_epi16(32);
for y in 0..height_i {
// Top-edge position of the row's first pixel; it moves left by `dx` per row.
let xpos = ((1 + upsample_above as i32) << 6) - dx * (y + 1);
let base_x0 = xpos >> 6;
let frac_x = (xpos & 0x3e) as i16;
let inv_frac_x = (64 - frac_x) as i16;
let row_off = (dst_base as isize + y as isize * stride) as usize;
// Number of leading pixels whose top-edge position is negative (capped at `width`).
let left_count = if base_x0 >= 0 {
0usize
} else {
let needed = (-base_x0) as usize;
needed.div_ceil(base_inc_x).min(width)
};
let mut x = 0usize;
// Pixels predicted from the left edge.
while x < left_count {
let ypos = (y << (6 + upsample_left as i32)) - dy * (x as i32 + 1);
let base_y = ypos >> 6;
let frac_y = ypos & 0x3e;
let inv_frac_y = 64 - frac_y;
let l0_idx = left.wrapping_add_signed(-base_y as isize);
let l1_idx = left.wrapping_add_signed(-(base_y + 1) as isize);
let l0 = edge[l0_idx] as i32;
let l1 = edge[l1_idx] as i32;
let v = l0 * inv_frac_y + l1 * frac_y;
dst[row_off + x] = ((v + 32) >> 6) as u8;
x += 1;
}
// Vector path for the top edge, 16 pixels at a time, only without top upsampling.
if base_inc_x == 1 {
while x + 16 <= width {
let base_x = (base_x0 + x as i32) as usize;
let idx = edge_tl + base_x;
// Leave the remaining pixels to the scalar loop if the 17-byte read would run past
// the edge buffer.
if idx + 17 > edge.len() {
break;
}
let t0 = loadu_128!((&edge[idx..idx + 16]), [u8; 16]);
let t1 = loadu_128!((&edge[idx + 1..idx + 17]), [u8; 16]);
let t0_w = _mm256_cvtepu8_epi16(t0);
let t1_w = _mm256_cvtepu8_epi16(t1);
let frac_vec = _mm256_set1_epi16(frac_x);
let inv_frac_vec = _mm256_set1_epi16(inv_frac_x);
let prod0 = _mm256_mullo_epi16(t0_w, inv_frac_vec);
let prod1 = _mm256_mullo_epi16(t1_w, frac_vec);
let sum = _mm256_add_epi16(_mm256_add_epi16(prod0, prod1), rounding);
let result = _mm256_srai_epi16::<6>(sum);
// packus works per 128-bit half, so the two halves are recombined below to get the
// 16 bytes in order.
let packed = _mm256_packus_epi16(result, result);
let lo = _mm256_castsi256_si128(packed);
let hi = _mm256_extracti128_si256::<1>(packed);
let combined = _mm_unpacklo_epi64(lo, hi);
storeu_128!(
(&mut dst[row_off + x..row_off + x + 16]),
[u8; 16],
combined
);
x += 16;
}
}
// Scalar remainder from the top edge.
while x < width {
let base_x = (base_x0 + (base_inc_x * x) as i32) as usize;
let idx = edge_tl + base_x;
// NOTE(review): stopping here leaves the rest of the row unwritten; presumably
// unreachable for valid block sizes - confirm.
if idx + 2 > edge.len() {
break;
}
let t0 = edge[idx] as i32;
let t1 = edge[idx + 1] as i32;
let v = t0 * inv_frac_x as i32 + t1 * frac_x as i32;
dst[row_off + x] = ((v + 32) >> 6) as u8;
x += 1;
}
}
}
/// Directional intra prediction that reads both the top and the left edge, for 8-bit
/// pixels, using 512-bit intrinsics.
///
/// Edge preparation and the left-edge pixels are handled as in the 256-bit variant. The
/// top-edge part of each row is produced 32 pixels at a time with byte table lookups into
/// a copy of the top edge; whatever the vector loop does not cover falls through to the
/// scalar loop.
///
/// * `dst`, `dst_base`, `stride`: row `y` of the block starts at `dst_base + y * stride`.
/// * `topleft`, `tl_off`: edge buffer; `topleft[tl_off]` is the corner pixel.
/// * `angle`: packed value. Bits 0..=8 are the angle, bit 9 is the `is_sm` flag and any
///   bit from 10 upward enables edge filtering/upsampling.
/// * `max_width`, `max_height`: passed to the edge filter as limits on the filtered range.
#[cfg(target_arch = "x86_64")]
#[arcane]
#[allow(clippy::too_many_arguments)]
fn ipred_z2_8bpc_v4x_inner(
_token: X64V4xToken,
dst: &mut [u8],
dst_base: usize,
stride: isize,
topleft: &[u8],
tl_off: usize,
width: usize,
height: usize,
angle: i32,
max_width: i32,
max_height: i32,
) {
let mut dst = dst.flex_mut();
let width_i = width as i32;
let height_i = height as i32;
// Unpack the flag bits before masking the angle itself down to 9 bits.
let is_sm = (angle >> 9) & 1 != 0;
let enable_intra_edge_filter = (angle >> 10) != 0;
let angle = angle & 511;
// Separate step sizes for walking the left edge (`dy`) and the top edge (`dx`).
let mut dy = dav1d_dr_intra_derivative[((angle - 90) >> 1) as usize] as i32;
let mut dx = dav1d_dr_intra_derivative[((180 - angle) >> 1) as usize] as i32;
let upsample_left = enable_intra_edge_filter
&& (180 - angle) < 40
&& (width_i + height_i) <= (16 >> is_sm as i32);
let upsample_above = enable_intra_edge_filter
&& (angle - 90) < 40
&& (width_i + height_i) <= (16 >> is_sm as i32);
// Combined edge: left pixels below index 64, corner at 64, top pixels above it.
let mut edge = [0u8; 64 + 64 + 1];
let edge_tl = 64usize;
// Top edge: upsample, filter or plain copy.
if upsample_above {
upsample_edge_8bpc(
&mut edge[edge_tl..],
width_i + 1,
topleft,
tl_off,
0,
width_i + 1,
);
dx <<= 1;
} else {
let filter_strength = if enable_intra_edge_filter {
get_filter_strength_simple(width_i + height_i, angle - 90, is_sm)
} else {
0
};
if filter_strength != 0 {
filter_edge_8bpc(
&mut edge[edge_tl + 1..],
width_i,
0,
max_width,
topleft,
tl_off + 1,
-1,
width_i,
filter_strength,
);
} else {
edge[edge_tl + 1..edge_tl + 1 + width]
.copy_from_slice(&topleft[tl_off + 1..tl_off + 1 + width]);
}
}
// Left edge: upsample, filter or plain copy.
if upsample_left {
upsample_edge_8bpc(
&mut edge[edge_tl - height * 2..],
height_i + 1,
topleft,
tl_off.wrapping_sub(height),
0,
height_i + 1,
);
dy <<= 1;
} else {
let filter_strength = if enable_intra_edge_filter {
get_filter_strength_simple(width_i + height_i, 180 - angle, is_sm)
} else {
0
};
if filter_strength != 0 {
filter_edge_8bpc(
&mut edge[edge_tl - height..],
height_i,
height_i - max_height,
height_i,
topleft,
tl_off.wrapping_sub(height),
0,
height_i + 1,
filter_strength,
);
} else {
edge[edge_tl - height..edge_tl].copy_from_slice(&topleft[tl_off - height..tl_off]);
}
}
// The corner pixel is always taken unmodified from the caller's edge.
edge[edge_tl] = topleft[tl_off];
let edge_len = edge.len();
let edge = edge.as_slice().flex();
let base_inc_x = 1 + upsample_above as usize;
// Index of the first left-edge pixel (two below the corner when the left edge is upsampled).
let left = edge_tl - (1 + upsample_left as usize);
// Copy the corner and top pixels (`edge[edge_tl..]`) to the start of a 128-byte lookup
// table; the unused tail of the table stays zero.
let top_k_max = edge_len - 1 - edge_tl; let mut tbuf = [0u8; 128];
for k in 0..=top_k_max {
tbuf[k] = edge[edge_tl + k];
}
let top_lo = loadu_512!((&tbuf[0..64]), [u8; 64]);
let top_hi = loadu_512!((&tbuf[64..128]), [u8; 64]);
// Per-lane offsets 0, 1, 2, ... 63.
let lane_off: [u8; 64] = core::array::from_fn(|i| i as u8);
let lane_off_v = loadu_512!((&lane_off), [u8; 64]);
let one8 = _mm512_set1_epi8(1);
let rounding512 = _mm512_set1_epi16(32);
for y in 0..height_i {
// Top-edge position of the row's first pixel; it moves left by `dx` per row.
let xpos = ((1 + upsample_above as i32) << 6) - dx * (y + 1);
let base_x0 = xpos >> 6;
let frac_x = (xpos & 0x3e) as i16;
let inv_frac_x = (64 - frac_x) as i16;
let row_off = (dst_base as isize + y as isize * stride) as usize;
// Number of leading pixels whose top-edge position is negative (capped at `width`).
let left_count = if base_x0 >= 0 {
0usize
} else {
let needed = (-base_x0) as usize;
needed.div_ceil(base_inc_x).min(width)
};
let mut x = 0usize;
// Pixels predicted from the left edge.
while x < left_count {
let ypos = (y << (6 + upsample_left as i32)) - dy * (x as i32 + 1);
let base_y = ypos >> 6;
let frac_y = ypos & 0x3e;
let inv_frac_y = 64 - frac_y;
let l0_idx = left.wrapping_add_signed(-base_y as isize);
let l1_idx = left.wrapping_add_signed(-(base_y + 1) as isize);
let l0 = edge[l0_idx] as i32;
let l1 = edge[l1_idx] as i32;
let v = l0 * inv_frac_y + l1 * frac_y;
dst[row_off + x] = ((v + 32) >> 6) as u8;
x += 1;
}
// Vector path for the top edge, 32 pixels at a time, only without top upsampling.
if base_inc_x == 1 {
let frac_vec = _mm512_set1_epi16(frac_x);
let inv_frac_vec = _mm512_set1_epi16(inv_frac_x);
while x + 32 <= width {
// Leave the remaining pixels to the scalar loop if the highest lookup index
// (base_x + 31 + 1) would fall beyond the copied top edge.
let base_x = base_x0 + x as i32; if (base_x as usize) + 31 + 1 > top_k_max {
break;
}
let k0 = _mm512_set1_epi8((base_x as usize).min(127) as i8);
let idx0 = _mm512_adds_epu8(k0, lane_off_v);
let idx1 = _mm512_adds_epu8(idx0, one8);
// Byte lookups into the 128-byte table held in `top_lo` and `top_hi`.
let t0 = _mm512_permutex2var_epi8(top_lo, idx0, top_hi);
let t1 = _mm512_permutex2var_epi8(top_lo, idx1, top_hi);
// Only the low 32 lanes are widened, so each iteration yields 32 pixels.
let t0_lo = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(t0));
let t1_lo = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(t1));
let p0 = _mm512_mullo_epi16(t0_lo, inv_frac_vec);
let p1 = _mm512_mullo_epi16(t1_lo, frac_vec);
let sblend = _mm512_add_epi16(_mm512_add_epi16(p0, p1), rounding512);
let r = _mm512_srai_epi16::<6>(sblend);
let out32 = _mm512_cvtusepi16_epi8(r);
let mut tmp = [0u8; 32];
storeu_256!((&mut tmp), [u8; 32], out32);
dst[row_off + x..row_off + x + 32].copy_from_slice(&tmp);
x += 32;
}
}
// Scalar remainder from the top edge.
while x < width {
let base_x = (base_x0 + (base_inc_x * x) as i32) as usize;
let idx = edge_tl + base_x;
// NOTE(review): stopping here leaves the rest of the row unwritten; presumably
// unreachable for valid block sizes - confirm.
if idx + 2 > edge_len {
break;
}
let t0 = edge[idx] as i32;
let t1 = edge[idx + 1] as i32;
let v = t0 * inv_frac_x as i32 + t1 * frac_x as i32;
dst[row_off + x] = ((v + 32) >> 6) as u8;
x += 1;
}
}
}
/// C-ABI entry point for 8bpc directional (top and left edge) intra prediction, compiled
/// with AVX2 enabled.
///
/// Rebuilds slices from the raw destination and edge pointers and forwards to
/// [`ipred_z2_8bpc_inner`] with a destination base offset of 0, passing the packed `angle`,
/// `max_width` and `max_height` through. The `_bitdepth_max`, `_topleft_off` and `_dst`
/// arguments are ignored.
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of
///   `compute_ipred_buf_len(stride, width, height)` bytes and must not be aliased for the
///   duration of the call.
/// * `topleft` must satisfy the requirements of [`compute_topleft_slice`] for `width` and
///   `height`.
/// * The CPU must support AVX2 and whatever else the `Desktop64` token stands for; no
///   runtime detection happens here.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_z2_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    angle: c_int,
    max_width: c_int,
    max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    // SAFETY: the caller guarantees the required CPU features are available (see `# Safety`).
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let (w, h) = (width as usize, height as usize);
    let buf_len = compute_ipred_buf_len(stride, w, h);
    // SAFETY: the caller guarantees `dst_ptr` is valid for reads and writes of `buf_len`
    // bytes and is not aliased while this function runs.
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    // SAFETY: the caller guarantees `topleft` meets `compute_topleft_slice`'s requirements
    // for a `w` x `h` block.
    let (tl_sl, tl_off) = unsafe { compute_topleft_slice(topleft as *const u8, w, h) };
    ipred_z2_8bpc_inner(
        token,
        dst_sl,
        0,
        stride,
        tl_sl,
        tl_off,
        w,
        h,
        angle as i32,
        max_width as i32,
        max_height as i32,
    );
}
/// Directional intra prediction from the left edge for 8-bit pixels (scalar loops).
///
/// Each column `x` samples the (optionally upsampled or filtered) left edge at the
/// fixed-point position `dy * (x + 1)` and walks down the edge one step per row,
/// interpolating with `(l0 * (64 - frac) + l1 * frac + 32) >> 6`. Once the edge position
/// reaches `max_base_y`, the rest of the column is filled with the last edge pixel.
///
/// * `dst`, `dst_base`, `stride`: pixel `(x, y)` lives at `dst_base + y * stride + x`.
/// * `topleft`, `tl_off`: edge buffer; the left edge is read at indices below `tl_off`.
/// * `angle`: packed value. Bits 0..=8 are the angle, bit 9 is the `is_sm` flag and any
///   bit from 10 upward enables edge filtering/upsampling.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_z3_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
    angle: i32,
) {
    let mut dst = dst.flex_mut();
    let width_i = width as i32;
    let height_i = height as i32;
    let is_sm = (angle >> 9) & 1 != 0;
    let enable_intra_edge_filter = (angle >> 10) != 0;
    let angle = angle & 511;
    let mut dy = dav1d_dr_intra_derivative[((270 - angle) >> 1) as usize] as usize;
    let upsample_left = enable_intra_edge_filter
        && (angle - 180) < 40
        && (width_i + height_i) <= (16 >> is_sm as i32);
    // Scratch storage for the processed edge; unused when the raw edge is read directly.
    let mut scratch = [0u8; 64 + 64];
    // `edge[origin - k]` is the k-th left pixel counted from the top; `limit` is the last
    // usable `k`; `step` is how far `k` advances per row.
    let (edge, origin, limit, step): (&[u8], usize, usize, usize) = if upsample_left {
        upsample_edge_8bpc(
            &mut scratch,
            width_i + height_i,
            topleft,
            tl_off - (width + height),
            std::cmp::max(width_i - height_i, 0),
            width_i + height_i + 1,
        );
        let last = (2 * (width_i + height_i) - 2) as usize;
        // The upsampled edge has twice the resolution, so the step doubles as well.
        dy <<= 1;
        (scratch.as_slice(), last, last, 2)
    } else {
        let strength = if enable_intra_edge_filter {
            get_filter_strength_simple(width_i + height_i, angle - 180, is_sm)
        } else {
            0
        };
        if strength != 0 {
            filter_edge_8bpc(
                &mut scratch,
                width_i + height_i,
                0,
                width_i + height_i,
                topleft,
                tl_off - (width + height),
                std::cmp::max(width_i - height_i, 0),
                width_i + height_i + 1,
                strength,
            );
            let last = (width_i + height_i - 1) as usize;
            (scratch.as_slice(), last, last, 1)
        } else {
            // No filtering: read straight from the caller's edge buffer.
            (
                topleft,
                tl_off - 1,
                height + std::cmp::min(width, height) - 1,
                1,
            )
        }
    };
    let edge = edge.flex();
    for x in 0..width {
        let ypos = dy * (x + 1);
        let frac = (ypos & 0x3e) as i32;
        let inv_frac = 64 - frac;
        // Offset of this column's pixel in a given row.
        let pixel_at = |row: i32| (dst_base as isize + row as isize * stride) as usize + x;
        for y in 0..height_i {
            let base = (ypos >> 6) + step * y as usize;
            if base >= limit {
                // Past the end of the edge: the rest of the column repeats the last pixel.
                let fill = edge[origin - limit];
                for yy in y..height_i {
                    dst[pixel_at(yy)] = fill;
                }
                break;
            }
            let l0 = edge[origin - base] as i32;
            let l1 = edge[origin - base - 1] as i32;
            let v = l0 * inv_frac + l1 * frac;
            dst[pixel_at(y)] = ((v + 32) >> 6) as u8;
        }
    }
}
/// Directional intra prediction from the left edge for 8-bit pixels, using 512-bit
/// intrinsics.
///
/// Edge preparation is the same as in the scalar variant. When the edge is not upsampled,
/// the prepared edge is copied (top-most pixel first) into a 128-byte table and each
/// column is produced 32 rows at a time with byte table lookups, clamping indices to the
/// last valid edge position. The upsampled case uses the scalar loops.
///
/// * `dst`, `dst_base`, `stride`: pixel `(x, y)` lives at `dst_base + y * stride + x`.
/// * `topleft`, `tl_off`: edge buffer; the left edge is read at indices below `tl_off`.
/// * `angle`: packed value. Bits 0..=8 are the angle, bit 9 is the `is_sm` flag and any
///   bit from 10 upward enables edge filtering/upsampling.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_z3_8bpc_v4x_inner(
_token: X64V4xToken,
dst: &mut [u8],
dst_base: usize,
stride: isize,
topleft: &[u8],
tl_off: usize,
width: usize,
height: usize,
angle: i32,
) {
let mut dst = dst.flex_mut();
let width_i = width as i32;
let height_i = height as i32;
// Unpack the flag bits before masking the angle itself down to 9 bits.
let is_sm = (angle >> 9) & 1 != 0;
let enable_intra_edge_filter = (angle >> 10) != 0;
let angle = angle & 511;
let mut dy = dav1d_dr_intra_derivative[((270 - angle) >> 1) as usize] as usize;
let upsample_left = enable_intra_edge_filter
&& (angle - 180) < 40
&& (width_i + height_i) <= (16 >> is_sm as i32);
let mut left_out = [0u8; 64 + 64];
// `left[left_off - k]` is the k-th left pixel counted from the top; `max_base_y` is the
// last usable `k`; `base_inc` is how far `k` advances per row.
let (left, left_off, max_base_y, base_inc);
if upsample_left {
upsample_edge_8bpc(
&mut left_out,
width_i + height_i,
topleft,
tl_off - (width + height),
std::cmp::max(width_i - height_i, 0),
width_i + height_i + 1,
);
left_off = (2 * (width_i + height_i) - 2) as usize;
max_base_y = left_off;
// The upsampled edge has twice the resolution, so the step doubles as well.
dy <<= 1;
base_inc = 2usize;
left = left_out.as_slice();
} else {
let filter_strength = if enable_intra_edge_filter {
get_filter_strength_simple(width_i + height_i, angle - 180, is_sm)
} else {
0
};
if filter_strength != 0 {
filter_edge_8bpc(
&mut left_out,
width_i + height_i,
0,
width_i + height_i,
topleft,
tl_off - (width + height),
std::cmp::max(width_i - height_i, 0),
width_i + height_i + 1,
filter_strength,
);
left_off = (width_i + height_i - 1) as usize;
max_base_y = left_off;
left = left_out.as_slice();
} else {
// No filtering: read straight from the caller's edge buffer.
left = topleft;
left_off = tl_off - 1;
max_base_y = height + std::cmp::min(width, height) - 1;
}
base_inc = 1;
};
let left_f = left.flex();
// Build a 128-byte lookup table in walking order (index k = k-th pixel from the top),
// padded with the last valid pixel. It is built unconditionally but only the
// `base_inc == 1` branch below reads it.
let last = max_base_y.min(127);
let mut lbuf = [0u8; 128];
for k in 0..=last {
lbuf[k] = left_f[left_off - k];
}
let fill_val = lbuf[last];
for b in lbuf.iter_mut().skip(last + 1) {
*b = fill_val;
}
let edge_lo = loadu_512!((&lbuf[0..64]), [u8; 64]);
let edge_hi = loadu_512!((&lbuf[64..128]), [u8; 64]);
// Largest index a lane may look up; everything beyond reads the last edge pixel.
let max_idx8 = _mm512_set1_epi8(last as i8);
let rounding = _mm512_set1_epi16(32);
// Per-lane offsets 0, 1, 2, ... 63.
let lane_off: [u8; 64] = core::array::from_fn(|i| i as u8);
let lane_off_v = loadu_512!((&lane_off), [u8; 64]);
let one8 = _mm512_set1_epi8(1);
if base_inc == 1 {
for x in 0..width {
// Fixed-point edge position for this column: integer part in bits 6 and up, and an
// even 6-bit fraction (mask 0x3e).
let ypos = dy * (x + 1);
let frac = (ypos & 0x3e) as i16;
let inv_frac = (64 - frac) as i16;
let frac_vec = _mm512_set1_epi16(frac);
let inv_frac_vec = _mm512_set1_epi16(inv_frac);
let base0 = ypos >> 6;
let base0_v = _mm512_set1_epi8(base0.min(127) as i8);
let mut y = 0usize;
while y < height {
let ybase = _mm512_set1_epi8(y.min(127) as i8);
// Index = base0 + y + lane, built with saturating unsigned adds and then clamped to
// `max_idx8` so out-of-range lanes read the last edge pixel.
let idx0 = _mm512_adds_epu8(_mm512_adds_epu8(base0_v, ybase), lane_off_v);
let idx0 = _mm512_min_epu8(idx0, max_idx8);
let idx1 = _mm512_min_epu8(_mm512_adds_epu8(idx0, one8), max_idx8);
// Byte lookups into the 128-byte table held in `edge_lo` and `edge_hi`.
let l0 = _mm512_permutex2var_epi8(edge_lo, idx0, edge_hi);
let l1 = _mm512_permutex2var_epi8(edge_lo, idx1, edge_hi);
// Only the low 32 lanes are widened, so each iteration yields 32 rows.
let l0_lo = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(l0));
let l1_lo = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(l1));
let p0 = _mm512_mullo_epi16(l0_lo, inv_frac_vec);
let p1 = _mm512_mullo_epi16(l1_lo, frac_vec);
let sblend = _mm512_add_epi16(_mm512_add_epi16(p0, p1), rounding);
let r = _mm512_srai_epi16::<6>(sblend);
let out32 = _mm512_cvtusepi16_epi8(r);
// The results belong to one column, so they are scattered row by row.
let n = (height - y).min(32);
let mut tmp = [0u8; 32];
storeu_256!((&mut tmp), [u8; 32], out32);
for k in 0..n {
let off = (dst_base as isize + (y + k) as isize * stride) as usize + x;
dst[off] = tmp[k];
}
y += 32;
}
}
} else {
// Upsampled edge: scalar interpolation, stepping two edge positions per row.
for x in 0..width {
let ypos = dy * (x + 1);
let frac = (ypos & 0x3e) as i32;
let inv_frac = 64 - frac;
for y in 0..height_i {
let base = (ypos >> 6) + base_inc * y as usize;
if base < max_base_y {
let l0 = left_f[left_off - base] as i32;
let l1 = left_f[left_off - base - 1] as i32;
let v = l0 * inv_frac + l1 * frac;
let pixel_off = (dst_base as isize + y as isize * stride) as usize + x;
dst[pixel_off] = ((v + 32) >> 6) as u8;
} else {
// Past the end of the edge: the rest of the column repeats the last edge pixel.
let fv = left_f[left_off - max_base_y];
for yy in y..height_i {
let pixel_off = (dst_base as isize + yy as isize * stride) as usize + x;
dst[pixel_off] = fv;
}
break;
}
}
}
}
}
/// `extern "C"` entry point for the 8 bpc `z3` predictor, compiled with AVX2 enabled.
///
/// Forges a `Desktop64` token, views `dst_ptr` and `topleft` as byte slices and
/// forwards to `ipred_z3_8bpc_inner` with a destination base offset of 0.
/// Parameters whose names start with `_` are ignored.
///
/// # Safety
///
/// Nothing is validated here. The caller must ensure that
/// * `dst_ptr` is valid for writes of
///   `compute_ipred_buf_len(stride, width, height)` bytes,
/// * `topleft` satisfies the requirements of `compute_topleft_slice` for this
///   `width` / `height`, and
/// * forging a `Desktop64` token is legitimate on the running CPU.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_z3_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let (w, h) = (width as usize, height as usize);
    let byte_stride = stride as isize;
    // SAFETY: delegated to the caller, see `# Safety`.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let dst_len = compute_ipred_buf_len(byte_stride, w, h);
    // SAFETY: delegated to the caller, see `# Safety`.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    // SAFETY: delegated to the caller, see `# Safety`.
    let (edge, edge_off) = unsafe { compute_topleft_slice(topleft as *const u8, w, h) };
    ipred_z3_8bpc_inner(
        token,
        dst_bytes,
        0,
        byte_stride,
        edge,
        edge_off,
        w,
        h,
        angle as i32,
    );
}
/// Length of the destination slice handed to the predictors.
///
/// The result is `(height - 1) * |stride| + width`, where the row count
/// saturates at zero, so a `height` of 0 or 1 both yield `width`. All
/// quantities are in whatever unit the caller uses for `stride` and `width`
/// (the 16 bpc wrappers in this file pass a doubled width).
#[cfg(target_arch = "x86_64")]
fn compute_ipred_buf_len(stride: isize, width: usize, height: usize) -> usize {
    let full_rows = height.saturating_sub(1);
    let row_pitch = stride.unsigned_abs();
    full_rows * row_pitch + width
}
/// Builds a byte slice around `tl_ptr` and returns it with the index of
/// `tl_ptr` inside that slice.
///
/// The slice starts `height + 2` bytes before `tl_ptr` and is
/// `width + 2 * height + 4` bytes long; the returned index is `height + 2`.
/// `width` and `height` are plain byte counts here.
///
/// # Safety
///
/// The whole range `[tl_ptr - (height + 2), tl_ptr + width + height + 2)` must
/// lie inside one allocation, be readable, and stay valid and unmodified for
/// the caller-chosen lifetime `'a`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
unsafe fn compute_topleft_slice<'a>(
    tl_ptr: *const u8,
    width: usize,
    height: usize,
) -> (&'a [u8], usize) {
    let before = height + 2;
    let after = width + height + 2;
    // SAFETY: the caller guarantees the range described in `# Safety`.
    let window = unsafe { std::slice::from_raw_parts(tl_ptr.sub(before), before + after) };
    (window, before)
}
/// Fills a `width` x `height` block of 16-bit pixels with `(bitdepth_max + 1) / 2`.
///
/// * `dst` is addressed in bytes; every pixel occupies two native-endian bytes.
/// * `dst_base` is the byte offset of the first pixel of row 0.
/// * `stride` is the signed byte distance between consecutive rows.
/// * `_token` is never read in the body.
///
/// Each row is written with 32-byte stores while 16 pixels remain, then one
/// 16-byte store if 8 pixels remain, then pixel by pixel.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_128_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    width: usize,
    height: usize,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let mid_val = ((bitdepth_max + 1) / 2) as u16;
    let mid_bytes = mid_val.to_ne_bytes();
    let fill_256 = _mm256_set1_epi16(mid_val as i16);
    // Loop-invariant: the low half of the 256-bit fill vector.
    let fill_128 = _mm256_castsi256_si128(fill_256);
    // First pixel index not covered by whole 16-pixel stores.
    let end_256 = width - width % 16;
    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        for x in (0..end_256).step_by(16) {
            let off = row_off + 2 * x;
            storeu_256!((&mut dst[off..off + 32]), [u8; 32], fill_256);
        }
        // Fewer than 16 pixels remain, so the 8-pixel store runs at most once.
        let tail_start = if end_256 + 8 <= width {
            let off = row_off + 2 * end_256;
            storeu_128!((&mut dst[off..off + 16]), [u8; 16], fill_128);
            end_256 + 8
        } else {
            end_256
        };
        for x in tail_start..width {
            let off = row_off + 2 * x;
            dst[off..off + 2].copy_from_slice(&mid_bytes);
        }
    }
}
/// `extern "C"` entry point for the 16 bpc `dc_128` predictor, compiled with AVX2 enabled.
///
/// Forges a `Desktop64` token, views `dst_ptr` as a byte slice sized from a
/// doubled `width`, and forwards to `ipred_dc_128_16bpc_inner` with a
/// destination base offset of 0. Parameters whose names start with `_` are
/// ignored.
///
/// # Safety
///
/// Nothing is validated here. The caller must ensure that `dst_ptr` is valid
/// for writes of `compute_ipred_buf_len(stride, 2 * width, height)` bytes and
/// that forging a `Desktop64` token is legitimate on the running CPU.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_dc_128_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    _topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let (w, h) = (width as usize, height as usize);
    let byte_stride = stride as isize;
    // SAFETY: delegated to the caller, see `# Safety`.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let dst_len = compute_ipred_buf_len(byte_stride, w * 2, h);
    // SAFETY: delegated to the caller, see `# Safety`.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    ipred_dc_128_16bpc_inner(
        token,
        dst_bytes,
        0,
        byte_stride,
        w,
        h,
        bitdepth_max as i32,
    );
}
/// Copies the row of 16-bit pixels that follows `tl_off` into every row of the block.
///
/// * `dst` / `topleft` are addressed in bytes; every pixel occupies two bytes.
/// * `dst_base` is the byte offset of the first pixel of row 0 and `stride`
///   the signed byte distance between rows.
/// * The source row starts at byte `tl_off + 2` of `topleft` and is `width`
///   pixels long.
/// * `_token` is never read in the body.
///
/// Each row is copied with 32-byte moves while 16 pixels remain, then one
/// 16-byte move if 8 pixels remain, then pixel by pixel.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_v_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let top_base = tl_off + 2;
    // First pixel index not covered by whole 16-pixel copies.
    let end_256 = width - width % 16;
    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        for x in (0..end_256).step_by(16) {
            let src = top_base + 2 * x;
            let px = loadu_256!((&topleft[src..src + 32]), [u8; 32]);
            let out = row_off + 2 * x;
            storeu_256!((&mut dst[out..out + 32]), [u8; 32], px);
        }
        // Fewer than 16 pixels remain, so the 8-pixel copy runs at most once.
        let tail_start = if end_256 + 8 <= width {
            let src = top_base + 2 * end_256;
            let px = loadu_128!((&topleft[src..src + 16]), [u8; 16]);
            let out = row_off + 2 * end_256;
            storeu_128!((&mut dst[out..out + 16]), [u8; 16], px);
            end_256 + 8
        } else {
            end_256
        };
        for x in tail_start..width {
            let src = top_base + 2 * x;
            let out = row_off + 2 * x;
            dst[out..out + 2].copy_from_slice(&topleft[src..src + 2]);
        }
    }
}
/// `extern "C"` entry point for the 16 bpc `v` predictor, compiled with AVX2 enabled.
///
/// Forges a `Desktop64` token, views `dst_ptr` and `topleft` as byte slices
/// (passing doubled `width` / `height` where byte counts are needed) and
/// forwards to `ipred_v_16bpc_inner` with a destination base offset of 0.
/// Parameters whose names start with `_` are ignored.
///
/// # Safety
///
/// Nothing is validated here. The caller must ensure that
/// * `dst_ptr` is valid for writes of
///   `compute_ipred_buf_len(stride, 2 * width, height)` bytes,
/// * `topleft` satisfies `compute_topleft_slice` for `2 * width`, `2 * height`, and
/// * forging a `Desktop64` token is legitimate on the running CPU.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_v_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let (w, h) = (width as usize, height as usize);
    let byte_stride = stride as isize;
    // SAFETY: delegated to the caller, see `# Safety`.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let dst_len = compute_ipred_buf_len(byte_stride, w * 2, h);
    // SAFETY: delegated to the caller, see `# Safety`.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    // SAFETY: delegated to the caller, see `# Safety`.
    let (edge, edge_off) = unsafe { compute_topleft_slice(topleft as *const u8, w * 2, h * 2) };
    ipred_v_16bpc_inner(token, dst_bytes, 0, byte_stride, edge, edge_off, w, h);
}
/// Fills each row of the block with the 16-bit pixel stored before `tl_off` for that row.
///
/// * `dst` / `topleft` are addressed in bytes; every pixel occupies two
///   native-endian bytes.
/// * `dst_base` is the byte offset of the first pixel of row 0 and `stride`
///   the signed byte distance between rows.
/// * Row `y` uses the pixel at byte `tl_off - 2 * (y + 1)` of `topleft`.
/// * `_token` is never read in the body.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_h_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    // First pixel index not covered by whole 16-pixel stores.
    let end_256 = width - width % 16;
    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let left_off = tl_off - (y + 1) * 2;
        let left_px = u16::from_ne_bytes(topleft[left_off..left_off + 2].try_into().unwrap());
        let left_bytes = left_px.to_ne_bytes();
        let fill_256 = _mm256_set1_epi16(left_px as i16);
        for x in (0..end_256).step_by(16) {
            let off = row_off + 2 * x;
            storeu_256!((&mut dst[off..off + 32]), [u8; 32], fill_256);
        }
        // Fewer than 16 pixels remain, so the 8-pixel store runs at most once.
        let tail_start = if end_256 + 8 <= width {
            let off = row_off + 2 * end_256;
            storeu_128!(
                (&mut dst[off..off + 16]),
                [u8; 16],
                _mm256_castsi256_si128(fill_256)
            );
            end_256 + 8
        } else {
            end_256
        };
        for x in tail_start..width {
            let off = row_off + 2 * x;
            dst[off..off + 2].copy_from_slice(&left_bytes);
        }
    }
}
/// `extern "C"` entry point for the 16 bpc `h` predictor, compiled with AVX2 enabled.
///
/// Forges a `Desktop64` token, views `dst_ptr` and `topleft` as byte slices
/// (passing doubled `width` / `height` where byte counts are needed) and
/// forwards to `ipred_h_16bpc_inner` with a destination base offset of 0.
/// Parameters whose names start with `_` are ignored.
///
/// # Safety
///
/// Nothing is validated here. The caller must ensure that
/// * `dst_ptr` is valid for writes of
///   `compute_ipred_buf_len(stride, 2 * width, height)` bytes,
/// * `topleft` satisfies `compute_topleft_slice` for `2 * width`, `2 * height`, and
/// * forging a `Desktop64` token is legitimate on the running CPU.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_h_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let (w, h) = (width as usize, height as usize);
    let byte_stride = stride as isize;
    // SAFETY: delegated to the caller, see `# Safety`.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let dst_len = compute_ipred_buf_len(byte_stride, w * 2, h);
    // SAFETY: delegated to the caller, see `# Safety`.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    // SAFETY: delegated to the caller, see `# Safety`.
    let (edge, edge_off) = unsafe { compute_topleft_slice(topleft as *const u8, w * 2, h * 2) };
    ipred_h_16bpc_inner(token, dst_bytes, 0, byte_stride, edge, edge_off, w, h);
}
/// 512-bit variant: fills a `width` x `height` block of 16-bit pixels with
/// `(bitdepth_max + 1) / 2`.
///
/// * `dst` is addressed in bytes; every pixel occupies two native-endian bytes.
/// * `dst_base` is the byte offset of the first pixel of row 0 and `stride`
///   the signed byte distance between rows.
/// * `_token` is never read in the body.
///
/// Each row is written with 64-byte stores while 32 pixels remain, then at
/// most one 32-byte and one 16-byte store, then pixel by pixel.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_128_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    width: usize,
    height: usize,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let mid_val = ((bitdepth_max + 1) / 2) as u16;
    let mid_bytes = mid_val.to_ne_bytes();
    let fill_512 = _mm512_set1_epi16(mid_val as i16);
    let fill_256 = _mm256_set1_epi16(mid_val as i16);
    // First pixel index not covered by whole 32-pixel stores.
    let end_512 = width - width % 32;
    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        for x in (0..end_512).step_by(32) {
            let off = row_off + 2 * x;
            storeu_512!((&mut dst[off..off + 64]), [u8; 64], fill_512);
        }
        // Fewer than 32 pixels remain: each narrower store runs at most once.
        let after_256 = if end_512 + 16 <= width {
            let off = row_off + 2 * end_512;
            storeu_256!((&mut dst[off..off + 32]), [u8; 32], fill_256);
            end_512 + 16
        } else {
            end_512
        };
        let tail_start = if after_256 + 8 <= width {
            let off = row_off + 2 * after_256;
            storeu_128!(
                (&mut dst[off..off + 16]),
                [u8; 16],
                _mm256_castsi256_si128(fill_256)
            );
            after_256 + 8
        } else {
            after_256
        };
        for x in tail_start..width {
            let off = row_off + 2 * x;
            dst[off..off + 2].copy_from_slice(&mid_bytes);
        }
    }
}
/// 512-bit variant: copies the row of 16-bit pixels that follows `tl_off`
/// into every row of the block.
///
/// * `dst` / `topleft` are addressed in bytes; every pixel occupies two bytes.
/// * `dst_base` is the byte offset of the first pixel of row 0 and `stride`
///   the signed byte distance between rows.
/// * The source row starts at byte `tl_off + 2` of `topleft` and is `width`
///   pixels long.
/// * `_token` is never read in the body.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_v_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let top_base = tl_off + 2;
    // First pixel index not covered by whole 32-pixel copies.
    let end_512 = width - width % 32;
    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        for x in (0..end_512).step_by(32) {
            let src = top_base + 2 * x;
            let px = loadu_512!((&topleft[src..src + 64]), [u8; 64]);
            let out = row_off + 2 * x;
            storeu_512!((&mut dst[out..out + 64]), [u8; 64], px);
        }
        // Fewer than 32 pixels remain: each narrower copy runs at most once.
        let after_256 = if end_512 + 16 <= width {
            let src = top_base + 2 * end_512;
            let px = loadu_256!((&topleft[src..src + 32]), [u8; 32]);
            let out = row_off + 2 * end_512;
            storeu_256!((&mut dst[out..out + 32]), [u8; 32], px);
            end_512 + 16
        } else {
            end_512
        };
        let tail_start = if after_256 + 8 <= width {
            let src = top_base + 2 * after_256;
            let px = loadu_128!((&topleft[src..src + 16]), [u8; 16]);
            let out = row_off + 2 * after_256;
            storeu_128!((&mut dst[out..out + 16]), [u8; 16], px);
            after_256 + 8
        } else {
            after_256
        };
        for x in tail_start..width {
            let src = top_base + 2 * x;
            let out = row_off + 2 * x;
            dst[out..out + 2].copy_from_slice(&topleft[src..src + 2]);
        }
    }
}
/// 512-bit variant: fills each row of the block with the 16-bit pixel stored
/// before `tl_off` for that row.
///
/// * `dst` / `topleft` are addressed in bytes; every pixel occupies two
///   native-endian bytes.
/// * `dst_base` is the byte offset of the first pixel of row 0 and `stride`
///   the signed byte distance between rows.
/// * Row `y` uses the pixel at byte `tl_off - 2 * (y + 1)` of `topleft`.
/// * `_token` is never read in the body.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_h_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    // First pixel index not covered by whole 32-pixel stores.
    let end_512 = width - width % 32;
    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let left_off = tl_off - (y + 1) * 2;
        let left_px = u16::from_ne_bytes(topleft[left_off..left_off + 2].try_into().unwrap());
        let left_bytes = left_px.to_ne_bytes();
        let fill_512 = _mm512_set1_epi16(left_px as i16);
        let fill_256 = _mm256_set1_epi16(left_px as i16);
        for x in (0..end_512).step_by(32) {
            let off = row_off + 2 * x;
            storeu_512!((&mut dst[off..off + 64]), [u8; 64], fill_512);
        }
        // Fewer than 32 pixels remain: each narrower store runs at most once.
        let after_256 = if end_512 + 16 <= width {
            let off = row_off + 2 * end_512;
            storeu_256!((&mut dst[off..off + 32]), [u8; 32], fill_256);
            end_512 + 16
        } else {
            end_512
        };
        let tail_start = if after_256 + 8 <= width {
            let off = row_off + 2 * after_256;
            storeu_128!(
                (&mut dst[off..off + 16]),
                [u8; 16],
                _mm256_castsi256_si128(fill_256)
            );
            after_256 + 8
        } else {
            after_256
        };
        for x in tail_start..width {
            let off = row_off + 2 * x;
            dst[off..off + 2].copy_from_slice(&left_bytes);
        }
    }
}
/// 512-bit variant: fills the block with the rounded mean of the `width`
/// pixels after `tl_off` and the `height` pixels before it.
///
/// * `dst` / `topleft` are addressed in bytes; every pixel occupies two
///   native-endian bytes.
/// * `dst_base` is the byte offset of the first pixel of row 0 and `stride`
///   the signed byte distance between rows.
/// * The mean is `(sum + count / 2) / count` with `count = width + height`,
///   accumulated in `u32`.
/// * `_token` is never read in the body.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let sample =
        |off: usize| u16::from_ne_bytes(topleft[off..off + 2].try_into().unwrap()) as u32;
    // Pixels after `tl_off` first, then the ones before it.
    let after_sum = (1..=width).fold(0u32, |acc, i| acc + sample(tl_off + i * 2));
    let sum = (1..=height).fold(after_sum, |acc, i| acc + sample(tl_off - i * 2));
    let count = (width + height) as u32;
    let avg = ((sum + count / 2) / count) as u16;
    let avg_bytes = avg.to_ne_bytes();
    let fill_512 = _mm512_set1_epi16(avg as i16);
    let fill_256 = _mm256_set1_epi16(avg as i16);
    // First pixel index not covered by whole 32-pixel stores.
    let end_512 = width - width % 32;
    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        for x in (0..end_512).step_by(32) {
            let off = row_off + 2 * x;
            storeu_512!((&mut dst[off..off + 64]), [u8; 64], fill_512);
        }
        // Fewer than 32 pixels remain: each narrower store runs at most once.
        let after_256 = if end_512 + 16 <= width {
            let off = row_off + 2 * end_512;
            storeu_256!((&mut dst[off..off + 32]), [u8; 32], fill_256);
            end_512 + 16
        } else {
            end_512
        };
        let tail_start = if after_256 + 8 <= width {
            let off = row_off + 2 * after_256;
            storeu_128!(
                (&mut dst[off..off + 16]),
                [u8; 16],
                _mm256_castsi256_si128(fill_256)
            );
            after_256 + 8
        } else {
            after_256
        };
        for x in tail_start..width {
            let off = row_off + 2 * x;
            dst[off..off + 2].copy_from_slice(&avg_bytes);
        }
    }
}
/// 512-bit variant: fills the block with the rounded mean of the `width`
/// pixels that follow `tl_off`.
///
/// * `dst` / `topleft` are addressed in bytes; every pixel occupies two
///   native-endian bytes.
/// * `dst_base` is the byte offset of the first pixel of row 0 and `stride`
///   the signed byte distance between rows.
/// * The mean is `(sum + width / 2) / width`, accumulated in `u32`.
/// * `_token` is never read in the body.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_top_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let sum = (1..=width).fold(0u32, |acc, i| {
        let off = tl_off + i * 2;
        acc + u16::from_ne_bytes(topleft[off..off + 2].try_into().unwrap()) as u32
    });
    let n = width as u32;
    let avg = ((sum + n / 2) / n) as u16;
    let avg_bytes = avg.to_ne_bytes();
    let fill_512 = _mm512_set1_epi16(avg as i16);
    let fill_256 = _mm256_set1_epi16(avg as i16);
    // First pixel index not covered by whole 32-pixel stores.
    let end_512 = width - width % 32;
    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        for x in (0..end_512).step_by(32) {
            let off = row_off + 2 * x;
            storeu_512!((&mut dst[off..off + 64]), [u8; 64], fill_512);
        }
        // Fewer than 32 pixels remain: each narrower store runs at most once.
        let after_256 = if end_512 + 16 <= width {
            let off = row_off + 2 * end_512;
            storeu_256!((&mut dst[off..off + 32]), [u8; 32], fill_256);
            end_512 + 16
        } else {
            end_512
        };
        let tail_start = if after_256 + 8 <= width {
            let off = row_off + 2 * after_256;
            storeu_128!(
                (&mut dst[off..off + 16]),
                [u8; 16],
                _mm256_castsi256_si128(fill_256)
            );
            after_256 + 8
        } else {
            after_256
        };
        for x in tail_start..width {
            let off = row_off + 2 * x;
            dst[off..off + 2].copy_from_slice(&avg_bytes);
        }
    }
}
/// 512-bit variant: fills the block with the rounded mean of the `height`
/// pixels that precede `tl_off`.
///
/// * `dst` / `topleft` are addressed in bytes; every pixel occupies two
///   native-endian bytes.
/// * `dst_base` is the byte offset of the first pixel of row 0 and `stride`
///   the signed byte distance between rows.
/// * The mean is `(sum + height / 2) / height`, accumulated in `u32`.
/// * `_token` is never read in the body.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_left_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let sum = (1..=height).fold(0u32, |acc, i| {
        let off = tl_off - i * 2;
        acc + u16::from_ne_bytes(topleft[off..off + 2].try_into().unwrap()) as u32
    });
    let n = height as u32;
    let avg = ((sum + n / 2) / n) as u16;
    let avg_bytes = avg.to_ne_bytes();
    let fill_512 = _mm512_set1_epi16(avg as i16);
    let fill_256 = _mm256_set1_epi16(avg as i16);
    // First pixel index not covered by whole 32-pixel stores.
    let end_512 = width - width % 32;
    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        for x in (0..end_512).step_by(32) {
            let off = row_off + 2 * x;
            storeu_512!((&mut dst[off..off + 64]), [u8; 64], fill_512);
        }
        // Fewer than 32 pixels remain: each narrower store runs at most once.
        let after_256 = if end_512 + 16 <= width {
            let off = row_off + 2 * end_512;
            storeu_256!((&mut dst[off..off + 32]), [u8; 32], fill_256);
            end_512 + 16
        } else {
            end_512
        };
        let tail_start = if after_256 + 8 <= width {
            let off = row_off + 2 * after_256;
            storeu_128!(
                (&mut dst[off..off + 16]),
                [u8; 16],
                _mm256_castsi256_si128(fill_256)
            );
            after_256 + 8
        } else {
            after_256
        };
        for x in tail_start..width {
            let off = row_off + 2 * x;
            dst[off..off + 2].copy_from_slice(&avg_bytes);
        }
    }
}
/// Fills the block with the rounded mean of the `width` pixels after `tl_off`
/// and the `height` pixels before it.
///
/// * `dst` / `topleft` are addressed in bytes; every pixel occupies two
///   native-endian bytes.
/// * `dst_base` is the byte offset of the first pixel of row 0 and `stride`
///   the signed byte distance between rows.
/// * The mean is `(sum + count / 2) / count` with `count = width + height`,
///   accumulated in `u32`.
/// * `_token` is never read in the body.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let sample =
        |off: usize| u16::from_ne_bytes(topleft[off..off + 2].try_into().unwrap()) as u32;
    // Pixels after `tl_off` first, then the ones before it.
    let after_sum = (1..=width).fold(0u32, |acc, i| acc + sample(tl_off + i * 2));
    let sum = (1..=height).fold(after_sum, |acc, i| acc + sample(tl_off - i * 2));
    let count = (width + height) as u32;
    let avg = ((sum + count / 2) / count) as u16;
    let avg_bytes = avg.to_ne_bytes();
    let fill_256 = _mm256_set1_epi16(avg as i16);
    // First pixel index not covered by whole 16-pixel stores.
    let end_256 = width - width % 16;
    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        for x in (0..end_256).step_by(16) {
            let off = row_off + 2 * x;
            storeu_256!((&mut dst[off..off + 32]), [u8; 32], fill_256);
        }
        // Fewer than 16 pixels remain, so the 8-pixel store runs at most once.
        let tail_start = if end_256 + 8 <= width {
            let off = row_off + 2 * end_256;
            storeu_128!(
                (&mut dst[off..off + 16]),
                [u8; 16],
                _mm256_castsi256_si128(fill_256)
            );
            end_256 + 8
        } else {
            end_256
        };
        for x in tail_start..width {
            let off = row_off + 2 * x;
            dst[off..off + 2].copy_from_slice(&avg_bytes);
        }
    }
}
/// `extern "C"` entry point for the 16 bpc `dc` predictor, compiled with AVX2 enabled.
///
/// Forges a `Desktop64` token, views `dst_ptr` and `topleft` as byte slices
/// (passing doubled `width` / `height` where byte counts are needed) and
/// forwards to `ipred_dc_16bpc_inner` with a destination base offset of 0.
/// Parameters whose names start with `_` are ignored.
///
/// # Safety
///
/// Nothing is validated here. The caller must ensure that
/// * `dst_ptr` is valid for writes of
///   `compute_ipred_buf_len(stride, 2 * width, height)` bytes,
/// * `topleft` satisfies `compute_topleft_slice` for `2 * width`, `2 * height`, and
/// * forging a `Desktop64` token is legitimate on the running CPU.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_dc_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let (w, h) = (width as usize, height as usize);
    let byte_stride = stride as isize;
    // SAFETY: delegated to the caller, see `# Safety`.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let dst_len = compute_ipred_buf_len(byte_stride, w * 2, h);
    // SAFETY: delegated to the caller, see `# Safety`.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    // SAFETY: delegated to the caller, see `# Safety`.
    let (edge, edge_off) = unsafe { compute_topleft_slice(topleft as *const u8, w * 2, h * 2) };
    ipred_dc_16bpc_inner(token, dst_bytes, 0, byte_stride, edge, edge_off, w, h);
}
/// Fills the block with the rounded mean of the `width` pixels that follow `tl_off`.
///
/// * `dst` / `topleft` are addressed in bytes; every pixel occupies two
///   native-endian bytes.
/// * `dst_base` is the byte offset of the first pixel of row 0 and `stride`
///   the signed byte distance between rows.
/// * The mean is `(sum + width / 2) / width`, accumulated in `u32`.
/// * `_token` is never read in the body.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_top_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let sum = (1..=width).fold(0u32, |acc, i| {
        let off = tl_off + i * 2;
        acc + u16::from_ne_bytes(topleft[off..off + 2].try_into().unwrap()) as u32
    });
    let n = width as u32;
    let avg = ((sum + n / 2) / n) as u16;
    let avg_bytes = avg.to_ne_bytes();
    let fill_256 = _mm256_set1_epi16(avg as i16);
    // First pixel index not covered by whole 16-pixel stores.
    let end_256 = width - width % 16;
    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        for x in (0..end_256).step_by(16) {
            let off = row_off + 2 * x;
            storeu_256!((&mut dst[off..off + 32]), [u8; 32], fill_256);
        }
        // Fewer than 16 pixels remain, so the 8-pixel store runs at most once.
        let tail_start = if end_256 + 8 <= width {
            let off = row_off + 2 * end_256;
            storeu_128!(
                (&mut dst[off..off + 16]),
                [u8; 16],
                _mm256_castsi256_si128(fill_256)
            );
            end_256 + 8
        } else {
            end_256
        };
        for x in tail_start..width {
            let off = row_off + 2 * x;
            dst[off..off + 2].copy_from_slice(&avg_bytes);
        }
    }
}
/// `extern "C"` entry point for the 16 bpc `dc_top` predictor, compiled with AVX2 enabled.
///
/// Forges a `Desktop64` token, views `dst_ptr` and `topleft` as byte slices
/// (passing doubled `width` / `height` where byte counts are needed) and
/// forwards to `ipred_dc_top_16bpc_inner` with a destination base offset of 0.
/// Parameters whose names start with `_` are ignored.
///
/// # Safety
///
/// Nothing is validated here. The caller must ensure that
/// * `dst_ptr` is valid for writes of
///   `compute_ipred_buf_len(stride, 2 * width, height)` bytes,
/// * `topleft` satisfies `compute_topleft_slice` for `2 * width`, `2 * height`, and
/// * forging a `Desktop64` token is legitimate on the running CPU.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_dc_top_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let (w, h) = (width as usize, height as usize);
    let byte_stride = stride as isize;
    // SAFETY: delegated to the caller, see `# Safety`.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let dst_len = compute_ipred_buf_len(byte_stride, w * 2, h);
    // SAFETY: delegated to the caller, see `# Safety`.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    // SAFETY: delegated to the caller, see `# Safety`.
    let (edge, edge_off) = unsafe { compute_topleft_slice(topleft as *const u8, w * 2, h * 2) };
    ipred_dc_top_16bpc_inner(token, dst_bytes, 0, byte_stride, edge, edge_off, w, h);
}
/// Fills the block with the rounded mean of the `height` pixels that precede `tl_off`.
///
/// * `dst` / `topleft` are addressed in bytes; every pixel occupies two
///   native-endian bytes.
/// * `dst_base` is the byte offset of the first pixel of row 0 and `stride`
///   the signed byte distance between rows.
/// * The mean is `(sum + height / 2) / height`, accumulated in `u32`.
/// * `_token` is never read in the body.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_left_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let sum = (1..=height).fold(0u32, |acc, i| {
        let off = tl_off - i * 2;
        acc + u16::from_ne_bytes(topleft[off..off + 2].try_into().unwrap()) as u32
    });
    let n = height as u32;
    let avg = ((sum + n / 2) / n) as u16;
    let avg_bytes = avg.to_ne_bytes();
    let fill_256 = _mm256_set1_epi16(avg as i16);
    // First pixel index not covered by whole 16-pixel stores.
    let end_256 = width - width % 16;
    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        for x in (0..end_256).step_by(16) {
            let off = row_off + 2 * x;
            storeu_256!((&mut dst[off..off + 32]), [u8; 32], fill_256);
        }
        // Fewer than 16 pixels remain, so the 8-pixel store runs at most once.
        let tail_start = if end_256 + 8 <= width {
            let off = row_off + 2 * end_256;
            storeu_128!(
                (&mut dst[off..off + 16]),
                [u8; 16],
                _mm256_castsi256_si128(fill_256)
            );
            end_256 + 8
        } else {
            end_256
        };
        for x in tail_start..width {
            let off = row_off + 2 * x;
            dst[off..off + 2].copy_from_slice(&avg_bytes);
        }
    }
}
/// `extern "C"` entry point for the 16 bpc `dc_left` predictor, compiled with AVX2 enabled.
///
/// Forges a `Desktop64` token, views `dst_ptr` and `topleft` as byte slices
/// (passing doubled `width` / `height` where byte counts are needed) and
/// forwards to `ipred_dc_left_16bpc_inner` with a destination base offset of 0.
/// Parameters whose names start with `_` are ignored.
///
/// # Safety
///
/// Nothing is validated here. The caller must ensure that
/// * `dst_ptr` is valid for writes of
///   `compute_ipred_buf_len(stride, 2 * width, height)` bytes,
/// * `topleft` satisfies `compute_topleft_slice` for `2 * width`, `2 * height`, and
/// * forging a `Desktop64` token is legitimate on the running CPU.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_dc_left_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let (w, h) = (width as usize, height as usize);
    let byte_stride = stride as isize;
    // SAFETY: delegated to the caller, see `# Safety`.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let dst_len = compute_ipred_buf_len(byte_stride, w * 2, h);
    // SAFETY: delegated to the caller, see `# Safety`.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    // SAFETY: delegated to the caller, see `# Safety`.
    let (edge, edge_off) = unsafe { compute_topleft_slice(topleft as *const u8, w * 2, h * 2) };
    ipred_dc_left_16bpc_inner(token, dst_bytes, 0, byte_stride, edge, edge_off, w, h);
}
/// 512-bit Paeth predictor for 16-bit pixels.
///
/// For each output pixel, `base = left + top - corner` is formed and the one
/// of `left`, `top`, `corner` closest to `base` is written; ties prefer
/// `left`, then `top`.
///
/// * `dst` / `topleft` are addressed in bytes; every pixel occupies two
///   native-endian bytes.
/// * `corner` is the pixel at `tl_off`, `top` for column `x` is at
///   `tl_off + 2 * (x + 1)`, `left` for row `y` is at `tl_off - 2 * (y + 1)`.
/// * `dst_base` is the byte offset of the first pixel of row 0 and `stride`
///   the signed byte distance between rows.
/// * `_token` is never read in the body.
///
/// Sixteen pixels at a time are processed as 32-bit lanes; leftover columns
/// use the scalar formula.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_paeth_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let corner = u16::from_ne_bytes(topleft[tl_off..tl_off + 2].try_into().unwrap()) as i32;
    let corner_v = _mm512_set1_epi32(corner);
    // First column not covered by whole 16-pixel vectors.
    let simd_end = width - width % 16;
    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let left_off = tl_off - (y + 1) * 2;
        let left =
            u16::from_ne_bytes(topleft[left_off..left_off + 2].try_into().unwrap()) as i32;
        let left_v = _mm512_set1_epi32(left);
        for x in (0..simd_end).step_by(16) {
            let top_off = tl_off + (x + 1) * 2;
            let top_px = loadu_256!(&topleft[top_off..top_off + 32], [u8; 32]);
            let top_v = _mm512_cvtepu16_epi32(top_px);
            let base = _mm512_sub_epi32(_mm512_add_epi32(left_v, top_v), corner_v);
            let d_left = _mm512_abs_epi32(_mm512_sub_epi32(left_v, base));
            let d_top = _mm512_abs_epi32(_mm512_sub_epi32(top_v, base));
            let d_corner = _mm512_abs_epi32(_mm512_sub_epi32(corner_v, base));
            // `a <= b` is expressed as `!(a > b)` on the lane masks.
            let left_le_top = !_mm512_cmpgt_epi32_mask(d_left, d_top);
            let left_le_corner = !_mm512_cmpgt_epi32_mask(d_left, d_corner);
            let top_le_corner = !_mm512_cmpgt_epi32_mask(d_top, d_corner);
            let left_wins = left_le_top & left_le_corner;
            let top_wins = !left_wins & top_le_corner;
            let top_or_corner = _mm512_mask_blend_epi32(top_wins, corner_v, top_v);
            let picked = _mm512_mask_blend_epi32(left_wins, top_or_corner, left_v);
            let non_neg = _mm512_max_epi32(picked, _mm512_setzero_si512());
            let packed: __m256i = _mm512_cvtusepi32_epi16(non_neg);
            let off = row_off + x * 2;
            storeu_256!(&mut dst[off..off + 32], [u8; 32], packed);
        }
        for x in simd_end..width {
            let top_off = tl_off + (x + 1) * 2;
            let top =
                u16::from_ne_bytes(topleft[top_off..top_off + 2].try_into().unwrap()) as i32;
            let base = left + top - corner;
            let d_left = (left - base).abs();
            let d_top = (top - base).abs();
            let d_corner = (corner - base).abs();
            let pred = if d_left <= d_top.min(d_corner) {
                left
            } else if d_top <= d_corner {
                top
            } else {
                corner
            };
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&(pred as u16).to_ne_bytes());
        }
    }
}
/// 512-bit two-axis smooth predictor for 16-bit pixels.
///
/// Every output pixel is
/// `(w_v * top + (256 - w_v) * bottom + w_h * left + (256 - w_h) * right + 256) >> 9`,
/// with `w_v` / `w_h` taken from `dav1d_sm_weights[height..]` /
/// `dav1d_sm_weights[width..]`.
///
/// * `dst` / `topleft` are addressed in bytes; every pixel occupies two
///   native-endian bytes.
/// * `right` is the pixel at `tl_off + 2 * width`, `bottom` the pixel at
///   `tl_off - 2 * height`; `top` for column `x` is at `tl_off + 2 * (x + 1)`
///   and `left` for row `y` at `tl_off - 2 * (y + 1)`.
/// * `dst_base` is the byte offset of the first pixel of row 0 and `stride`
///   the signed byte distance between rows.
/// * `_token` is never read in the body.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let weights_hor = &dav1d_sm_weights[width..][..width];
    let weights_ver = &dav1d_sm_weights[height..][..height];
    let right_off = tl_off + width * 2;
    let right = u16::from_ne_bytes(topleft[right_off..right_off + 2].try_into().unwrap()) as i32;
    let bottom_off = tl_off - height * 2;
    let bottom =
        u16::from_ne_bytes(topleft[bottom_off..bottom_off + 2].try_into().unwrap()) as i32;
    let right_v = _mm512_set1_epi32(right);
    let bottom_v = _mm512_set1_epi32(bottom);
    let round_v = _mm512_set1_epi32(256);
    let scale_v = _mm512_set1_epi32(256);
    let zero_v = _mm512_setzero_si512();
    // First column not covered by whole 16-pixel vectors.
    let simd_end = width - width % 16;
    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let left_off = tl_off - (y + 1) * 2;
        let left =
            u16::from_ne_bytes(topleft[left_off..left_off + 2].try_into().unwrap()) as i32;
        let left_v = _mm512_set1_epi32(left);
        let w_v = weights_ver[y] as i32;
        let w_v_v = _mm512_set1_epi32(w_v);
        let w_v_rest = _mm512_sub_epi32(scale_v, w_v_v);
        for x in (0..simd_end).step_by(16) {
            let top_off = tl_off + (x + 1) * 2;
            let top_px = loadu_256!(&topleft[top_off..top_off + 32], [u8; 32]);
            let top_v = _mm512_cvtepu16_epi32(top_px);
            let w_h_px = loadu_128!(&weights_hor[x..x + 16], [u8; 16]);
            let w_h_v = _mm512_cvtepu8_epi32(w_h_px);
            let w_h_rest = _mm512_sub_epi32(scale_v, w_h_v);
            let vertical = _mm512_add_epi32(
                _mm512_mullo_epi32(w_v_v, top_v),
                _mm512_mullo_epi32(w_v_rest, bottom_v),
            );
            let horizontal = _mm512_add_epi32(
                _mm512_mullo_epi32(w_h_v, left_v),
                _mm512_mullo_epi32(w_h_rest, right_v),
            );
            let blended = _mm512_add_epi32(vertical, horizontal);
            let shifted = _mm512_srai_epi32::<9>(_mm512_add_epi32(blended, round_v));
            let non_neg = _mm512_max_epi32(shifted, zero_v);
            let packed: __m256i = _mm512_cvtusepi32_epi16(non_neg);
            let off = row_off + x * 2;
            storeu_256!(&mut dst[off..off + 32], [u8; 32], packed);
        }
        for x in simd_end..width {
            let top_off = tl_off + (1 + x) * 2;
            let top =
                u16::from_ne_bytes(topleft[top_off..top_off + 2].try_into().unwrap()) as i32;
            let w_h = weights_hor[x] as i32;
            let blended = w_v * top + (256 - w_v) * bottom + w_h * left + (256 - w_h) * right;
            let out = ((blended + 256) >> 9) as u16;
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&out.to_ne_bytes());
        }
    }
}
/// 512-bit vertical smooth predictor for 16-bit pixels.
///
/// Every output pixel is `(w_v * top + (256 - w_v) * bottom + 128) >> 8`, with
/// `w_v` taken from `dav1d_sm_weights[height..]` for the current row.
///
/// * `dst` / `topleft` are addressed in bytes; every pixel occupies two
///   native-endian bytes.
/// * `bottom` is the pixel at `tl_off - 2 * height`; `top` for column `x` is
///   at `tl_off + 2 * (x + 1)`.
/// * `dst_base` is the byte offset of the first pixel of row 0 and `stride`
///   the signed byte distance between rows.
/// * `_token` is never read in the body.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_v_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let weights_ver = &dav1d_sm_weights[height..][..height];
    let bottom_off = tl_off - height * 2;
    let bottom =
        u16::from_ne_bytes(topleft[bottom_off..bottom_off + 2].try_into().unwrap()) as i32;
    let bottom_v = _mm512_set1_epi32(bottom);
    let round_v = _mm512_set1_epi32(128);
    let scale_v = _mm512_set1_epi32(256);
    let zero_v = _mm512_setzero_si512();
    // First column not covered by whole 16-pixel vectors.
    let simd_end = width - width % 16;
    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let w_v = weights_ver[y] as i32;
        let w_v_v = _mm512_set1_epi32(w_v);
        let w_v_rest = _mm512_sub_epi32(scale_v, w_v_v);
        for x in (0..simd_end).step_by(16) {
            let top_off = tl_off + (x + 1) * 2;
            let top_px = loadu_256!(&topleft[top_off..top_off + 32], [u8; 32]);
            let top_v = _mm512_cvtepu16_epi32(top_px);
            let blended = _mm512_add_epi32(
                _mm512_mullo_epi32(w_v_v, top_v),
                _mm512_mullo_epi32(w_v_rest, bottom_v),
            );
            let shifted = _mm512_srai_epi32::<8>(_mm512_add_epi32(blended, round_v));
            let non_neg = _mm512_max_epi32(shifted, zero_v);
            let packed: __m256i = _mm512_cvtusepi32_epi16(non_neg);
            let off = row_off + x * 2;
            storeu_256!(&mut dst[off..off + 32], [u8; 32], packed);
        }
        for x in simd_end..width {
            let top_off = tl_off + (1 + x) * 2;
            let top =
                u16::from_ne_bytes(topleft[top_off..top_off + 2].try_into().unwrap()) as i32;
            let out = (w_v * top + (256 - w_v) * bottom + 128) >> 8;
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&(out as u16).to_ne_bytes());
        }
    }
}
/// 512-bit horizontal smooth predictor for 16-bit pixels.
///
/// Every output pixel is `(w_h * left + (256 - w_h) * right + 128) >> 8`, with
/// `w_h` taken from `dav1d_sm_weights[width..]` for the current column.
///
/// * `dst` / `topleft` are addressed in bytes; every pixel occupies two
///   native-endian bytes.
/// * `right` is the pixel at `tl_off + 2 * width`; `left` for row `y` is at
///   `tl_off - 2 * (y + 1)`.
/// * `dst_base` is the byte offset of the first pixel of row 0 and `stride`
///   the signed byte distance between rows.
/// * `_token` is never read in the body.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_h_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let weights_hor = &dav1d_sm_weights[width..][..width];
    let right_off = tl_off + width * 2;
    let right = u16::from_ne_bytes(topleft[right_off..right_off + 2].try_into().unwrap()) as i32;
    let right_v = _mm512_set1_epi32(right);
    let round_v = _mm512_set1_epi32(128);
    let scale_v = _mm512_set1_epi32(256);
    let zero_v = _mm512_setzero_si512();
    // First column not covered by whole 16-pixel vectors.
    let simd_end = width - width % 16;
    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let left_off = tl_off - (y + 1) * 2;
        let left =
            u16::from_ne_bytes(topleft[left_off..left_off + 2].try_into().unwrap()) as i32;
        let left_v = _mm512_set1_epi32(left);
        for x in (0..simd_end).step_by(16) {
            let w_h_px = loadu_128!(&weights_hor[x..x + 16], [u8; 16]);
            let w_h_v = _mm512_cvtepu8_epi32(w_h_px);
            let w_h_rest = _mm512_sub_epi32(scale_v, w_h_v);
            let blended = _mm512_add_epi32(
                _mm512_mullo_epi32(w_h_v, left_v),
                _mm512_mullo_epi32(w_h_rest, right_v),
            );
            let shifted = _mm512_srai_epi32::<8>(_mm512_add_epi32(blended, round_v));
            let non_neg = _mm512_max_epi32(shifted, zero_v);
            let packed: __m256i = _mm512_cvtusepi32_epi16(non_neg);
            let off = row_off + x * 2;
            storeu_256!(&mut dst[off..off + 32], [u8; 32], packed);
        }
        for x in simd_end..width {
            let w_h = weights_hor[x] as i32;
            let out = (w_h * left + (256 - w_h) * right + 128) >> 8;
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&(out as u16).to_ne_bytes());
        }
    }
}
/// Scalar Paeth predictor for 16-bit pixels.
///
/// For each output pixel, `base = left + top - corner` is formed and the one
/// of `left`, `top`, `corner` closest to `base` is written; ties prefer
/// `left`, then `top`.
///
/// * `dst` / `topleft` are addressed in bytes; every pixel occupies two
///   native-endian bytes.
/// * `corner` is the pixel at `tl_off`, `top` for column `x` is at
///   `tl_off + 2 * (x + 1)`, `left` for row `y` is at `tl_off - 2 * (y + 1)`.
/// * `dst_base` is the byte offset of the first pixel of row 0 and `stride`
///   the signed byte distance between rows.
/// * `_token` is never read in the body.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_paeth_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let read_px =
        |off: usize| u16::from_ne_bytes(topleft[off..off + 2].try_into().unwrap()) as i32;
    let corner = read_px(tl_off);
    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let left = read_px(tl_off - (y + 1) * 2);
        for x in 0..width {
            let top = read_px(tl_off + (x + 1) * 2);
            let base = left + top - corner;
            let d_left = (left - base).abs();
            let d_top = (top - base).abs();
            let d_corner = (corner - base).abs();
            let pred = if d_left <= d_top.min(d_corner) {
                left
            } else if d_top <= d_corner {
                top
            } else {
                corner
            };
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&(pred as u16).to_ne_bytes());
        }
    }
}
/// `extern "C"` entry point for the 16 bpc `paeth` predictor, compiled with AVX2 enabled.
///
/// Forges a `Desktop64` token, views `dst_ptr` and `topleft` as byte slices
/// (passing doubled `width` / `height` where byte counts are needed) and
/// forwards to `ipred_paeth_16bpc_inner` with a destination base offset of 0.
/// Parameters whose names start with `_` are ignored.
///
/// # Safety
///
/// Nothing is validated here. The caller must ensure that
/// * `dst_ptr` is valid for writes of
///   `compute_ipred_buf_len(stride, 2 * width, height)` bytes,
/// * `topleft` satisfies `compute_topleft_slice` for `2 * width`, `2 * height`, and
/// * forging a `Desktop64` token is legitimate on the running CPU.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_paeth_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let (w, h) = (width as usize, height as usize);
    let byte_stride = stride as isize;
    // SAFETY: delegated to the caller, see `# Safety`.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let dst_len = compute_ipred_buf_len(byte_stride, w * 2, h);
    // SAFETY: delegated to the caller, see `# Safety`.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    // SAFETY: delegated to the caller, see `# Safety`.
    let (edge, edge_off) = unsafe { compute_topleft_slice(topleft as *const u8, w * 2, h * 2) };
    ipred_paeth_16bpc_inner(token, dst_bytes, 0, byte_stride, edge, edge_off, w, h);
}
/// Scalar two-axis smooth predictor for 16-bit pixels.
///
/// Every output pixel is
/// `(w_v * top + (256 - w_v) * bottom + w_h * left + (256 - w_h) * right + 256) >> 9`,
/// with `w_v` / `w_h` taken from `dav1d_sm_weights[height..]` /
/// `dav1d_sm_weights[width..]`.
///
/// * `dst` / `topleft` are addressed in bytes; every pixel occupies two
///   native-endian bytes.
/// * `right` is the pixel at `tl_off + 2 * width`, `bottom` the pixel at
///   `tl_off - 2 * height`; `top` for column `x` is at `tl_off + 2 * (x + 1)`
///   and `left` for row `y` at `tl_off - 2 * (y + 1)`.
/// * `dst_base` is the byte offset of the first pixel of row 0 and `stride`
///   the signed byte distance between rows.
/// * `_token` is never read in the body.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let weights_hor = &dav1d_sm_weights[width..][..width];
    let weights_ver = &dav1d_sm_weights[height..][..height];
    let read_px =
        |off: usize| u16::from_ne_bytes(topleft[off..off + 2].try_into().unwrap()) as i32;
    let right = read_px(tl_off + width * 2);
    let bottom = read_px(tl_off - height * 2);
    // `weights_ver` has exactly `height` entries, so this visits rows 0..height.
    for (y, &wv) in weights_ver.iter().enumerate() {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let left = read_px(tl_off - (y + 1) * 2);
        let w_v = wv as i32;
        // `weights_hor` has exactly `width` entries, so this visits columns 0..width.
        for (x, &wh) in weights_hor.iter().enumerate() {
            let top = read_px(tl_off + (1 + x) * 2);
            let w_h = wh as i32;
            let vertical = w_v * top + (256 - w_v) * bottom;
            let horizontal = w_h * left + (256 - w_h) * right;
            let out = (vertical + horizontal + 256) >> 9;
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&(out as u16).to_ne_bytes());
        }
    }
}
/// `extern "C"` entry point for the 16 bpc `smooth` predictor, compiled with AVX2 enabled.
///
/// Forges a `Desktop64` token, views `dst_ptr` and `topleft` as byte slices
/// (passing doubled `width` / `height` where byte counts are needed) and
/// forwards to `ipred_smooth_16bpc_inner` with a destination base offset of 0.
/// Parameters whose names start with `_` are ignored.
///
/// # Safety
///
/// Nothing is validated here. The caller must ensure that
/// * `dst_ptr` is valid for writes of
///   `compute_ipred_buf_len(stride, 2 * width, height)` bytes,
/// * `topleft` satisfies `compute_topleft_slice` for `2 * width`, `2 * height`, and
/// * forging a `Desktop64` token is legitimate on the running CPU.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_smooth_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let (w, h) = (width as usize, height as usize);
    let byte_stride = stride as isize;
    // SAFETY: delegated to the caller, see `# Safety`.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let dst_len = compute_ipred_buf_len(byte_stride, w * 2, h);
    // SAFETY: delegated to the caller, see `# Safety`.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    // SAFETY: delegated to the caller, see `# Safety`.
    let (edge, edge_off) = unsafe { compute_topleft_slice(topleft as *const u8, w * 2, h * 2) };
    ipred_smooth_16bpc_inner(token, dst_bytes, 0, byte_stride, edge, edge_off, w, h);
}
/// Writes the 16-bit SMOOTH_V intra prediction for a `width` x `height` block.
///
/// Each pixel blends the top edge pixel of its column with the edge pixel
/// `height` positions before the top-left, using the per-row weight from
/// `dav1d_sm_weights`, and rounds with `(sum + 128) >> 8`.
///
/// * `dst`, `dst_base`, `stride` - destination bytes, byte offset of the first
///   row and signed byte distance between consecutive rows.
/// * `topleft`, `tl_off` - edge bytes and the byte offset of the top-left pixel.
/// * `width`, `height` - block size in pixels (two native-endian bytes each).
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_v_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    // Decodes the native-endian 16-bit edge pixel stored at byte offset `at`.
    let edge_px =
        |at: usize| -> i32 { u16::from_ne_bytes(topleft[at..at + 2].try_into().unwrap()) as i32 };
    let weights_ver = &dav1d_sm_weights[height..][..height];
    let bottom_val = edge_px(tl_off - height * 2);
    for (y, &weight) in weights_ver.iter().enumerate() {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let w_v = weight as i32;
        for x in 0..width {
            let top_val = edge_px(tl_off + (1 + x) * 2);
            let blended = (w_v * top_val + (256 - w_v) * bottom_val + 128) >> 8;
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&(blended as u16).to_ne_bytes());
        }
    }
}
/// C-ABI entry point for 16-bit SMOOTH_V prediction.
///
/// Turns the raw destination and edge pointers into byte slices and forwards
/// them to `ipred_smooth_v_16bpc_inner`. The parameters prefixed with `_` are
/// accepted for signature compatibility and ignored.
///
/// # Safety
///
/// `dst_ptr` and `topleft` are dereferenced through slices whose extents come
/// from `compute_ipred_buf_len` and `compute_topleft_slice`; presumably the
/// caller must guarantee both regions are valid for those extents (confirm
/// against the helpers). The CPU must support the features `Desktop64` stands for.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_smooth_v_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    // SAFETY: NOTE(review): relies on the caller having verified CPU support
    // before dispatching to this `_avx2` symbol - confirm.
    let simd_token = unsafe { Desktop64::forge_token_dangerously() };
    let (w, h) = (width as usize, height as usize);
    let row_stride: isize = stride;
    // Row width is passed in bytes: two bytes per 16-bit pixel.
    let dst_len = compute_ipred_buf_len(row_stride, w * 2, h);
    // SAFETY: NOTE(review): assumes `dst_ptr` is valid for `dst_len` bytes - confirm.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    let (edge_bytes, edge_off) = compute_topleft_slice(topleft as *const u8, w * 2, h * 2);
    ipred_smooth_v_16bpc_inner(
        simd_token,
        dst_bytes,
        0,
        row_stride,
        edge_bytes,
        edge_off,
        w,
        h,
    );
}
/// Writes the 16-bit SMOOTH_H intra prediction for a `width` x `height` block.
///
/// Each pixel blends the left edge pixel of its row with the edge pixel
/// `width` positions after the top-left, using the per-column weight from
/// `dav1d_sm_weights`, and rounds with `(sum + 128) >> 8`.
///
/// * `dst`, `dst_base`, `stride` - destination bytes, byte offset of the first
///   row and signed byte distance between consecutive rows.
/// * `topleft`, `tl_off` - edge bytes and the byte offset of the top-left pixel.
/// * `width`, `height` - block size in pixels (two native-endian bytes each).
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_h_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    // Decodes the native-endian 16-bit edge pixel stored at byte offset `at`.
    let edge_px =
        |at: usize| -> i32 { u16::from_ne_bytes(topleft[at..at + 2].try_into().unwrap()) as i32 };
    let weights_hor = &dav1d_sm_weights[width..][..width];
    let right_val = edge_px(tl_off + width * 2);
    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        // Left neighbours are stored in reverse order before the top-left pixel.
        let left_val = edge_px(tl_off - (y + 1) * 2);
        for (x, &weight) in weights_hor.iter().enumerate() {
            let w_h = weight as i32;
            let blended = (w_h * left_val + (256 - w_h) * right_val + 128) >> 8;
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&(blended as u16).to_ne_bytes());
        }
    }
}
/// C-ABI entry point for 16-bit SMOOTH_H prediction.
///
/// Turns the raw destination and edge pointers into byte slices and forwards
/// them to `ipred_smooth_h_16bpc_inner`. The parameters prefixed with `_` are
/// accepted for signature compatibility and ignored.
///
/// # Safety
///
/// `dst_ptr` and `topleft` are dereferenced through slices whose extents come
/// from `compute_ipred_buf_len` and `compute_topleft_slice`; presumably the
/// caller must guarantee both regions are valid for those extents (confirm
/// against the helpers). The CPU must support the features `Desktop64` stands for.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_smooth_h_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    // SAFETY: NOTE(review): relies on the caller having verified CPU support
    // before dispatching to this `_avx2` symbol - confirm.
    let simd_token = unsafe { Desktop64::forge_token_dangerously() };
    let (w, h) = (width as usize, height as usize);
    let row_stride: isize = stride;
    // Row width is passed in bytes: two bytes per 16-bit pixel.
    let dst_len = compute_ipred_buf_len(row_stride, w * 2, h);
    // SAFETY: NOTE(review): assumes `dst_ptr` is valid for `dst_len` bytes - confirm.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    let (edge_bytes, edge_off) = compute_topleft_slice(topleft as *const u8, w * 2, h * 2);
    ipred_smooth_h_16bpc_inner(
        simd_token,
        dst_bytes,
        0,
        row_stride,
        edge_bytes,
        edge_off,
        w,
        h,
    );
}
/// 16-bit directional intra prediction that reads only the top edge (Z1).
///
/// The top edge is first copied into a local `top_px` buffer, optionally
/// upsampled (4-tap kernel) or smoothed (5-tap kernel). Each output row then
/// walks that buffer at a fixed-point step `dx` and linearly interpolates
/// between neighbouring edge pixels with a 6-bit fraction.
///
/// * `dst`, `dst_base`, `stride` - destination bytes, byte offset of the first
///   row and signed byte distance between consecutive rows.
/// * `topleft`, `tl_off` - edge bytes and the byte offset of the top-left pixel.
/// * `width`, `height` - block size in pixels (two native-endian bytes each).
/// * `angle` - low 9 bits are the angle, bit 9 is the `is_sm` flag, and any bit
///   from bit 10 upwards enables the intra edge filter.
/// * `bitdepth_max` - upper clamp applied to the upsampled edge pixels.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_z1_16bpc_inner(
_token: Desktop64,
dst: &mut [u8],
dst_base: usize,
stride: isize,
topleft: &[u8],
tl_off: usize,
width: usize,
height: usize,
angle: i32,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let width_i = width as i32;
let height_i = height as i32;
// Unpack the flags carried in the upper bits of `angle`.
let is_sm = (angle >> 9) & 1 != 0;
let enable_intra_edge_filter = (angle >> 10) != 0;
let angle = angle & 511;
// Per-row horizontal step, looked up by half the angle.
let mut dx = dav1d_dr_intra_derivative[(angle >> 1) as usize] as i32;
// `tl_off` is a byte offset; pixels are two bytes wide.
let tl_pix = tl_off / 2;
// Reads the native-endian 16-bit edge pixel at pixel index `off`.
let rd = |off: usize| -> u16 {
let b = off * 2;
u16::from_ne_bytes(topleft[b..b + 2].try_into().unwrap())
};
let upsample_above = enable_intra_edge_filter
&& (90 - angle) < 40
&& (width_i + height_i) <= (16 >> is_sm as i32);
// Working copy of the (possibly upsampled or filtered) top edge.
let mut top_px = [0u16; 64 + 64];
let (max_base_x, base_inc);
if upsample_above {
// Double the edge resolution: even slots hold the source pixels, odd slots
// hold the 4-tap interpolation between them.
let kernel: [i8; 4] = [-1, 9, 9, -1];
let hsz = width_i + height_i;
let in_off = tl_pix + 1;
// Source indices (relative to `in_off`) are clamped to `from..=to - 1`.
let from = -1i32;
let to = width_i + std::cmp::min(width_i, height_i);
for i in 0..hsz - 1 {
top_px[(i * 2) as usize] =
rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
let mut s = 0i32;
for j in 0..4i32 {
s += rd(in_off.wrapping_add_signed((i + j - 1).clamp(from, to - 1) as isize))
as i32
* kernel[j as usize] as i32;
}
top_px[(i * 2 + 1) as usize] = ((s + 8) >> 4).clamp(0, bitdepth_max) as u16;
}
let i = hsz - 1;
top_px[(i * 2) as usize] = rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
// The edge now has twice as many samples, so the step doubles as well.
dx <<= 1;
max_base_x = (2 * (width_i + height_i) - 2) as usize;
base_inc = 2usize;
} else {
let filter_strength = if enable_intra_edge_filter {
get_filter_strength_simple(width_i + height_i, 90 - angle, is_sm)
} else {
0
};
if filter_strength != 0 {
// 5-tap smoothing kernels, selected by `filter_strength - 1`.
static KERNEL: [[u8; 5]; 3] = [[0, 4, 8, 4, 0], [0, 5, 6, 5, 0], [2, 4, 4, 4, 2]];
let in_off = tl_pix + 1;
let from = -1i32;
let to = width_i + std::cmp::min(width_i, height_i);
let lim_from = 0i32;
let lim_to = width_i + height_i;
let mut i = 0i32;
// Unfiltered head: with `lim_from == 0` this loop never runs.
while i < std::cmp::min(width_i + height_i, lim_from) {
top_px[i as usize] = rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
i += 1;
}
// Filtered span.
while i < std::cmp::min(lim_to, width_i + height_i) {
let mut s = 0i32;
for j in 0..5i32 {
s += rd(in_off.wrapping_add_signed((i - 2 + j).clamp(from, to - 1) as isize))
as i32
* KERNEL[(filter_strength - 1) as usize][j as usize] as i32;
}
top_px[i as usize] = ((s + 8) >> 4) as u16;
i += 1;
}
// Unfiltered tail: with `lim_to == width + height` this loop never runs.
while i < width_i + height_i {
top_px[i as usize] = rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
i += 1;
}
max_base_x = (width_i + height_i - 1) as usize;
} else {
// No filtering: plain copy of the pixels that follow the top-left.
for i in 0..width + std::cmp::min(width, height) {
top_px[i] = rd(tl_pix + 1 + i);
}
max_base_x = width + std::cmp::min(width, height) - 1;
}
base_inc = 1;
};
// Byte view of `top_px` for the SIMD loads below.
let top_bytes: &[u8] = zerocopy::IntoBytes::as_bytes(&top_px[..]);
let top_bytes = top_bytes.flex();
let rounding = _mm256_set1_epi32(32);
for y in 0..height_i {
// Fixed-point position along the edge: upper bits are the integer base,
// `frac` is the 6-bit fraction with its lowest bit masked off.
let xpos = (y + 1) * dx;
let frac = (xpos & 0x3e) as i32;
let inv_frac = 64 - frac;
let frac_vec = _mm256_set1_epi32(frac);
let inv_frac_vec = _mm256_set1_epi32(inv_frac);
let row_off = (dst_base as isize + y as isize * stride) as usize;
let base0 = (xpos >> 6) as usize;
let mut x = 0usize;
if base_inc == 1 {
// Vector path: 8 pixels per iteration while every lane stays below
// `max_base_x`.
while x + 8 <= width && base0 + x + 8 < max_base_x {
let base = base0 + x;
let load0 = base * 2;
let load1 = (base + 1) * 2;
let t0 = loadu_128!((&top_bytes[load0..load0 + 16]), [u8; 16]);
let t1 = loadu_128!((&top_bytes[load1..load1 + 16]), [u8; 16]);
// Widen to 32 bits, blend the two neighbours, round and shift.
let t0_w = _mm256_cvtepu16_epi32(t0);
let t1_w = _mm256_cvtepu16_epi32(t1);
let prod0 = _mm256_mullo_epi32(t0_w, inv_frac_vec);
let prod1 = _mm256_mullo_epi32(t1_w, frac_vec);
let sum = _mm256_add_epi32(_mm256_add_epi32(prod0, prod1), rounding);
let result = _mm256_srai_epi32::<6>(sum);
// The pack works per 128-bit lane, so gather the low 64 bits of each
// lane to restore pixel order.
let packed = _mm256_packus_epi32(result, result);
let lo = _mm256_castsi256_si128(packed);
let hi = _mm256_extracti128_si256::<1>(packed);
let combined = _mm_unpacklo_epi64(lo, hi);
let store_off = row_off + x * 2;
storeu_128!((&mut dst[store_off..store_off + 16]), [u8; 16], combined);
x += 8;
}
}
// Scalar path: remaining pixels and the whole row when upsampled.
while x < width {
let base = base0 + base_inc * x;
if base < max_base_x {
let t0 = top_px[base] as i32;
let t1 = top_px[base + 1] as i32;
let v = t0 * inv_frac + t1 * frac;
let off = row_off + x * 2;
dst[off..off + 2].copy_from_slice(&(((v + 32) >> 6) as u16).to_ne_bytes());
} else {
// Past the end of the edge: fill the rest of the row with the last pixel.
let fill_val = top_px[max_base_x];
for xx in x..width {
let off = row_off + xx * 2;
dst[off..off + 2].copy_from_slice(&fill_val.to_ne_bytes());
}
break;
}
x += 1;
}
}
}
/// C-ABI entry point for 16-bit Z1 directional prediction.
///
/// Turns the raw destination and edge pointers into byte slices and forwards
/// them, together with `angle` and `_bitdepth_max`, to `ipred_z1_16bpc_inner`.
/// `_max_width`, `_max_height`, `_topleft_off` and `_dst` are ignored.
///
/// # Safety
///
/// `dst_ptr` and `topleft` are dereferenced through slices whose extents come
/// from `compute_ipred_buf_len` and `compute_topleft_slice`; presumably the
/// caller must guarantee both regions are valid for those extents (confirm
/// against the helpers). The CPU must support the features `Desktop64` stands for.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_z1_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    // SAFETY: NOTE(review): relies on the caller having verified CPU support
    // before dispatching to this `_avx2` symbol - confirm.
    let simd_token = unsafe { Desktop64::forge_token_dangerously() };
    let (w, h) = (width as usize, height as usize);
    let row_stride: isize = stride;
    // Row width is passed in bytes: two bytes per 16-bit pixel.
    let dst_len = compute_ipred_buf_len(row_stride, w * 2, h);
    // SAFETY: NOTE(review): assumes `dst_ptr` is valid for `dst_len` bytes - confirm.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    let (edge_bytes, edge_off) = compute_topleft_slice(topleft as *const u8, w * 2, h * 2);
    ipred_z1_16bpc_inner(
        simd_token,
        dst_bytes,
        0,
        row_stride,
        edge_bytes,
        edge_off,
        w,
        h,
        angle as i32,
        _bitdepth_max as i32,
    );
}
/// 16-bit directional intra prediction that reads both the top and the left
/// edge (Z2).
///
/// Both edges are copied into one local buffer `edge_px` centred on the
/// top-left pixel (index `edge_tl`): the top row is stored after it, the left
/// column before it. Each edge is independently upsampled (4-tap kernel),
/// smoothed (5-tap kernel) or copied verbatim. Every output pixel is then
/// interpolated from the top edge when its projected top index is
/// non-negative and from the left edge otherwise.
///
/// * `dst`, `dst_base`, `stride` - destination bytes, byte offset of the first
///   row and signed byte distance between consecutive rows.
/// * `topleft`, `tl_off` - edge bytes and the byte offset of the top-left pixel.
/// * `width`, `height` - block size in pixels (two native-endian bytes each).
/// * `angle` - low 9 bits are the angle, bit 9 is the `is_sm` flag, and any bit
///   from bit 10 upwards enables the intra edge filter.
/// * `max_width`, `max_height` - bound the filtered span of the top and left
///   edge respectively.
/// * `bitdepth_max` - upper clamp applied to the upsampled edge pixels.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_z2_16bpc_inner(
_token: Desktop64,
dst: &mut [u8],
dst_base: usize,
stride: isize,
topleft: &[u8],
tl_off: usize,
width: usize,
height: usize,
angle: i32,
max_width: i32,
max_height: i32,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let width_i = width as i32;
let height_i = height as i32;
// Unpack the flags carried in the upper bits of `angle`.
let is_sm = (angle >> 9) & 1 != 0;
let enable_intra_edge_filter = (angle >> 10) != 0;
let angle = angle & 511;
// Vertical and horizontal steps, looked up from the angle's distance to 90
// and to 180 respectively.
let mut dy = dav1d_dr_intra_derivative[((angle - 90) >> 1) as usize] as i32;
let mut dx = dav1d_dr_intra_derivative[((180 - angle) >> 1) as usize] as i32;
let upsample_left = enable_intra_edge_filter
&& (180 - angle) < 40
&& (width_i + height_i) <= (16 >> is_sm as i32);
let upsample_above = enable_intra_edge_filter
&& (angle - 90) < 40
&& (width_i + height_i) <= (16 >> is_sm as i32);
// Combined edge buffer; `edge_tl` is the slot of the top-left pixel.
let mut edge_px = [0u16; 64 + 64 + 1];
let edge_tl = 64usize;
// Reads the native-endian 16-bit edge pixel at pixel index `off`.
let rd = |off: usize| -> u16 {
let b = off * 2;
u16::from_ne_bytes(topleft[b..b + 2].try_into().unwrap())
};
// `tl_off` is a byte offset; pixels are two bytes wide.
let tl_pix = tl_off / 2;
// --- Top edge ---
if upsample_above {
// Even slots hold the source pixels (starting at the top-left), odd slots
// hold the 4-tap interpolation between them.
let kernel: [i8; 4] = [-1, 9, 9, -1];
let hsz = width_i + 1;
let in_off = tl_pix;
for i in 0..hsz - 1 {
edge_px[edge_tl + (i * 2) as usize] = rd(in_off + i.clamp(0, hsz - 1) as usize);
let mut s = 0i32;
for j in 0..4i32 {
s += rd(in_off + (i + j - 1).clamp(0, hsz - 1) as usize) as i32
* kernel[j as usize] as i32;
}
edge_px[edge_tl + (i * 2 + 1) as usize] = ((s + 8) >> 4).clamp(0, bitdepth_max) as u16;
}
let i = hsz - 1;
edge_px[edge_tl + (i * 2) as usize] = rd(in_off + i.clamp(0, hsz - 1) as usize);
// Twice as many samples along the top, so the horizontal step doubles.
dx <<= 1;
} else {
let filter_strength = if enable_intra_edge_filter {
get_filter_strength_simple(width_i + height_i, angle - 90, is_sm)
} else {
0
};
if filter_strength != 0 {
// 5-tap smoothing kernels, selected by `filter_strength - 1`.
static KERNEL: [[u8; 5]; 3] = [[0, 4, 8, 4, 0], [0, 5, 6, 5, 0], [2, 4, 4, 4, 2]];
let in_off = tl_pix + 1;
// Source indices (relative to `in_off`) are clamped to `from..=to - 1`.
let from = -1i32;
let to = width_i;
let lim_from = 0i32;
let lim_to = max_width;
let mut i = 0i32;
// Unfiltered head: with `lim_from == 0` this loop never runs.
while i < std::cmp::min(width_i, lim_from) {
edge_px[edge_tl + 1 + i as usize] =
rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
i += 1;
}
// Filtered span, limited to the first `max_width` pixels.
while i < std::cmp::min(lim_to, width_i) {
let mut s = 0i32;
for j in 0..5i32 {
s += rd(in_off.wrapping_add_signed((i - 2 + j).clamp(from, to - 1) as isize))
as i32
* KERNEL[(filter_strength - 1) as usize][j as usize] as i32;
}
edge_px[edge_tl + 1 + i as usize] = ((s + 8) >> 4) as u16;
i += 1;
}
// Anything past `max_width` is copied unfiltered.
while i < width_i {
edge_px[edge_tl + 1 + i as usize] =
rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
i += 1;
}
} else {
// No filtering: plain copy of the `width` pixels after the top-left.
for i in 0..width {
edge_px[edge_tl + 1 + i] = rd(tl_pix + 1 + i);
}
}
}
// --- Left edge ---
if upsample_left {
// Same 4-tap upsampling, written into the `2 * height` slots before `edge_tl`.
let kernel: [i8; 4] = [-1, 9, 9, -1];
let hsz = height_i + 1;
let in_off = tl_pix - height;
for i in 0..hsz - 1 {
edge_px[edge_tl - height * 2 + (i * 2) as usize] =
rd(in_off + i.clamp(0, hsz - 1) as usize);
let mut s = 0i32;
for j in 0..4i32 {
s += rd(in_off + (i + j - 1).clamp(0, hsz - 1) as usize) as i32
* kernel[j as usize] as i32;
}
edge_px[edge_tl - height * 2 + (i * 2 + 1) as usize] =
((s + 8) >> 4).clamp(0, bitdepth_max) as u16;
}
let i = hsz - 1;
edge_px[edge_tl - height * 2 + (i * 2) as usize] =
rd(in_off + i.clamp(0, hsz - 1) as usize);
// Twice as many samples along the left, so the vertical step doubles.
dy <<= 1;
} else {
let filter_strength = if enable_intra_edge_filter {
get_filter_strength_simple(width_i + height_i, 180 - angle, is_sm)
} else {
0
};
if filter_strength != 0 {
// 5-tap smoothing kernels, selected by `filter_strength - 1`.
static KERNEL: [[u8; 5]; 3] = [[0, 4, 8, 4, 0], [0, 5, 6, 5, 0], [2, 4, 4, 4, 2]];
let in_off = tl_pix - height;
let from = 0i32;
let to = height_i + 1;
let lim_from = height_i - max_height;
let lim_to = height_i;
let mut i = 0i32;
// Pixels before `height - max_height` are copied unfiltered.
while i < std::cmp::min(height_i, lim_from) {
edge_px[edge_tl - height + i as usize] =
rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
i += 1;
}
// Filtered span.
while i < std::cmp::min(lim_to, height_i) {
let mut s = 0i32;
for j in 0..5i32 {
s += rd(in_off.wrapping_add_signed((i - 2 + j).clamp(from, to - 1) as isize))
as i32
* KERNEL[(filter_strength - 1) as usize][j as usize] as i32;
}
edge_px[edge_tl - height + i as usize] = ((s + 8) >> 4) as u16;
i += 1;
}
// Unfiltered tail: with `lim_to == height` this loop never runs.
while i < height_i {
edge_px[edge_tl - height + i as usize] =
rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
i += 1;
}
} else {
// No filtering: plain copy of the `height` pixels before the top-left.
for i in 0..height {
edge_px[edge_tl - height + i] = rd(tl_pix - height + i);
}
}
}
// The top-left pixel itself is always taken unmodified from the input.
edge_px[edge_tl] = rd(tl_pix);
// Byte view of `edge_px` for the loads below.
let edge_bytes: &[u8] = zerocopy::IntoBytes::as_bytes(edge_px.as_slice());
let edge = edge_bytes.flex();
let base_inc_x = 1 + upsample_above as usize;
// Index in `edge_px` that a `base_y` of zero maps to on the left edge.
let left = edge_tl - (1 + upsample_left as usize);
let rounding = _mm256_set1_epi32(32);
for y in 0..height_i {
// Fixed-point position along the top edge for this row; it moves left
// by `dx` per row and may become negative.
let xpos = ((1 + upsample_above as i32) << 6) - dx * (y + 1);
let base_x0 = xpos >> 6;
let frac_x = (xpos & 0x3e) as i32;
let inv_frac_x = 64 - frac_x;
let row_off = (dst_base as isize + y as isize * stride) as usize;
// Number of leading columns whose top-edge index
// (`base_x0 + base_inc_x * x`) is negative; those are predicted from
// the left edge instead.
let left_count = if base_x0 >= 0 {
0usize
} else {
let needed = (-base_x0) as usize;
needed.div_ceil(base_inc_x).min(width)
};
let mut x = 0usize;
// Left-edge columns (scalar).
while x < left_count {
let ypos = (y << (6 + upsample_left as i32)) - dy * (x as i32 + 1);
let base_y = ypos >> 6;
let frac_y = ypos & 0x3e;
let inv_frac_y = 64 - frac_y;
let l0_pix = left.wrapping_add_signed(-base_y as isize);
let l1_pix = left.wrapping_add_signed(-(base_y + 1) as isize);
let l0_off = l0_pix * 2;
let l1_off = l1_pix * 2;
let l0 = u16::from_ne_bytes(edge[l0_off..l0_off + 2].try_into().unwrap()) as i32;
let l1 = u16::from_ne_bytes(edge[l1_off..l1_off + 2].try_into().unwrap()) as i32;
let v = l0 * inv_frac_y + l1 * frac_y;
let off = row_off + x * 2;
dst[off..off + 2].copy_from_slice(&(((v + 32) >> 6) as u16).to_ne_bytes());
x += 1;
}
// Top-edge columns, vector path: 8 pixels per iteration, only when the
// top edge was not upsampled.
if base_inc_x == 1 {
while x + 8 <= width {
let base_x = (base_x0 + x as i32) as usize;
let load0 = (edge_tl + base_x) * 2;
let load1 = (edge_tl + base_x + 1) * 2;
// Leave the rest to the scalar loop if a 16-byte load would run past the buffer.
if load1 + 16 > edge.len() {
break;
}
let t0 = loadu_128!((&edge[load0..load0 + 16]), [u8; 16]);
let t1 = loadu_128!((&edge[load1..load1 + 16]), [u8; 16]);
// Widen to 32 bits, blend the two neighbours, round and shift.
let t0_w = _mm256_cvtepu16_epi32(t0);
let t1_w = _mm256_cvtepu16_epi32(t1);
let frac_vec = _mm256_set1_epi32(frac_x);
let inv_frac_vec = _mm256_set1_epi32(inv_frac_x);
let prod0 = _mm256_mullo_epi32(t0_w, inv_frac_vec);
let prod1 = _mm256_mullo_epi32(t1_w, frac_vec);
let sum = _mm256_add_epi32(_mm256_add_epi32(prod0, prod1), rounding);
let result = _mm256_srai_epi32::<6>(sum);
// The pack works per 128-bit lane, so gather the low 64 bits of each
// lane to restore pixel order.
let packed = _mm256_packus_epi32(result, result);
let lo = _mm256_castsi256_si128(packed);
let hi = _mm256_extracti128_si256::<1>(packed);
let combined = _mm_unpacklo_epi64(lo, hi);
let store_off = row_off + x * 2;
storeu_128!((&mut dst[store_off..store_off + 16]), [u8; 16], combined);
x += 8;
}
}
// Top-edge columns, scalar path for whatever is left.
while x < width {
let base_x = (base_x0 + (base_inc_x * x) as i32) as usize;
let t0_off = (edge_tl + base_x) * 2;
let t1_off = (edge_tl + base_x + 1) * 2;
// Stops the row (leaving the remaining pixels untouched) rather than
// reading past the end of the edge buffer.
if t1_off + 2 > edge.len() {
break;
}
let t0 = u16::from_ne_bytes(edge[t0_off..t0_off + 2].try_into().unwrap()) as i32;
let t1 = u16::from_ne_bytes(edge[t1_off..t1_off + 2].try_into().unwrap()) as i32;
let v = t0 * inv_frac_x + t1 * frac_x;
let off = row_off + x * 2;
dst[off..off + 2].copy_from_slice(&(((v + 32) >> 6) as u16).to_ne_bytes());
x += 1;
}
}
}
/// C-ABI entry point for 16-bit Z2 directional prediction.
///
/// Turns the raw destination and edge pointers into byte slices and forwards
/// them, together with `angle`, `max_width`, `max_height` and `_bitdepth_max`,
/// to `ipred_z2_16bpc_inner`. `_topleft_off` and `_dst` are ignored.
///
/// # Safety
///
/// `dst_ptr` and `topleft` are dereferenced through slices whose extents come
/// from `compute_ipred_buf_len` and `compute_topleft_slice`; presumably the
/// caller must guarantee both regions are valid for those extents (confirm
/// against the helpers). The CPU must support the features `Desktop64` stands for.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_z2_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    angle: c_int,
    max_width: c_int,
    max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    // SAFETY: NOTE(review): relies on the caller having verified CPU support
    // before dispatching to this `_avx2` symbol - confirm.
    let simd_token = unsafe { Desktop64::forge_token_dangerously() };
    let (w, h) = (width as usize, height as usize);
    let row_stride: isize = stride;
    // Row width is passed in bytes: two bytes per 16-bit pixel.
    let dst_len = compute_ipred_buf_len(row_stride, w * 2, h);
    // SAFETY: NOTE(review): assumes `dst_ptr` is valid for `dst_len` bytes - confirm.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    let (edge_bytes, edge_off) = compute_topleft_slice(topleft as *const u8, w * 2, h * 2);
    ipred_z2_16bpc_inner(
        simd_token,
        dst_bytes,
        0,
        row_stride,
        edge_bytes,
        edge_off,
        w,
        h,
        angle as i32,
        max_width as i32,
        max_height as i32,
        _bitdepth_max as i32,
    );
}
/// 16-bit directional intra prediction that reads only the left edge (Z3).
///
/// The left edge is optionally upsampled (4-tap kernel) or smoothed (5-tap
/// kernel) into the local `left_px` buffer; without either, pixels are read
/// straight from `topleft`. Output is produced column by column: each column
/// walks the edge at a fixed-point step `dy` and linearly interpolates between
/// neighbouring edge pixels with a 6-bit fraction.
///
/// * `dst`, `dst_base`, `stride` - destination bytes, byte offset of the first
///   row and signed byte distance between consecutive rows.
/// * `topleft`, `tl_off` - edge bytes and the byte offset of the top-left pixel.
/// * `width`, `height` - block size in pixels (two native-endian bytes each).
/// * `angle` - low 9 bits are the angle, bit 9 is the `is_sm` flag, and any bit
///   from bit 10 upwards enables the intra edge filter.
/// * `bitdepth_max` - upper clamp applied to the upsampled edge pixels.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_z3_16bpc_inner(
_token: Desktop64,
dst: &mut [u8],
dst_base: usize,
stride: isize,
topleft: &[u8],
tl_off: usize,
width: usize,
height: usize,
angle: i32,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let width_i = width as i32;
let height_i = height as i32;
// Unpack the flags carried in the upper bits of `angle`.
let is_sm = (angle >> 9) & 1 != 0;
let enable_intra_edge_filter = (angle >> 10) != 0;
let angle = angle & 511;
// Per-column vertical step, looked up from the angle's distance to 270.
let mut dy = dav1d_dr_intra_derivative[((270 - angle) >> 1) as usize] as usize;
// `tl_off` is a byte offset; pixels are two bytes wide.
let tl_pix = tl_off / 2;
// Reads the native-endian 16-bit edge pixel at pixel index `off`.
let rd = |off: usize| -> u16 {
let b = off * 2;
u16::from_ne_bytes(topleft[b..b + 2].try_into().unwrap())
};
let upsample_left = enable_intra_edge_filter
&& (angle - 180) < 40
&& (width_i + height_i) <= (16 >> is_sm as i32);
// Working copy of the upsampled or filtered left edge (unused otherwise).
let mut left_px = [0u16; 64 + 64];
// `left_off` is the slot in `left_px` that a base of zero maps to; larger
// bases index downwards from it.
let (left_off, max_base_y, base_inc);
let use_left_px;
if upsample_left {
// Even slots hold the source pixels, odd slots hold the 4-tap
// interpolation between them.
let kernel: [i8; 4] = [-1, 9, 9, -1];
let hsz = width_i + height_i;
let in_off = tl_pix - (width + height);
// Source indices (relative to `in_off`) are clamped to `from..=to - 1`.
let from = std::cmp::max(width_i - height_i, 0);
let to = width_i + height_i + 1;
for i in 0..hsz - 1 {
left_px[(i * 2) as usize] =
rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
let mut s = 0i32;
for j in 0..4i32 {
s += rd(in_off.wrapping_add_signed((i + j - 1).clamp(from, to - 1) as isize))
as i32
* kernel[j as usize] as i32;
}
left_px[(i * 2 + 1) as usize] = ((s + 8) >> 4).clamp(0, bitdepth_max) as u16;
}
let i = hsz - 1;
left_px[(i * 2) as usize] = rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
left_off = (2 * (width_i + height_i) - 2) as usize;
max_base_y = left_off;
// Twice as many samples along the edge, so the step doubles as well.
dy <<= 1;
base_inc = 2usize;
use_left_px = true;
} else {
let filter_strength = if enable_intra_edge_filter {
get_filter_strength_simple(width_i + height_i, angle - 180, is_sm)
} else {
0
};
if filter_strength != 0 {
// 5-tap smoothing kernels, selected by `filter_strength - 1`.
static KERNEL: [[u8; 5]; 3] = [[0, 4, 8, 4, 0], [0, 5, 6, 5, 0], [2, 4, 4, 4, 2]];
let in_off = tl_pix - (width + height);
let from = std::cmp::max(width_i - height_i, 0);
let to = width_i + height_i + 1;
let lim_from = 0i32;
let lim_to = width_i + height_i;
let mut i = 0i32;
// Unfiltered head: with `lim_from == 0` this loop never runs.
while i < std::cmp::min(width_i + height_i, lim_from) {
left_px[i as usize] =
rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
i += 1;
}
// Filtered span.
while i < std::cmp::min(lim_to, width_i + height_i) {
let mut s = 0i32;
for j in 0..5i32 {
s += rd(in_off.wrapping_add_signed((i - 2 + j).clamp(from, to - 1) as isize))
as i32
* KERNEL[(filter_strength - 1) as usize][j as usize] as i32;
}
left_px[i as usize] = ((s + 8) >> 4) as u16;
i += 1;
}
// Unfiltered tail: with `lim_to == width + height` this loop never runs.
while i < width_i + height_i {
left_px[i as usize] =
rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
i += 1;
}
left_off = (width_i + height_i - 1) as usize;
max_base_y = left_off;
use_left_px = true;
} else {
// No filtering: pixels are read directly from `topleft`, so `left_off`
// is not used.
left_off = 0; max_base_y = height + std::cmp::min(width, height) - 1;
use_left_px = false;
}
base_inc = 1;
};
for x in 0..width {
// Fixed-point position along the left edge for this column: upper bits
// are the integer base, `frac` is the 6-bit fraction with its lowest
// bit masked off.
let ypos = dy * (x + 1);
let frac = (ypos & 0x3e) as i32;
let inv_frac = 64 - frac;
for y in 0..height_i {
let base = (ypos >> 6) + base_inc * y as usize;
if base < max_base_y {
// Two neighbouring edge pixels, from the local buffer or directly
// from the input edge (walking backwards from the top-left).
let (l0, l1) = if use_left_px {
(
left_px[left_off - base] as i32,
left_px[left_off - base - 1] as i32,
)
} else {
(rd(tl_pix - base - 1) as i32, rd(tl_pix - base - 2) as i32)
};
let v = l0 * inv_frac + l1 * frac;
let pixel_off = (dst_base as isize + y as isize * stride) as usize + x * 2;
dst[pixel_off..pixel_off + 2]
.copy_from_slice(&(((v + 32) >> 6) as u16).to_ne_bytes());
} else {
// Past the end of the edge: fill the rest of the column with the last pixel.
let fill_val = if use_left_px {
left_px[left_off - max_base_y]
} else {
rd(tl_pix - max_base_y - 1)
};
for yy in y..height_i {
let pixel_off = (dst_base as isize + yy as isize * stride) as usize + x * 2;
dst[pixel_off..pixel_off + 2].copy_from_slice(&fill_val.to_ne_bytes());
}
break;
}
}
}
}
/// C-ABI entry point for 16-bit Z3 directional prediction.
///
/// Turns the raw destination and edge pointers into byte slices and forwards
/// them, together with `angle` and `_bitdepth_max`, to `ipred_z3_16bpc_inner`.
/// `_max_width`, `_max_height`, `_topleft_off` and `_dst` are ignored.
///
/// # Safety
///
/// `dst_ptr` and `topleft` are dereferenced through slices whose extents come
/// from `compute_ipred_buf_len` and `compute_topleft_slice`; presumably the
/// caller must guarantee both regions are valid for those extents (confirm
/// against the helpers). The CPU must support the features `Desktop64` stands for.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_z3_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    // SAFETY: NOTE(review): relies on the caller having verified CPU support
    // before dispatching to this `_avx2` symbol - confirm.
    let simd_token = unsafe { Desktop64::forge_token_dangerously() };
    let (w, h) = (width as usize, height as usize);
    let row_stride: isize = stride;
    // Row width is passed in bytes: two bytes per 16-bit pixel.
    let dst_len = compute_ipred_buf_len(row_stride, w * 2, h);
    // SAFETY: NOTE(review): assumes `dst_ptr` is valid for `dst_len` bytes - confirm.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    let (edge_bytes, edge_off) = compute_topleft_slice(topleft as *const u8, w * 2, h * 2);
    ipred_z3_16bpc_inner(
        simd_token,
        dst_bytes,
        0,
        row_stride,
        edge_bytes,
        edge_off,
        w,
        h,
        angle as i32,
        _bitdepth_max as i32,
    );
}
/// 16-bit recursive filter intra prediction.
///
/// The block is processed in tiles of 4 columns by 2 rows. Each tile is
/// computed by `filter_fn` from seven neighbours: `p0` (top-left), `p1..p4`
/// (the four pixels above) and `p5`, `p6` (the two pixels to the left). For
/// the first tile row / column the neighbours come from `topleft`; afterwards
/// they are read back from pixels already written to `dst`.
///
/// * `dst`, `dst_base`, `stride` - destination bytes, byte offset of the first
///   row and signed byte distance between consecutive rows.
/// * `topleft`, `tl_off`, `topleft_off` - edge bytes, a byte offset into them
///   and a pixel offset added on top of it to locate the top-left pixel.
/// * `width`, `height` - block size in pixels; `width` is rounded down to a
///   multiple of four.
/// * `filt_idx` - masked with 511 and used to select the row of
///   `dav1d_filter_intra_taps`.
/// * `bitdepth_max` - upper clamp for every output pixel.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_filter_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
    filt_idx: i32,
    bitdepth_max: i32,
    topleft_off: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    // Only whole 4-pixel tiles are produced.
    let width = width / 4 * 4;
    let filt_idx = (filt_idx as usize) & 511;
    let filter = &dav1d_filter_intra_taps[filt_idx];
    // Decodes the native-endian 16-bit edge pixel stored at byte offset `at`.
    let edge_px =
        |at: usize| -> i32 { u16::from_ne_bytes(topleft[at..at + 2].try_into().unwrap()) as i32 };
    for y in (0..height).step_by(2) {
        // Pixel offset of this tile row's top-left neighbour inside the edge.
        let cur_tl_off = topleft_off - y;
        let mut tl_pixel = edge_px(tl_off.wrapping_add(cur_tl_off * 2));
        let row0_off = (dst_base as isize + y as isize * stride) as usize;
        let row1_off = (dst_base as isize + (y + 1) as isize * stride) as usize;
        for x in (0..width).step_by(4) {
            // Four pixels above the tile: from the edge on the first tile row,
            // otherwise from the previously written destination row.
            let [p1, p2, p3, p4]: [i32; 4] = if y == 0 {
                let top_base = tl_off.wrapping_add((topleft_off + 1 + x) * 2);
                std::array::from_fn(|i| edge_px(top_base + 2 * i))
            } else {
                let top_row = (dst_base as isize + (y as isize - 1) * stride) as usize;
                let tb = top_row + x * 2;
                std::array::from_fn(|i| {
                    let at = tb + 2 * i;
                    u16::from_ne_bytes(dst[at..at + 2].try_into().unwrap()) as i32
                })
            };
            // Two pixels left of the tile: from the edge on the first tile
            // column, otherwise from the last column of the previous tile.
            let (p5, p6) = if x == 0 {
                let upper = tl_off.wrapping_add(cur_tl_off.wrapping_sub(1) * 2);
                let lower = tl_off.wrapping_add(cur_tl_off.wrapping_sub(2) * 2);
                (edge_px(upper), edge_px(lower))
            } else {
                let upper = row0_off + (x - 1) * 2;
                let lower = row1_off + (x - 1) * 2;
                (
                    u16::from_ne_bytes(dst[upper..upper + 2].try_into().unwrap()) as i32,
                    u16::from_ne_bytes(dst[lower..lower + 2].try_into().unwrap()) as i32,
                )
            };
            let neighbours = [tl_pixel, p1, p2, p3, p4, p5, p6];
            let taps = filter.as_slice();
            // Eight outputs per tile: four on the upper row, then four on the
            // lower row, each with its own tap set `FLT_INCR` further on.
            for (row_idx, row_off) in [row0_off, row1_off].into_iter().enumerate() {
                for xx in 0..4 {
                    let tap_off = (row_idx * 4 + xx) * FLT_INCR;
                    let acc = filter_fn(&taps[tap_off..], neighbours);
                    let val = ((acc + 8) >> 4).clamp(0, bitdepth_max) as u16;
                    let off = row_off + (x + xx) * 2;
                    dst[off..off + 2].copy_from_slice(&val.to_ne_bytes());
                }
            }
            // The last top pixel becomes the next tile's top-left neighbour.
            tl_pixel = p4;
        }
    }
}
/// C-ABI entry point for 16-bit filter intra prediction.
///
/// Turns the raw destination and edge pointers into byte slices and forwards
/// them, together with `filt_idx`, `bitdepth_max` and `topleft_off`, to
/// `ipred_filter_16bpc_inner`. `_max_width`, `_max_height` and `_dst` are ignored.
///
/// # Safety
///
/// `dst_ptr` and `topleft` are dereferenced through slices whose extents come
/// from `compute_ipred_buf_len` and `compute_topleft_slice`; presumably the
/// caller must guarantee both regions are valid for those extents (confirm
/// against the helpers). The CPU must support the features `Desktop64` stands for.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_filter_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    filt_idx: c_int,
    _max_width: c_int,
    _max_height: c_int,
    bitdepth_max: c_int,
    topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    // SAFETY: NOTE(review): relies on the caller having verified CPU support
    // before dispatching to this `_avx2` symbol - confirm.
    let simd_token = unsafe { Desktop64::forge_token_dangerously() };
    let (w, h) = (width as usize, height as usize);
    let row_stride: isize = stride;
    // Row width is passed in bytes: two bytes per 16-bit pixel.
    let dst_len = compute_ipred_buf_len(row_stride, w * 2, h);
    // SAFETY: NOTE(review): assumes `dst_ptr` is valid for `dst_len` bytes - confirm.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    let (edge_bytes, edge_off) = compute_topleft_slice(topleft as *const u8, w * 2, h * 2);
    ipred_filter_16bpc_inner(
        simd_token,
        dst_bytes,
        0,
        row_stride,
        edge_bytes,
        edge_off,
        w,
        h,
        filt_idx as i32,
        bitdepth_max as i32,
        topleft_off,
    );
}
#[cfg(target_arch = "x86_64")]
#[arcane]
fn cfl_pred_8bpc_inner(
_token: Desktop64,
dst: &mut [u8],
dst_base: usize,
stride: isize,
ac: &[i16],
width: usize,
height: usize,
dc: i32,
alpha: i32,
) {
let mut dst = dst.flex_mut();
let alpha_v = _mm256_set1_epi32(alpha);
let dc_v = _mm256_set1_epi32(dc);
let c32 = _mm256_set1_epi32(32);
for y in 0..height {
let row_off = (dst_base as isize + y as isize * stride) as usize;
let ac_off = y * width;
if width >= 8 {
let mut x = 0;
while x + 8 <= width {
let ac128 = loadu_128!(&ac[ac_off + x..ac_off + x + 8], [i16; 8]);
let ac32 = _mm256_cvtepi16_epi32(ac128);
let diff = _mm256_mullo_epi32(ac32, alpha_v);
let abs_diff = _mm256_abs_epi32(diff);
let plus32 = _mm256_add_epi32(abs_diff, c32);
let shifted = _mm256_srli_epi32::<6>(plus32);
let signed = _mm256_sign_epi32(shifted, diff);
let result = _mm256_add_epi32(dc_v, signed);
let p16 = _mm256_packus_epi32(result, result);
let p16_ordered = _mm256_permute4x64_epi64::<0b_00_00_10_00>(p16);
let p16_lo = _mm256_castsi256_si128(p16_ordered); let p8 = _mm_packus_epi16(p16_lo, p16_lo); let dst_chunk: &mut [u8; 8] =
(&mut dst[row_off + x..row_off + x + 8]).try_into().unwrap();
safe_unaligned_simd::x86_64::_mm_storeu_si64(dst_chunk, p8);
x += 8;
}
} else if width == 4 {
let mut buf = [0i16; 8];
buf[..4].copy_from_slice(&ac[ac_off..ac_off + 4]);
let ac128 = loadu_128!(&buf);
let ac32 = _mm256_cvtepi16_epi32(ac128);
let diff = _mm256_mullo_epi32(ac32, alpha_v);
let abs_diff = _mm256_abs_epi32(diff);
let plus32 = _mm256_add_epi32(abs_diff, c32);
let shifted = _mm256_srli_epi32::<6>(plus32);
let signed = _mm256_sign_epi32(shifted, diff);
let result = _mm256_add_epi32(dc_v, signed);
let p16 = _mm256_packus_epi32(result, result);
let p16_lo = _mm256_castsi256_si128(p16); let p8 = _mm_packus_epi16(p16_lo, p16_lo); let dst_chunk: &mut [u8; 4] = (&mut dst[row_off..row_off + 4]).try_into().unwrap();
safe_unaligned_simd::x86_64::_mm_storeu_si32(dst_chunk, p8);
}
}
}
#[cfg(target_arch = "x86_64")]
#[arcane]
fn cfl_pred_16bpc_inner(
_token: Desktop64,
dst: &mut [u8], dst_base: usize,
stride: isize,
ac: &[i16],
width: usize,
height: usize,
dc: i32,
alpha: i32,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let alpha_v = _mm256_set1_epi32(alpha);
let dc_v = _mm256_set1_epi32(dc);
let c32 = _mm256_set1_epi32(32);
let max_v = _mm256_set1_epi32(bitdepth_max);
let zero_v = _mm256_setzero_si256();
for y in 0..height {
let row_off = (dst_base as isize + y as isize * stride) as usize;
let ac_off = y * width;
if width >= 8 {
let mut x = 0;
while x + 8 <= width {
let ac128 = loadu_128!(&ac[ac_off + x..ac_off + x + 8], [i16; 8]);
let ac32 = _mm256_cvtepi16_epi32(ac128);
let diff = _mm256_mullo_epi32(ac32, alpha_v);
let abs_diff = _mm256_abs_epi32(diff);
let plus32 = _mm256_add_epi32(abs_diff, c32);
let shifted = _mm256_srli_epi32::<6>(plus32);
let signed = _mm256_sign_epi32(shifted, diff);
let mut result = _mm256_add_epi32(dc_v, signed);
result = _mm256_max_epi32(result, zero_v);
result = _mm256_min_epi32(result, max_v);
let p16 = _mm256_packus_epi32(result, result);
let p16_ordered = _mm256_permute4x64_epi64::<0b_00_00_10_00>(p16);
let p16_lo = _mm256_castsi256_si128(p16_ordered); let byte_x = x * 2;
let dst_chunk: &mut [u8; 16] = (&mut dst[row_off + byte_x..row_off + byte_x + 16])
.try_into()
.unwrap();
storeu_128!(dst_chunk, p16_lo);
x += 8;
}
} else if width == 4 {
let mut buf = [0i16; 8];
buf[..4].copy_from_slice(&ac[ac_off..ac_off + 4]);
let ac128 = loadu_128!(&buf);
let ac32 = _mm256_cvtepi16_epi32(ac128);
let diff = _mm256_mullo_epi32(ac32, alpha_v);
let abs_diff = _mm256_abs_epi32(diff);
let plus32 = _mm256_add_epi32(abs_diff, c32);
let shifted = _mm256_srli_epi32::<6>(plus32);
let signed = _mm256_sign_epi32(shifted, diff);
let mut result = _mm256_add_epi32(dc_v, signed);
result = _mm256_max_epi32(result, zero_v);
result = _mm256_min_epi32(result, max_v);
let p16 = _mm256_packus_epi32(result, result);
let p16_lo = _mm256_castsi256_si128(p16); let dst_chunk: &mut [u8; 8] = (&mut dst[row_off..row_off + 8]).try_into().unwrap();
safe_unaligned_simd::x86_64::_mm_storeu_si64(dst_chunk, p16_lo);
}
}
}
use crate::include::common::bitdepth::BitDepth;
use crate::src::internal::SCRATCH_EDGE_LEN;
/// Runs the x86-64 SIMD intra predictor selected by `mode` on the block at `dst`.
///
/// `mode` selects the predictor family (as shown by the kernels invoked below):
/// 0 = DC, 1 = vertical, 2 = horizontal, 3 = DC-left, 4 = DC-top, 5 = DC-128,
/// 6 / 7 / 8 = directional Z1 / Z2 / Z3, 9 = smooth, 10 = smooth-V,
/// 11 = smooth-H, 12 = Paeth, 13 = filter.
///
/// * `topleft` / `topleft_off` - edge pixel buffer and the pixel index passed to
///   the kernels as the top-left position. The buffer is handed over as raw
///   bytes, so for 16bpc the index is doubled to form a byte offset.
/// * `width`, `height` - block size in pixels (cast to `usize`).
/// * `angle`, `max_width`, `max_height` - forwarded to the directional kernels;
///   `angle` is also forwarded to the filter kernels.
/// * `bd` - bit-depth descriptor; its `into_c()` value is forwarded to the 16bpc
///   kernels that take one.
///
/// Kernel selection: when an AVX-512 token can be summoned the `*_avx512_inner`
/// kernels are used; the 8bpc directional modes instead use the `*_v4x_inner`
/// kernels when the extended AVX-512 token is available. Everything else runs
/// the AVX2 kernel. The 16bpc directional modes and both filter modes only
/// have an AVX2 call here.
///
/// Returns `false` immediately when AVX2 cannot be summoned. Otherwise the
/// closure handed to `with_pixel_guard_mut` yields `true` after running a
/// kernel and `false` for an unhandled `(BD::BPC, mode)` pair, and this
/// function returns whatever `with_pixel_guard_mut` returns (presumably the
/// closure's value - confirm against its definition).
#[cfg(target_arch = "x86_64")]
pub fn intra_pred_dispatch<BD: BitDepth>(
    mode: usize,
    dst: PicOffset,
    topleft: &[BD::Pixel; SCRATCH_EDGE_LEN],
    topleft_off: usize,
    width: c_int,
    height: c_int,
    angle: c_int,
    max_width: c_int,
    max_height: c_int,
    bd: BD,
) -> bool {
    use crate::include::common::bitdepth::BPC;
    use zerocopy::IntoBytes;

    let Some(token) = crate::src::cpu::summon_avx2() else {
        return false;
    };
    // The function itself is gated on x86_64, so the wider tokens can be
    // queried unconditionally; no non-x86 fallback binding is needed here.
    let avx512_token = crate::src::cpu::summon_avx512();
    let avx512x_token = crate::src::cpu::summon_avx512x();

    let w = width as usize;
    let h = height as usize;
    let bd_c = bd.into_c();
    // Kernels operate on a byte view of the edge buffer for both bit depths.
    let tl_bytes: &[u8] = topleft.as_bytes();

    crate::include::dav1d::picture::with_pixel_guard_mut::<BD, _>(
        &dst,
        w,
        h,
        |dst_bytes, dst_base_bytes, byte_stride| {
            // Edge-based predictors all share one argument list. Prefer the
            // AVX-512 kernel when its token was summoned, else use AVX2.
            // `$tl_off` is the top-left offset in the units the kernel expects.
            macro_rules! edge_kernel {
                ($avx512:ident, $avx2:ident, $tl_off:expr) => {
                    match avx512_token {
                        Some(t512) => $avx512(
                            t512,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            $tl_off,
                            w,
                            h,
                        ),
                        None => $avx2(
                            token,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            $tl_off,
                            w,
                            h,
                        ),
                    }
                };
            }

            // 8bpc directional predictors: same idea, keyed on the extended
            // AVX-512 token. Extra trailing arguments (Z2's max_width /
            // max_height) are appended after the angle. The calls are
            // statements, so any value the kernel returns is discarded.
            macro_rules! angular_kernel_8bpc {
                ($v4x:ident, $avx2:ident $(, $extra:expr)*) => {
                    match avx512x_token {
                        Some(t512x) => {
                            $v4x(
                                t512x,
                                dst_bytes,
                                dst_base_bytes,
                                byte_stride,
                                tl_bytes,
                                topleft_off,
                                w,
                                h,
                                angle as i32
                                $(, $extra)*
                            );
                        }
                        None => {
                            $avx2(
                                token,
                                dst_bytes,
                                dst_base_bytes,
                                byte_stride,
                                tl_bytes,
                                topleft_off,
                                w,
                                h,
                                angle as i32
                                $(, $extra)*
                            );
                        }
                    }
                };
            }

            match (BD::BPC, mode) {
                // ---- 8 bits per component ----
                (BPC::BPC8, 0) => {
                    edge_kernel!(ipred_dc_8bpc_avx512_inner, ipred_dc_8bpc_inner, topleft_off)
                }
                (BPC::BPC8, 1) => {
                    edge_kernel!(ipred_v_8bpc_avx512_inner, ipred_v_8bpc_inner, topleft_off)
                }
                (BPC::BPC8, 2) => {
                    edge_kernel!(ipred_h_8bpc_avx512_inner, ipred_h_8bpc_inner, topleft_off)
                }
                (BPC::BPC8, 3) => edge_kernel!(
                    ipred_dc_left_8bpc_avx512_inner,
                    ipred_dc_left_8bpc_inner,
                    topleft_off
                ),
                (BPC::BPC8, 4) => edge_kernel!(
                    ipred_dc_top_8bpc_avx512_inner,
                    ipred_dc_top_8bpc_inner,
                    topleft_off
                ),
                // DC-128 does not read the edge buffer at all.
                (BPC::BPC8, 5) => match avx512_token {
                    Some(t512) => ipred_dc_128_8bpc_avx512_inner(
                        t512,
                        dst_bytes,
                        dst_base_bytes,
                        byte_stride,
                        w,
                        h,
                    ),
                    None => {
                        ipred_dc_128_8bpc_inner(token, dst_bytes, dst_base_bytes, byte_stride, w, h)
                    }
                },
                (BPC::BPC8, 6) => {
                    angular_kernel_8bpc!(ipred_z1_8bpc_v4x_inner, ipred_z1_8bpc_inner)
                }
                (BPC::BPC8, 7) => angular_kernel_8bpc!(
                    ipred_z2_8bpc_v4x_inner,
                    ipred_z2_8bpc_inner,
                    max_width,
                    max_height
                ),
                (BPC::BPC8, 8) => {
                    angular_kernel_8bpc!(ipred_z3_8bpc_v4x_inner, ipred_z3_8bpc_inner)
                }
                (BPC::BPC8, 9) => edge_kernel!(
                    ipred_smooth_8bpc_avx512_inner,
                    ipred_smooth_8bpc_inner,
                    topleft_off
                ),
                (BPC::BPC8, 10) => edge_kernel!(
                    ipred_smooth_v_8bpc_avx512_inner,
                    ipred_smooth_v_8bpc_inner,
                    topleft_off
                ),
                (BPC::BPC8, 11) => edge_kernel!(
                    ipred_smooth_h_8bpc_avx512_inner,
                    ipred_smooth_h_8bpc_inner,
                    topleft_off
                ),
                (BPC::BPC8, 12) => edge_kernel!(
                    ipred_paeth_8bpc_avx512_inner,
                    ipred_paeth_8bpc_inner,
                    topleft_off
                ),
                // NOTE(review): unlike every other kernel, the filter kernels
                // receive a literal 0 in the slot right after `tl_bytes` and
                // `topleft_off` as the final argument - confirm this matches
                // their signatures.
                (BPC::BPC8, 13) => ipred_filter_8bpc_inner(
                    token,
                    dst_bytes,
                    dst_base_bytes,
                    byte_stride,
                    tl_bytes,
                    0,
                    w,
                    h,
                    angle as i32,
                    topleft_off,
                ),

                // ---- 16 bits per component ----
                // `topleft_off * 2` converts the pixel index into a byte
                // offset into `tl_bytes`.
                (BPC::BPC16, 0) => edge_kernel!(
                    ipred_dc_16bpc_avx512_inner,
                    ipred_dc_16bpc_inner,
                    topleft_off * 2
                ),
                (BPC::BPC16, 1) => edge_kernel!(
                    ipred_v_16bpc_avx512_inner,
                    ipred_v_16bpc_inner,
                    topleft_off * 2
                ),
                (BPC::BPC16, 2) => edge_kernel!(
                    ipred_h_16bpc_avx512_inner,
                    ipred_h_16bpc_inner,
                    topleft_off * 2
                ),
                (BPC::BPC16, 3) => edge_kernel!(
                    ipred_dc_left_16bpc_avx512_inner,
                    ipred_dc_left_16bpc_inner,
                    topleft_off * 2
                ),
                (BPC::BPC16, 4) => edge_kernel!(
                    ipred_dc_top_16bpc_avx512_inner,
                    ipred_dc_top_16bpc_inner,
                    topleft_off * 2
                ),
                (BPC::BPC16, 5) => match avx512_token {
                    Some(t512) => ipred_dc_128_16bpc_avx512_inner(
                        t512,
                        dst_bytes,
                        dst_base_bytes,
                        byte_stride,
                        w,
                        h,
                        bd_c as i32,
                    ),
                    None => ipred_dc_128_16bpc_inner(
                        token,
                        dst_bytes,
                        dst_base_bytes,
                        byte_stride,
                        w,
                        h,
                        bd_c as i32,
                    ),
                },
                // The 16bpc directional modes only have an AVX2 call here.
                (BPC::BPC16, 6) => {
                    let tl_off_bytes = topleft_off * 2;
                    ipred_z1_16bpc_inner(
                        token,
                        dst_bytes,
                        dst_base_bytes,
                        byte_stride,
                        tl_bytes,
                        tl_off_bytes,
                        w,
                        h,
                        angle as i32,
                        bd_c,
                    );
                }
                (BPC::BPC16, 7) => {
                    let tl_off_bytes = topleft_off * 2;
                    ipred_z2_16bpc_inner(
                        token,
                        dst_bytes,
                        dst_base_bytes,
                        byte_stride,
                        tl_bytes,
                        tl_off_bytes,
                        w,
                        h,
                        angle as i32,
                        max_width,
                        max_height,
                        bd_c,
                    );
                }
                (BPC::BPC16, 8) => {
                    let tl_off_bytes = topleft_off * 2;
                    ipred_z3_16bpc_inner(
                        token,
                        dst_bytes,
                        dst_base_bytes,
                        byte_stride,
                        tl_bytes,
                        tl_off_bytes,
                        w,
                        h,
                        angle as i32,
                        bd_c,
                    );
                }
                (BPC::BPC16, 9) => edge_kernel!(
                    ipred_smooth_16bpc_avx512_inner,
                    ipred_smooth_16bpc_inner,
                    topleft_off * 2
                ),
                (BPC::BPC16, 10) => edge_kernel!(
                    ipred_smooth_v_16bpc_avx512_inner,
                    ipred_smooth_v_16bpc_inner,
                    topleft_off * 2
                ),
                (BPC::BPC16, 11) => edge_kernel!(
                    ipred_smooth_h_16bpc_avx512_inner,
                    ipred_smooth_h_16bpc_inner,
                    topleft_off * 2
                ),
                (BPC::BPC16, 12) => edge_kernel!(
                    ipred_paeth_16bpc_avx512_inner,
                    ipred_paeth_16bpc_inner,
                    topleft_off * 2
                ),
                // NOTE(review): same argument layout question as the 8bpc
                // filter arm (literal 0 offset, `topleft_off` passed last).
                (BPC::BPC16, 13) => ipred_filter_16bpc_inner(
                    token,
                    dst_bytes,
                    dst_base_bytes,
                    byte_stride,
                    tl_bytes,
                    0,
                    w,
                    h,
                    angle as i32,
                    bd_c as i32,
                    topleft_off,
                ),
                // No SIMD kernel for this (bit depth, mode) pair.
                _ => return false,
            }
            true
        },
    )
}
/// Dispatches chroma-from-luma prediction to the AVX2 kernel matching `BD`.
///
/// Returns `false` without running a kernel when AVX2 cannot be summoned or
/// when `width` is not one of 4, 8, 16, 32 or 64. Otherwise the 8bpc or 16bpc
/// kernel is run over the destination bytes provided by
/// `with_pixel_guard_mut` and `true` is returned. The 16bpc kernel also
/// receives `bd.bitdepth_max()` as an `i32`.
///
/// # Panics
///
/// Panics if `ac` holds fewer than `width * height` entries.
#[cfg(target_arch = "x86_64")]
pub fn cfl_pred_dispatch<BD: BitDepth>(
    dst: PicOffset,
    width: c_int,
    height: c_int,
    dc: c_int,
    ac: &[i16],
    alpha: c_int,
    bd: BD,
) -> bool {
    use crate::include::common::bitdepth::AsPrimitive;
    use crate::include::common::bitdepth::BPC;

    let Some(token) = crate::src::cpu::summon_avx2() else {
        return false;
    };

    let (cols, rows) = (width as usize, height as usize);
    // Only these block widths are handled by the SIMD kernels.
    if !matches!(cols, 4 | 8 | 16 | 32 | 64) {
        return false;
    }

    // Kernels see exactly one block's worth of AC coefficients.
    let coeffs = &ac[..cols * rows];

    crate::include::dav1d::picture::with_pixel_guard_mut::<BD, _>(
        &dst,
        cols,
        rows,
        |dst_bytes, dst_base_bytes, byte_stride| match BD::BPC {
            BPC::BPC8 => cfl_pred_8bpc_inner(
                token,
                dst_bytes,
                dst_base_bytes,
                byte_stride,
                coeffs,
                cols,
                rows,
                dc,
                alpha,
            ),
            BPC::BPC16 => cfl_pred_16bpc_inner(
                token,
                dst_bytes,
                dst_base_bytes,
                byte_stride,
                coeffs,
                cols,
                rows,
                dc,
                alpha,
                bd.bitdepth_max().as_::<i32>(),
            ),
        },
    );
    true
}
/// Replicates the last computed AC value of one row into its padded columns.
///
/// `row_start` is the index of the row's first entry in `ac`. When
/// `active_w < width`, entries `active_w..width` of that row are overwritten
/// with the value at column `active_w - 1`; otherwise nothing happens.
#[cfg(target_arch = "x86_64")]
fn cfl_ac_8bpc_pad_row(ac: &mut [i16], row_start: usize, active_w: usize, width: usize) {
    if active_w < width {
        let pad = ac[row_start + active_w - 1];
        ac[row_start + active_w..row_start + width].fill(pad);
    }
}

/// Shared tail of the 8bpc CfL AC kernels: vertical padding, then mean removal.
///
/// 1. Rows `active_h..height` are filled with copies of row `active_h - 1`.
/// 2. The rounded average of all `width * height` entries is computed as
///    `(sum + (1 << log2sz >> 1)) >> log2sz`, where `log2sz` is the sum of the
///    trailing-zero counts of `width` and `height`.
/// 3. That average is subtracted from every entry.
///
/// NOTE(review): step 2 equals a true average only when `width` and `height`
/// are powers of two - presumably guaranteed by the caller; confirm.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn cfl_ac_8bpc_pad_rows_and_center(
    _token: Desktop64,
    ac: &mut [i16],
    width: usize,
    height: usize,
    active_h: usize,
) {
    if active_h < height {
        let src_row_start = (active_h - 1) * width;
        for y in active_h..height {
            let dst_off = y * width;
            ac.copy_within(src_row_start..src_row_start + width, dst_off);
        }
    }

    let n = width * height;
    let log2sz = (width.trailing_zeros() + height.trailing_zeros()) as i32;
    // Start from the rounding bias so the final shift rounds to nearest.
    let mut sum_i32 = 1i32 << log2sz >> 1;
    {
        let mut acc = _mm256_setzero_si256();
        let mut i = 0;
        while i + 16 <= n {
            let v = loadu_256!(<&[i16; 16]>::try_from(&ac[i..i + 16]).unwrap());
            // Widen both halves to i32 before accumulating to avoid i16 overflow.
            let lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v));
            let hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v));
            acc = _mm256_add_epi32(acc, lo);
            acc = _mm256_add_epi32(acc, hi);
            i += 16;
        }
        // Horizontal reduction: 8 lanes -> 4 -> 2 -> 1.
        let acc_lo = _mm256_castsi256_si128(acc);
        let acc_hi = _mm256_extracti128_si256::<1>(acc);
        let s128 = _mm_add_epi32(acc_lo, acc_hi);
        let s64 = _mm_add_epi32(s128, _mm_shuffle_epi32::<0b_01_00_11_10>(s128));
        let s32 = _mm_add_epi32(s64, _mm_shuffle_epi32::<0b_00_00_00_01>(s64));
        sum_i32 = sum_i32.wrapping_add(_mm_cvtsi128_si32(s32));
        // Scalar tail for element counts that are not a multiple of 16.
        while i < n {
            sum_i32 = sum_i32.wrapping_add(ac[i] as i32);
            i += 1;
        }
    }

    let mean = (sum_i32 >> log2sz) as i16;
    {
        let mean_v = _mm256_set1_epi16(mean);
        let mut i = 0;
        while i + 16 <= n {
            let v = loadu_256!(<&[i16; 16]>::try_from(&ac[i..i + 16]).unwrap());
            let r = _mm256_sub_epi16(v, mean_v);
            storeu_256!(<&mut [i16; 16]>::try_from(&mut ac[i..i + 16]).unwrap(), r);
            i += 16;
        }
        while i < n {
            ac[i] = ac[i].wrapping_sub(mean);
            i += 1;
        }
    }
}

/// CfL AC extraction for 4:2:0 8bpc luma (2x2 subsampling), AVX2.
///
/// For each of the `active_h` x `active_w` output positions, stores
/// `(a + b + c + d) << 1`, where `a..d` are the four luma bytes of the
/// matching 2x2 source block (rows `2y` and `2y + 1`, columns `2x` and
/// `2x + 1`). Source row `r` starts at byte `src_base + r * src_stride` of
/// `src_bytes`. Columns beyond `active_w` and rows beyond `active_h` are
/// padded by replication, after which the block mean is subtracted from all
/// `width * height` entries of `ac` (row stride `width`).
#[cfg(target_arch = "x86_64")]
#[arcane]
fn cfl_ac_420_8bpc_inner(
    _token: Desktop64,
    ac: &mut [i16],
    width: usize,
    height: usize,
    active_w: usize,
    active_h: usize,
    src_bytes: &[u8],
    src_base: usize,
    src_stride: isize,
) {
    // maddubs against all-ones sums horizontally adjacent byte pairs into i16.
    let ones = _mm256_set1_epi8(1);
    let ones128 = _mm_set1_epi8(1);
    for y in 0..active_h {
        let aci = y * width;
        let row1_off = (src_base as isize + (2 * y) as isize * src_stride) as usize;
        let row2_off = (src_base as isize + (2 * y + 1) as isize * src_stride) as usize;
        let mut x = 0;
        // 16 outputs per iteration (32 source bytes from each of two rows).
        while x + 16 <= active_w {
            let lx = 2 * x;
            let r1 = loadu_256!(
                <&[u8; 32]>::try_from(&src_bytes[row1_off + lx..row1_off + lx + 32]).unwrap()
            );
            let r2 = loadu_256!(
                <&[u8; 32]>::try_from(&src_bytes[row2_off + lx..row2_off + lx + 32]).unwrap()
            );
            let s1 = _mm256_maddubs_epi16(r1, ones);
            let s2 = _mm256_maddubs_epi16(r2, ones);
            let sum = _mm256_add_epi16(s1, s2);
            let shifted = _mm256_slli_epi16::<1>(sum);
            storeu_256!(
                <&mut [i16; 16]>::try_from(&mut ac[aci + x..aci + x + 16]).unwrap(),
                shifted
            );
            x += 16;
        }
        // 8 outputs per iteration.
        while x + 8 <= active_w {
            let lx = 2 * x;
            let r1 = loadu_128!(
                <&[u8; 16]>::try_from(&src_bytes[row1_off + lx..row1_off + lx + 16]).unwrap()
            );
            let r2 = loadu_128!(
                <&[u8; 16]>::try_from(&src_bytes[row2_off + lx..row2_off + lx + 16]).unwrap()
            );
            let s1 = _mm_maddubs_epi16(r1, ones128);
            let s2 = _mm_maddubs_epi16(r2, ones128);
            let sum = _mm_add_epi16(s1, s2);
            let shifted = _mm_slli_epi16::<1>(sum);
            storeu_128!(
                <&mut [i16; 8]>::try_from(&mut ac[aci + x..aci + x + 8]).unwrap(),
                shifted
            );
            x += 8;
        }
        // Scalar remainder.
        while x < active_w {
            let lx = 2 * x;
            let a = src_bytes[row1_off + lx] as i32;
            let b = src_bytes[row1_off + lx + 1] as i32;
            let c = src_bytes[row2_off + lx] as i32;
            let d = src_bytes[row2_off + lx + 1] as i32;
            ac[aci + x] = ((a + b + c + d) << 1) as i16;
            x += 1;
        }
        cfl_ac_8bpc_pad_row(ac, aci, active_w, width);
    }
    cfl_ac_8bpc_pad_rows_and_center(_token, ac, width, height, active_h);
}

/// CfL AC extraction for 4:2:2 8bpc luma (horizontal-only subsampling), AVX2.
///
/// For each of the `active_h` x `active_w` output positions, stores
/// `(a + b) << 2`, where `a` and `b` are the luma bytes at columns `2x` and
/// `2x + 1` of source row `y` (starting at byte `src_base + y * src_stride`).
/// Padding and mean removal are the same as in the 4:2:0 kernel.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn cfl_ac_422_8bpc_inner(
    _token: Desktop64,
    ac: &mut [i16],
    width: usize,
    height: usize,
    active_w: usize,
    active_h: usize,
    src_bytes: &[u8],
    src_base: usize,
    src_stride: isize,
) {
    // maddubs against all-ones sums horizontally adjacent byte pairs into i16.
    let ones = _mm256_set1_epi8(1);
    let ones128 = _mm_set1_epi8(1);
    for y in 0..active_h {
        let aci = y * width;
        let row_off = (src_base as isize + y as isize * src_stride) as usize;
        let mut x = 0;
        // 16 outputs per iteration (32 source bytes).
        while x + 16 <= active_w {
            let lx = 2 * x;
            let r1 = loadu_256!(
                <&[u8; 32]>::try_from(&src_bytes[row_off + lx..row_off + lx + 32]).unwrap()
            );
            let s1 = _mm256_maddubs_epi16(r1, ones);
            let shifted = _mm256_slli_epi16::<2>(s1);
            storeu_256!(
                <&mut [i16; 16]>::try_from(&mut ac[aci + x..aci + x + 16]).unwrap(),
                shifted
            );
            x += 16;
        }
        // 8 outputs per iteration.
        while x + 8 <= active_w {
            let lx = 2 * x;
            let r1 = loadu_128!(
                <&[u8; 16]>::try_from(&src_bytes[row_off + lx..row_off + lx + 16]).unwrap()
            );
            let s1 = _mm_maddubs_epi16(r1, ones128);
            let shifted = _mm_slli_epi16::<2>(s1);
            storeu_128!(
                <&mut [i16; 8]>::try_from(&mut ac[aci + x..aci + x + 8]).unwrap(),
                shifted
            );
            x += 8;
        }
        // Scalar remainder.
        while x < active_w {
            let lx = 2 * x;
            let a = src_bytes[row_off + lx] as i32;
            let b = src_bytes[row_off + lx + 1] as i32;
            ac[aci + x] = ((a + b) << 2) as i16;
            x += 1;
        }
        cfl_ac_8bpc_pad_row(ac, aci, active_w, width);
    }
    cfl_ac_8bpc_pad_rows_and_center(_token, ac, width, height, active_h);
}

/// CfL AC extraction for 4:4:4 8bpc luma (no subsampling), AVX2.
///
/// For each of the `active_h` x `active_w` output positions, stores the luma
/// byte at column `x` of source row `y` (starting at byte
/// `src_base + y * src_stride`) shifted left by 3. Padding and mean removal
/// are the same as in the 4:2:0 kernel.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn cfl_ac_444_8bpc_inner(
    _token: Desktop64,
    ac: &mut [i16],
    width: usize,
    height: usize,
    active_w: usize,
    active_h: usize,
    src_bytes: &[u8],
    src_base: usize,
    src_stride: isize,
) {
    for y in 0..active_h {
        let aci = y * width;
        let row_off = (src_base as isize + y as isize * src_stride) as usize;
        let mut x = 0;
        // 16 outputs per iteration: zero-extend 16 bytes to 16 x i16.
        while x + 16 <= active_w {
            let r1 = loadu_128!(
                <&[u8; 16]>::try_from(&src_bytes[row_off + x..row_off + x + 16]).unwrap()
            );
            let widened = _mm256_cvtepu8_epi16(r1);
            let shifted = _mm256_slli_epi16::<3>(widened);
            storeu_256!(
                <&mut [i16; 16]>::try_from(&mut ac[aci + x..aci + x + 16]).unwrap(),
                shifted
            );
            x += 16;
        }
        // 8 outputs per iteration: stage the 8 source bytes in a zeroed
        // 16-byte buffer so a full 128-bit load never reads past the row.
        while x + 8 <= active_w {
            let arr: &[u8; 8] = (&src_bytes[row_off + x..row_off + x + 8])
                .try_into()
                .unwrap();
            let mut buf = [0u8; 16];
            buf[..8].copy_from_slice(arr);
            let r1 = loadu_128!(&buf);
            let widened = _mm_cvtepu8_epi16(r1);
            let shifted = _mm_slli_epi16::<3>(widened);
            storeu_128!(
                <&mut [i16; 8]>::try_from(&mut ac[aci + x..aci + x + 8]).unwrap(),
                shifted
            );
            x += 8;
        }
        // Scalar remainder.
        while x < active_w {
            ac[aci + x] = (src_bytes[row_off + x] as i16) << 3;
            x += 1;
        }
        cfl_ac_8bpc_pad_row(ac, aci, active_w, width);
    }
    cfl_ac_8bpc_pad_rows_and_center(_token, ac, width, height, active_h);
}
/// Dispatches CfL AC extraction to the AVX2 8bpc kernel for the given
/// chroma subsampling.
///
/// Returns `false` (no work done) when `BD` is not 8bpc or AVX2 cannot be
/// summoned; otherwise fills the first `width * height` entries of `ac` and
/// returns `true`.
///
/// * `w_pad`, `h_pad` - padding in units of 4 output samples; the active
///   region is `width - 4 * w_pad` by `height - 4 * h_pad`.
/// * `is_ss_hor`, `is_ss_ver` - subsampling flags. Both set selects the 4:2:0
///   kernel, horizontal-only selects 4:2:2, and every other combination
///   selects 4:4:4.
///
/// When `tile_threading_active()` reports true the luma source is first copied
/// via `compact_read_per_row` and read from offset 0 with the stride that call
/// returns; otherwise it is read through `narrow_guard` using the picture's
/// own stride.
#[cfg(target_arch = "x86_64")]
pub fn cfl_ac_dispatch<BD: BitDepth>(
    ac: &mut [i16],
    y_src: PicOffset,
    w_pad: c_int,
    h_pad: c_int,
    width: usize,
    height: usize,
    is_ss_hor: bool,
    is_ss_ver: bool,
) -> bool {
    use crate::include::common::bitdepth::BPC;
    use crate::include::dav1d::picture::tile_threading_active;
    use crate::src::strided::Strided as _;
    use zerocopy::IntoBytes;

    if BD::BPC != BPC::BPC8 {
        return false;
    }
    let Some(token) = crate::src::cpu::summon_avx2() else {
        return false;
    };

    // Padding arrives in 4-sample units.
    let w_pad = (w_pad as usize) * 4;
    let h_pad = (h_pad as usize) * 4;
    debug_assert!(w_pad < width);
    debug_assert!(h_pad < height);
    let active_w = width - w_pad;
    let active_h = height - h_pad;

    // Luma footprint that backs the active chroma region.
    let ss_hor = is_ss_hor as usize;
    let ss_ver = is_ss_ver as usize;
    let src_w = active_w << ss_hor;
    let src_h = active_h << ss_ver;

    let ac_block = &mut ac[..width * height];

    // One place that picks the kernel; both source-access paths funnel here.
    let mut run_kernel = |luma: &[u8], luma_base: usize, luma_stride: isize| {
        match (is_ss_hor, is_ss_ver) {
            (true, true) => cfl_ac_420_8bpc_inner(
                token,
                ac_block,
                width,
                height,
                active_w,
                active_h,
                luma,
                luma_base,
                luma_stride,
            ),
            (true, false) => cfl_ac_422_8bpc_inner(
                token,
                ac_block,
                width,
                height,
                active_w,
                active_h,
                luma,
                luma_base,
                luma_stride,
            ),
            (false, _) => cfl_ac_444_8bpc_inner(
                token,
                ac_block,
                width,
                height,
                active_w,
                active_h,
                luma,
                luma_base,
                luma_stride,
            ),
        }
    };

    if tile_threading_active() {
        let (src_compact, src_stride) = y_src.compact_read_per_row::<BD>(src_w, src_h);
        run_kernel(&src_compact, 0, src_stride as isize);
    } else {
        let (src_guard, src_base) = y_src.narrow_guard::<BD>(src_w, src_h);
        let src_bytes: &[u8] = src_guard.as_bytes();
        run_kernel(src_bytes, src_base, y_src.data.stride());
    }
    true
}
/// Cross-checks the extended-AVX-512 ("v4x") directional 8bpc predictors
/// against the AVX2 kernels. Every test skips itself (with a note on stderr)
/// when the v4x token cannot be summoned.
#[cfg(all(test, target_arch = "x86_64"))]
mod v4x_dir_tests {
    use super::*;

    /// Row pitch, in bytes, of the scratch destination buffers.
    const STRIDE: usize = 64;

    /// Values OR-ed into the base angle (bits 10 and 9).
    const FLAG_SETS: [i32; 3] = [0, 1 << 10, (1 << 10) | (1 << 9)];

    /// `(width, height)` pairs exercised by every test.
    const DIMS: [(usize, usize); 15] = [
        (4, 4),
        (4, 8),
        (8, 4),
        (8, 8),
        (8, 16),
        (16, 8),
        (16, 16),
        (16, 32),
        (32, 16),
        (32, 32),
        (32, 64),
        (64, 32),
        (64, 64),
        (4, 16),
        (16, 4),
    ];

    /// Builds a 512-byte pseudo-random edge buffer (xorshift32, fixed seed)
    /// and the top-left offset (200) used by all tests.
    fn make_topleft() -> (Vec<u8>, usize) {
        let mut state: u32 = 0x1234_5678;
        let edge: Vec<u8> = (0..512usize)
            .map(|_| {
                state ^= state << 13;
                state ^= state >> 17;
                state ^= state << 5;
                (state >> 3) as u8
            })
            .collect();
        (edge, 200usize)
    }

    /// Fresh 64x64 destination pre-filled with 7.
    fn fresh_dst() -> Vec<u8> {
        vec![7u8; 64 * 64]
    }

    /// Reports on stderr and returns `true` when the v4x token is unavailable.
    fn v4x_missing(test_name: &str) -> bool {
        let missing = crate::src::cpu::summon_avx512x().is_none();
        if missing {
            eprintln!("{test_name}: X64V4xToken unavailable, skipping (AVX2 path used)");
        }
        missing
    }

    /// Runs Z1 with both kernels; returns `(avx2_output, v4x_output)`.
    fn run_z1(w: usize, h: usize, angle: i32) -> (Vec<u8>, Vec<u8>) {
        let (tl, tl_off) = make_topleft();
        let stride = STRIDE as isize;
        let (mut out_avx2, mut out_v4x) = (fresh_dst(), fresh_dst());
        let t3 = crate::src::cpu::summon_avx2().expect("avx2");
        let t4x = crate::src::cpu::summon_avx512x().expect("v4x");
        ipred_z1_8bpc_inner(t3, &mut out_avx2, 0, stride, &tl, tl_off, w, h, angle);
        ipred_z1_8bpc_v4x_inner(t4x, &mut out_v4x, 0, stride, &tl, tl_off, w, h, angle);
        (out_avx2, out_v4x)
    }

    /// Runs Z3 with both kernels; returns `(avx2_output, v4x_output)`.
    fn run_z3(w: usize, h: usize, angle: i32) -> (Vec<u8>, Vec<u8>) {
        let (tl, tl_off) = make_topleft();
        let stride = STRIDE as isize;
        let (mut out_avx2, mut out_v4x) = (fresh_dst(), fresh_dst());
        let t3 = crate::src::cpu::summon_avx2().expect("avx2");
        let t4x = crate::src::cpu::summon_avx512x().expect("v4x");
        ipred_z3_8bpc_inner(t3, &mut out_avx2, 0, stride, &tl, tl_off, w, h, angle);
        ipred_z3_8bpc_v4x_inner(t4x, &mut out_v4x, 0, stride, &tl, tl_off, w, h, angle);
        (out_avx2, out_v4x)
    }

    /// Runs Z2 with both kernels; returns `(avx2_output, v4x_output)`.
    fn run_z2(w: usize, h: usize, angle: i32, mw: i32, mh: i32) -> (Vec<u8>, Vec<u8>) {
        let (tl, tl_off) = make_topleft();
        let stride = STRIDE as isize;
        let (mut out_avx2, mut out_v4x) = (fresh_dst(), fresh_dst());
        let t3 = crate::src::cpu::summon_avx2().expect("avx2");
        let t4x = crate::src::cpu::summon_avx512x().expect("v4x");
        ipred_z2_8bpc_inner(t3, &mut out_avx2, 0, stride, &tl, tl_off, w, h, angle, mw, mh);
        ipred_z2_8bpc_v4x_inner(t4x, &mut out_v4x, 0, stride, &tl, tl_off, w, h, angle, mw, mh);
        (out_avx2, out_v4x)
    }

    /// Asserts that the top-left `w` x `h` region of `a` and `b` is identical.
    fn assert_block_eq(a: &[u8], b: &[u8], w: usize, h: usize, stride: usize, label: &str) {
        for y in 0..h {
            let row = y * stride;
            for x in 0..w {
                let off = row + x;
                assert_eq!(
                    a[off], b[off],
                    "{label}: mismatch at ({x},{y}) avx2={} v4x={}",
                    a[off], b[off]
                );
            }
        }
    }

    /// Sweeps every size / base angle / flag combination through `run` and
    /// compares the two outputs it returns.
    fn sweep(
        name: &str,
        base_angles: &[i32],
        run: impl Fn(usize, usize, i32) -> (Vec<u8>, Vec<u8>),
    ) {
        for &(w, h) in &DIMS {
            for &ba in base_angles {
                for &fl in &FLAG_SETS {
                    let angle = ba | fl;
                    let (a, b) = run(w, h, angle);
                    let label = format!("{name} w={w} h={h} angle={angle}");
                    assert_block_eq(&a, &b, w, h, STRIDE, &label);
                }
            }
        }
    }

    #[test]
    fn z1_v4x_matches_avx2() {
        if v4x_missing("z1_v4x_matches_avx2") {
            return;
        }
        sweep(
            "z1",
            &[3, 6, 14, 22, 30, 36, 44, 52, 60, 66, 74, 82, 86],
            run_z1,
        );
    }

    #[test]
    fn z3_v4x_matches_avx2() {
        if v4x_missing("z3_v4x_matches_avx2") {
            return;
        }
        sweep(
            "z3",
            &[184, 190, 198, 206, 214, 222, 230, 238, 246, 254, 262, 266],
            run_z3,
        );
    }

    #[test]
    fn z2_v4x_matches_avx2() {
        if v4x_missing("z2_v4x_matches_avx2") {
            return;
        }
        let base_angles = [94, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172, 176];
        let mut compared = 0usize;
        for &(w, h) in &DIMS {
            for &ba in &base_angles {
                for &fl in &FLAG_SETS {
                    let angle = ba | fl;
                    let (mw, mh) = (w as i32, h as i32);
                    // Configurations on which the AVX2 reference itself panics
                    // are skipped rather than compared.
                    let reference_survives = std::panic::catch_unwind(|| {
                        let (tl, tl_off) = make_topleft();
                        let mut d = fresh_dst();
                        let t3 = crate::src::cpu::summon_avx2().expect("avx2");
                        ipred_z2_8bpc_inner(
                            t3,
                            &mut d,
                            0,
                            STRIDE as isize,
                            &tl,
                            tl_off,
                            w,
                            h,
                            angle,
                            mw,
                            mh,
                        );
                    })
                    .is_ok();
                    if !reference_survives {
                        continue;
                    }
                    let (a, b) = run_z2(w, h, angle, mw, mh);
                    let label = format!("z2 w={w} h={h} angle={angle} mw={mw} mh={mh}");
                    assert_block_eq(&a, &b, w, h, STRIDE, &label);
                    compared += 1;
                }
            }
        }
        eprintln!("z2_v4x compared {compared} configs");
        // Guard against the skip logic silently hollowing out the test.
        assert!(
            compared >= 100,
            "z2 test compared too few configs: {compared}"
        );
    }
}