#![allow(deprecated)] #![cfg_attr(not(feature = "unchecked"), forbid(unsafe_code))]
#![cfg_attr(feature = "unchecked", deny(unsafe_code))]
#![allow(unused)]
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
use archmage::{Desktop64, Server64, SimdToken, arcane};
use std::ffi::c_int;
#[allow(non_camel_case_types)]
type ptrdiff_t = isize;
#[cfg(target_arch = "x86_64")]
use super::partial_simd;
#[cfg(target_arch = "x86_64")]
use crate::src::safe_simd::pixel_access::{
Flex, loadu_128, loadu_256, loadu_512, storeu_128, storeu_256, storeu_512,
};
use crate::include::common::bitdepth::DynPixel;
use crate::include::dav1d::picture::PicOffset;
use crate::src::ffi_safe::FFISafe;
/// Fills a `width` x `height` block of 8-bit pixels with the constant 128.
///
/// Row `y` starts at byte index `dst_base + y * stride` inside `dst`. Each row
/// is written with 32-byte vector stores first, then at most one 16-byte
/// store, and finally byte by byte for the remaining columns.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_128_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    width: usize,
    height: usize,
) {
    let mut out = dst.flex_mut();
    let mid: u8 = 128;
    let wide = _mm256_set1_epi8(mid as i8);
    let narrow = _mm256_castsi256_si128(wide);
    for line in 0..height {
        let start = (dst_base as isize + line as isize * stride) as usize;
        let row = &mut out[start..][..width];
        let mut col = 0;
        while col + 32 <= width {
            storeu_256!((&mut row[col..col + 32]), [u8; 32], wide);
            col += 32;
        }
        // Fewer than 32 columns remain here, so a 16-byte store fits at most once.
        if col + 16 <= width {
            storeu_128!(&mut row[col..col + 16], [u8; 16], narrow);
            col += 16;
        }
        // Scalar tail for the last (< 16) columns.
        for i in col..width {
            row[i] = mid;
        }
    }
}
/// C-ABI entry point for the AVX2 DC-128 predictor on 8-bit pixels.
///
/// Reinterprets `dst_ptr` as a byte buffer whose length comes from
/// `compute_ipred_buf_len(stride, width, height)` and forwards to
/// `ipred_dc_128_8bpc_inner` with a base offset of 0. All underscore-prefixed
/// parameters are ignored.
///
/// # Safety
///
/// * The CPU must support the features required by this function's
///   `target_feature` attribute and by the forged `Desktop64` token.
/// * `dst_ptr` must be valid for reads and writes of the number of bytes
///   returned by `compute_ipred_buf_len`, and must not be aliased during the
///   call.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_dc_128_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    _topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let w = width as usize;
    let h = height as usize;
    let row_stride = stride as isize;
    // SAFETY: the caller guarantees the required CPU features are present.
    // NOTE(review): confirm that `Desktop64` needs nothing beyond what callers check.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let len = compute_ipred_buf_len(row_stride, w, h);
    // SAFETY: the caller guarantees `dst_ptr` is valid for `len` bytes.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) };
    ipred_dc_128_8bpc_inner(token, dst_bytes, 0, row_stride, w, h);
}
/// Fills a `width` x `height` block of 8-bit pixels with the constant 128
/// (variant taking a `Server64` token and using 64-byte stores).
///
/// Row `y` starts at byte index `dst_base + y * stride` inside `dst`. Each row
/// is written with 64-byte stores, then at most one 32-byte and one 16-byte
/// store, and finally byte by byte for the remaining columns.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_128_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    width: usize,
    height: usize,
) {
    let mut out = dst.flex_mut();
    let mid: u8 = 128;
    let v512 = _mm512_set1_epi8(mid as i8);
    let v256 = _mm256_set1_epi8(mid as i8);
    let v128 = _mm256_castsi256_si128(v256);
    for line in 0..height {
        let start = (dst_base as isize + line as isize * stride) as usize;
        let row = &mut out[start..][..width];
        let mut col = 0;
        while col + 64 <= width {
            storeu_512!((&mut row[col..col + 64]), [u8; 64], v512);
            col += 64;
        }
        // Fewer than 64 columns remain: each narrower store fits at most once.
        if col + 32 <= width {
            storeu_256!((&mut row[col..col + 32]), [u8; 32], v256);
            col += 32;
        }
        if col + 16 <= width {
            storeu_128!(&mut row[col..col + 16], [u8; 16], v128);
            col += 16;
        }
        for i in col..width {
            row[i] = mid;
        }
    }
}
/// Vertical intra prediction for 8-bit pixels (`Server64` token variant):
/// every output row is a copy of the `width` pixels that follow
/// `topleft[tl_off]`.
///
/// Row `y` is written at byte index `dst_base + y * stride` of `dst`. Widths
/// 4, 8, 16, 32 and 64 load the source pixels once into a register and store
/// that register for each row; any other width does a slice copy per row.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_v_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut out = dst.flex_mut();
    let edge = topleft.flex();
    // The top neighbours begin one element after the top-left pixel.
    let src = tl_off + 1;
    match width {
        4 => {
            let top4 = _mm_cvtsi32_si128(i32::from_ne_bytes(
                edge[src..src + 4].try_into().unwrap(),
            ));
            // The four bytes are the same for every row, so extract them once.
            let bytes = _mm_cvtsi128_si32(top4).to_ne_bytes();
            for line in 0..height {
                let at = (dst_base as isize + line as isize * stride) as usize;
                out[at..at + 4].copy_from_slice(&bytes);
            }
        }
        8 => {
            let top8 = partial_simd::mm_loadl_epi64::<[u8; 8]>(
                (&edge[src..src + 8]).try_into().unwrap(),
            );
            for line in 0..height {
                let at = (dst_base as isize + line as isize * stride) as usize;
                partial_simd::mm_storel_epi64::<[u8; 8]>(
                    (&mut out[at..at + 8]).try_into().unwrap(),
                    top8,
                );
            }
        }
        16 => {
            let top16 = loadu_128!((&edge[src..src + 16]), [u8; 16]);
            for line in 0..height {
                let at = (dst_base as isize + line as isize * stride) as usize;
                storeu_128!((&mut out[at..at + 16]), [u8; 16], top16);
            }
        }
        32 => {
            let top32 = loadu_256!((&edge[src..src + 32]), [u8; 32]);
            for line in 0..height {
                let at = (dst_base as isize + line as isize * stride) as usize;
                storeu_256!((&mut out[at..at + 32]), [u8; 32], top32);
            }
        }
        64 => {
            let top64 = loadu_512!((&edge[src..src + 64]), [u8; 64]);
            for line in 0..height {
                let at = (dst_base as isize + line as isize * stride) as usize;
                storeu_512!((&mut out[at..at + 64]), [u8; 64], top64);
            }
        }
        _ => {
            // Generic width: plain per-row copy of the top neighbours.
            for line in 0..height {
                let at = (dst_base as isize + line as isize * stride) as usize;
                out[at..at + width].copy_from_slice(&edge[src..src + width]);
            }
        }
    }
}
/// Vertical intra prediction for 8-bit pixels (`Desktop64` token variant):
/// every output row is a copy of the `width` pixels that follow
/// `topleft[tl_off]`.
///
/// Row `y` is written at byte index `dst_base + y * stride` of `dst`. Widths
/// 4, 8, 16 and 32 load the source pixels once into one register; width 64
/// uses two 32-byte registers; any other width does a slice copy per row.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_v_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut out = dst.flex_mut();
    let edge = topleft.flex();
    // The top neighbours begin one element after the top-left pixel.
    let src = tl_off + 1;
    match width {
        4 => {
            let top4 = _mm_cvtsi32_si128(i32::from_ne_bytes(
                edge[src..src + 4].try_into().unwrap(),
            ));
            // The four bytes are the same for every row, so extract them once.
            let bytes = _mm_cvtsi128_si32(top4).to_ne_bytes();
            for line in 0..height {
                let at = (dst_base as isize + line as isize * stride) as usize;
                out[at..at + 4].copy_from_slice(&bytes);
            }
        }
        8 => {
            let top8 = partial_simd::mm_loadl_epi64::<[u8; 8]>(
                (&edge[src..src + 8]).try_into().unwrap(),
            );
            for line in 0..height {
                let at = (dst_base as isize + line as isize * stride) as usize;
                partial_simd::mm_storel_epi64::<[u8; 8]>(
                    (&mut out[at..at + 8]).try_into().unwrap(),
                    top8,
                );
            }
        }
        16 => {
            let top16 = loadu_128!((&edge[src..src + 16]), [u8; 16]);
            for line in 0..height {
                let at = (dst_base as isize + line as isize * stride) as usize;
                storeu_128!((&mut out[at..at + 16]), [u8; 16], top16);
            }
        }
        32 => {
            let top32 = loadu_256!((&edge[src..src + 32]), [u8; 32]);
            for line in 0..height {
                let at = (dst_base as isize + line as isize * stride) as usize;
                storeu_256!((&mut out[at..at + 32]), [u8; 32], top32);
            }
        }
        64 => {
            // Two 32-byte halves cover the 64-pixel row.
            let first_half = loadu_256!((&edge[src..src + 32]), [u8; 32]);
            let second_half = loadu_256!((&edge[src + 32..src + 64]), [u8; 32]);
            for line in 0..height {
                let at = (dst_base as isize + line as isize * stride) as usize;
                storeu_256!((&mut out[at..at + 32]), [u8; 32], first_half);
                storeu_256!((&mut out[at + 32..at + 64]), [u8; 32], second_half);
            }
        }
        _ => {
            // Generic width: plain per-row copy of the top neighbours.
            for line in 0..height {
                let at = (dst_base as isize + line as isize * stride) as usize;
                out[at..at + width].copy_from_slice(&edge[src..src + width]);
            }
        }
    }
}
/// C-ABI entry point for the AVX2 vertical predictor on 8-bit pixels.
///
/// Reinterprets `dst_ptr` as a byte buffer whose length comes from
/// `compute_ipred_buf_len(stride, width, height)`, obtains the neighbour
/// slice and offset from `compute_topleft_slice`, and forwards to
/// `ipred_v_8bpc_inner` with a base offset of 0. All underscore-prefixed
/// parameters are ignored.
///
/// # Safety
///
/// * The CPU must support the features required by this function's
///   `target_feature` attribute and by the forged `Desktop64` token.
/// * `dst_ptr` must be valid for reads and writes of the number of bytes
///   returned by `compute_ipred_buf_len`, and must not be aliased during the
///   call.
/// * NOTE(review): `topleft` is passed to `compute_topleft_slice`, whose
///   pointer-validity requirements are not visible here — confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_v_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let w = width as usize;
    let h = height as usize;
    let row_stride = stride as isize;
    // SAFETY: the caller guarantees the required CPU features are present.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let len = compute_ipred_buf_len(row_stride, w, h);
    // SAFETY: the caller guarantees `dst_ptr` is valid for `len` bytes.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) };
    let (edge, edge_off) = compute_topleft_slice(topleft as *const u8, w, h);
    ipred_v_8bpc_inner(token, dst_bytes, 0, row_stride, edge, edge_off, w, h);
}
/// Horizontal intra prediction for 8-bit pixels (`Desktop64` token variant):
/// row `y` is filled entirely with the pixel `topleft[tl_off - y - 1]`.
///
/// Row `y` starts at byte index `dst_base + y * stride` inside `dst`. Each row
/// is written with 32-byte stores, then at most one 16-byte store, then byte
/// by byte for the remaining columns.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_h_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut out = dst.flex_mut();
    let edge = topleft.flex();
    for line in 0..height {
        let start = (dst_base as isize + line as isize * stride) as usize;
        let row = &mut out[start..][..width];
        // Left neighbours are stored in reverse order below `tl_off`.
        let left = edge[tl_off - line - 1];
        let wide = _mm256_set1_epi8(left as i8);
        let narrow = _mm256_castsi256_si128(wide);
        let mut col = 0;
        while col + 32 <= width {
            storeu_256!((&mut row[col..col + 32]), [u8; 32], wide);
            col += 32;
        }
        // Fewer than 32 columns remain, so a 16-byte store fits at most once.
        if col + 16 <= width {
            storeu_128!(&mut row[col..col + 16], [u8; 16], narrow);
            col += 16;
        }
        for i in col..width {
            row[i] = left;
        }
    }
}
/// C-ABI entry point for the AVX2 horizontal predictor on 8-bit pixels.
///
/// Reinterprets `dst_ptr` as a byte buffer whose length comes from
/// `compute_ipred_buf_len(stride, width, height)`, obtains the neighbour
/// slice and offset from `compute_topleft_slice`, and forwards to
/// `ipred_h_8bpc_inner` with a base offset of 0. All underscore-prefixed
/// parameters are ignored.
///
/// # Safety
///
/// * The CPU must support the features required by this function's
///   `target_feature` attribute and by the forged `Desktop64` token.
/// * `dst_ptr` must be valid for reads and writes of the number of bytes
///   returned by `compute_ipred_buf_len`, and must not be aliased during the
///   call.
/// * NOTE(review): `topleft` is passed to `compute_topleft_slice`, whose
///   pointer-validity requirements are not visible here — confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_h_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let w = width as usize;
    let h = height as usize;
    let row_stride = stride as isize;
    // SAFETY: the caller guarantees the required CPU features are present.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let len = compute_ipred_buf_len(row_stride, w, h);
    // SAFETY: the caller guarantees `dst_ptr` is valid for `len` bytes.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) };
    let (edge, edge_off) = compute_topleft_slice(topleft as *const u8, w, h);
    ipred_h_8bpc_inner(token, dst_bytes, 0, row_stride, edge, edge_off, w, h);
}
/// Horizontal intra prediction for 8-bit pixels (`Server64` token variant):
/// row `y` is filled entirely with the pixel `topleft[tl_off - y - 1]`.
///
/// Row `y` starts at byte index `dst_base + y * stride` inside `dst`. Each row
/// is written with 64-byte stores, then at most one 32-byte and one 16-byte
/// store, then byte by byte for the remaining columns.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_h_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut out = dst.flex_mut();
    let edge = topleft.flex();
    for line in 0..height {
        let start = (dst_base as isize + line as isize * stride) as usize;
        let row = &mut out[start..][..width];
        // Left neighbours are stored in reverse order below `tl_off`.
        let left = edge[tl_off - line - 1];
        let v512 = _mm512_set1_epi8(left as i8);
        let v256 = _mm256_set1_epi8(left as i8);
        let v128 = _mm256_castsi256_si128(v256);
        let mut col = 0;
        while col + 64 <= width {
            storeu_512!((&mut row[col..col + 64]), [u8; 64], v512);
            col += 64;
        }
        // Fewer than 64 columns remain: each narrower store fits at most once.
        if col + 32 <= width {
            storeu_256!((&mut row[col..col + 32]), [u8; 32], v256);
            col += 32;
        }
        if col + 16 <= width {
            storeu_128!(&mut row[col..col + 16], [u8; 16], v128);
            col += 16;
        }
        for i in col..width {
            row[i] = left;
        }
    }
}
/// DC intra prediction for 8-bit pixels (`Server64` token variant).
///
/// Sums the `width` pixels after `topleft[tl_off]` and the `height` pixels
/// before it, computes the rounded mean `(sum + n / 2) / n` with
/// `n = width + height`, and fills the `width` x `height` block with it.
/// Row `y` starts at byte index `dst_base + y * stride` inside `dst`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut out = dst.flex_mut();
    let edge = topleft.flex();
    // Accumulate the top neighbours first, then continue with the left ones.
    let top_sum = (0..width).fold(0u32, |acc, x| acc + edge[tl_off + 1 + x] as u32);
    let edge_sum = (0..height).fold(top_sum, |acc, y| acc + edge[tl_off - y - 1] as u32);
    let count = (width + height) as u32;
    let dc = ((edge_sum + (count >> 1)) / count) as u8;
    let v512 = _mm512_set1_epi8(dc as i8);
    let v256 = _mm256_set1_epi8(dc as i8);
    let v128 = _mm256_castsi256_si128(v256);
    for line in 0..height {
        let start = (dst_base as isize + line as isize * stride) as usize;
        let row = &mut out[start..][..width];
        let mut col = 0;
        while col + 64 <= width {
            storeu_512!((&mut row[col..col + 64]), [u8; 64], v512);
            col += 64;
        }
        // Fewer than 64 columns remain: each narrower store fits at most once.
        if col + 32 <= width {
            storeu_256!((&mut row[col..col + 32]), [u8; 32], v256);
            col += 32;
        }
        if col + 16 <= width {
            storeu_128!(&mut row[col..col + 16], [u8; 16], v128);
            col += 16;
        }
        for i in col..width {
            row[i] = dc;
        }
    }
}
/// DC-top intra prediction for 8-bit pixels (`Server64` token variant).
///
/// Sums the `width` pixels after `topleft[tl_off]`, computes the rounded mean
/// `(sum + width / 2) / width`, and fills the `width` x `height` block with
/// it. Row `y` starts at byte index `dst_base + y * stride` inside `dst`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_top_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut out = dst.flex_mut();
    let edge = topleft.flex();
    let top_sum = (0..width).fold(0u32, |acc, x| acc + edge[tl_off + 1 + x] as u32);
    let count = width as u32;
    let dc = ((top_sum + (count >> 1)) / count) as u8;
    let v512 = _mm512_set1_epi8(dc as i8);
    let v256 = _mm256_set1_epi8(dc as i8);
    let v128 = _mm256_castsi256_si128(v256);
    for line in 0..height {
        let start = (dst_base as isize + line as isize * stride) as usize;
        let row = &mut out[start..][..width];
        let mut col = 0;
        while col + 64 <= width {
            storeu_512!((&mut row[col..col + 64]), [u8; 64], v512);
            col += 64;
        }
        // Fewer than 64 columns remain: each narrower store fits at most once.
        if col + 32 <= width {
            storeu_256!((&mut row[col..col + 32]), [u8; 32], v256);
            col += 32;
        }
        if col + 16 <= width {
            storeu_128!(&mut row[col..col + 16], [u8; 16], v128);
            col += 16;
        }
        for i in col..width {
            row[i] = dc;
        }
    }
}
/// DC-left intra prediction for 8-bit pixels (`Server64` token variant).
///
/// Sums the `height` pixels before `topleft[tl_off]`, computes the rounded
/// mean `(sum + height / 2) / height`, and fills the `width` x `height` block
/// with it. Row `y` starts at byte index `dst_base + y * stride` inside `dst`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_left_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut out = dst.flex_mut();
    let edge = topleft.flex();
    let left_sum = (0..height).fold(0u32, |acc, y| acc + edge[tl_off - y - 1] as u32);
    let count = height as u32;
    let dc = ((left_sum + (count >> 1)) / count) as u8;
    let v512 = _mm512_set1_epi8(dc as i8);
    let v256 = _mm256_set1_epi8(dc as i8);
    let v128 = _mm256_castsi256_si128(v256);
    for line in 0..height {
        let start = (dst_base as isize + line as isize * stride) as usize;
        let row = &mut out[start..][..width];
        let mut col = 0;
        while col + 64 <= width {
            storeu_512!((&mut row[col..col + 64]), [u8; 64], v512);
            col += 64;
        }
        // Fewer than 64 columns remain: each narrower store fits at most once.
        if col + 32 <= width {
            storeu_256!((&mut row[col..col + 32]), [u8; 32], v256);
            col += 32;
        }
        if col + 16 <= width {
            storeu_128!(&mut row[col..col + 16], [u8; 16], v128);
            col += 16;
        }
        for i in col..width {
            row[i] = dc;
        }
    }
}
/// DC intra prediction for 8-bit pixels (`Desktop64` token variant).
///
/// Sums the `width` pixels after `topleft[tl_off]` and the `height` pixels
/// before it, computes the rounded mean `(sum + n / 2) / n` with
/// `n = width + height`, and fills the `width` x `height` block with it.
/// Row `y` starts at byte index `dst_base + y * stride` inside `dst`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut out = dst.flex_mut();
    let edge = topleft.flex();
    // Accumulate the top neighbours first, then continue with the left ones.
    let top_sum = (0..width).fold(0u32, |acc, x| acc + edge[tl_off + 1 + x] as u32);
    let edge_sum = (0..height).fold(top_sum, |acc, y| acc + edge[tl_off - y - 1] as u32);
    let count = (width + height) as u32;
    let dc = ((edge_sum + (count >> 1)) / count) as u8;
    let wide = _mm256_set1_epi8(dc as i8);
    let narrow = _mm256_castsi256_si128(wide);
    for line in 0..height {
        let start = (dst_base as isize + line as isize * stride) as usize;
        let row = &mut out[start..][..width];
        let mut col = 0;
        while col + 32 <= width {
            storeu_256!((&mut row[col..col + 32]), [u8; 32], wide);
            col += 32;
        }
        // Fewer than 32 columns remain, so a 16-byte store fits at most once.
        if col + 16 <= width {
            storeu_128!(&mut row[col..col + 16], [u8; 16], narrow);
            col += 16;
        }
        for i in col..width {
            row[i] = dc;
        }
    }
}
/// C-ABI entry point for the AVX2 DC predictor on 8-bit pixels.
///
/// Reinterprets `dst_ptr` as a byte buffer whose length comes from
/// `compute_ipred_buf_len(stride, width, height)`, obtains the neighbour
/// slice and offset from `compute_topleft_slice`, and forwards to
/// `ipred_dc_8bpc_inner` with a base offset of 0. All underscore-prefixed
/// parameters are ignored.
///
/// # Safety
///
/// * The CPU must support the features required by this function's
///   `target_feature` attribute and by the forged `Desktop64` token.
/// * `dst_ptr` must be valid for reads and writes of the number of bytes
///   returned by `compute_ipred_buf_len`, and must not be aliased during the
///   call.
/// * NOTE(review): `topleft` is passed to `compute_topleft_slice`, whose
///   pointer-validity requirements are not visible here — confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_dc_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let w = width as usize;
    let h = height as usize;
    let row_stride = stride as isize;
    // SAFETY: the caller guarantees the required CPU features are present.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let len = compute_ipred_buf_len(row_stride, w, h);
    // SAFETY: the caller guarantees `dst_ptr` is valid for `len` bytes.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) };
    let (edge, edge_off) = compute_topleft_slice(topleft as *const u8, w, h);
    ipred_dc_8bpc_inner(token, dst_bytes, 0, row_stride, edge, edge_off, w, h);
}
/// DC-top intra prediction for 8-bit pixels (`Desktop64` token variant).
///
/// Sums the `width` pixels after `topleft[tl_off]`, computes the rounded mean
/// `(sum + width / 2) / width`, and fills the `width` x `height` block with
/// it. Row `y` starts at byte index `dst_base + y * stride` inside `dst`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_top_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut out = dst.flex_mut();
    let edge = topleft.flex();
    let top_sum = (0..width).fold(0u32, |acc, x| acc + edge[tl_off + 1 + x] as u32);
    let count = width as u32;
    let dc = ((top_sum + (count >> 1)) / count) as u8;
    let wide = _mm256_set1_epi8(dc as i8);
    let narrow = _mm256_castsi256_si128(wide);
    for line in 0..height {
        let start = (dst_base as isize + line as isize * stride) as usize;
        let row = &mut out[start..][..width];
        let mut col = 0;
        while col + 32 <= width {
            storeu_256!((&mut row[col..col + 32]), [u8; 32], wide);
            col += 32;
        }
        // Fewer than 32 columns remain, so a 16-byte store fits at most once.
        if col + 16 <= width {
            storeu_128!(&mut row[col..col + 16], [u8; 16], narrow);
            col += 16;
        }
        for i in col..width {
            row[i] = dc;
        }
    }
}
/// C-ABI entry point for the AVX2 DC-top predictor on 8-bit pixels.
///
/// Reinterprets `dst_ptr` as a byte buffer whose length comes from
/// `compute_ipred_buf_len(stride, width, height)`, obtains the neighbour
/// slice and offset from `compute_topleft_slice`, and forwards to
/// `ipred_dc_top_8bpc_inner` with a base offset of 0. All underscore-prefixed
/// parameters are ignored.
///
/// # Safety
///
/// * The CPU must support the features required by this function's
///   `target_feature` attribute and by the forged `Desktop64` token.
/// * `dst_ptr` must be valid for reads and writes of the number of bytes
///   returned by `compute_ipred_buf_len`, and must not be aliased during the
///   call.
/// * NOTE(review): `topleft` is passed to `compute_topleft_slice`, whose
///   pointer-validity requirements are not visible here — confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_dc_top_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let w = width as usize;
    let h = height as usize;
    let row_stride = stride as isize;
    // SAFETY: the caller guarantees the required CPU features are present.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let len = compute_ipred_buf_len(row_stride, w, h);
    // SAFETY: the caller guarantees `dst_ptr` is valid for `len` bytes.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) };
    let (edge, edge_off) = compute_topleft_slice(topleft as *const u8, w, h);
    ipred_dc_top_8bpc_inner(token, dst_bytes, 0, row_stride, edge, edge_off, w, h);
}
/// DC-left intra prediction for 8-bit pixels (`Desktop64` token variant).
///
/// Sums the `height` pixels before `topleft[tl_off]`, computes the rounded
/// mean `(sum + height / 2) / height`, and fills the `width` x `height` block
/// with it. Row `y` starts at byte index `dst_base + y * stride` inside `dst`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_left_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut out = dst.flex_mut();
    let edge = topleft.flex();
    let left_sum = (0..height).fold(0u32, |acc, y| acc + edge[tl_off - y - 1] as u32);
    let count = height as u32;
    let dc = ((left_sum + (count >> 1)) / count) as u8;
    let wide = _mm256_set1_epi8(dc as i8);
    let narrow = _mm256_castsi256_si128(wide);
    for line in 0..height {
        let start = (dst_base as isize + line as isize * stride) as usize;
        let row = &mut out[start..][..width];
        let mut col = 0;
        while col + 32 <= width {
            storeu_256!((&mut row[col..col + 32]), [u8; 32], wide);
            col += 32;
        }
        // Fewer than 32 columns remain, so a 16-byte store fits at most once.
        if col + 16 <= width {
            storeu_128!(&mut row[col..col + 16], [u8; 16], narrow);
            col += 16;
        }
        for i in col..width {
            row[i] = dc;
        }
    }
}
/// C-ABI entry point for the AVX2 DC-left predictor on 8-bit pixels.
///
/// Reinterprets `dst_ptr` as a byte buffer whose length comes from
/// `compute_ipred_buf_len(stride, width, height)`, obtains the neighbour
/// slice and offset from `compute_topleft_slice`, and forwards to
/// `ipred_dc_left_8bpc_inner` with a base offset of 0. All underscore-prefixed
/// parameters are ignored.
///
/// # Safety
///
/// * The CPU must support the features required by this function's
///   `target_feature` attribute and by the forged `Desktop64` token.
/// * `dst_ptr` must be valid for reads and writes of the number of bytes
///   returned by `compute_ipred_buf_len`, and must not be aliased during the
///   call.
/// * NOTE(review): `topleft` is passed to `compute_topleft_slice`, whose
///   pointer-validity requirements are not visible here — confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_dc_left_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let w = width as usize;
    let h = height as usize;
    let row_stride = stride as isize;
    // SAFETY: the caller guarantees the required CPU features are present.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let len = compute_ipred_buf_len(row_stride, w, h);
    // SAFETY: the caller guarantees `dst_ptr` is valid for `len` bytes.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) };
    let (edge, edge_off) = compute_topleft_slice(topleft as *const u8, w, h);
    ipred_dc_left_8bpc_inner(token, dst_bytes, 0, row_stride, edge, edge_off, w, h);
}
/// Paeth intra prediction for 8-bit pixels (`Server64` token variant).
///
/// For every output pixel the predictor forms `base = left + top - topleft`
/// and outputs whichever of `left`, `top`, `topleft` is closest to `base`,
/// preferring `left`, then `top`, on ties. `topleft` is `topleft[tl_off]`,
/// `top` is `topleft[tl_off + 1 + x]` and `left` is `topleft[tl_off - y - 1]`.
///
/// Row `y` is written at byte index `dst_base + y * stride` of `dst`. Columns
/// are processed 16 at a time as 32-bit lanes; leftover columns use the
/// scalar form of the same selection.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_paeth_8bpc_avx512_inner(
_token: Server64,
dst: &mut [u8],
dst_base: usize,
stride: isize,
topleft: &[u8],
tl_off: usize,
width: usize,
height: usize,
) {
let mut dst = dst.flex_mut();
let topleft = topleft.flex();
// The corner pixel is the same for the whole block.
let topleft_val = topleft[tl_off] as i32;
let topleft_vec = _mm512_set1_epi32(topleft_val);
for y in 0..height {
let row_off = (dst_base as isize + y as isize * stride) as usize;
// The left neighbour is constant along a row.
let left_val = topleft[tl_off - y - 1] as i32;
let left_vec = _mm512_set1_epi32(left_val);
let mut x = 0;
while x + 16 <= width {
// Widen 16 top pixels to 32-bit lanes so the differences cannot overflow.
let top_bytes = loadu_128!(&topleft[tl_off + 1 + x..tl_off + 1 + x + 16], [u8; 16]);
let top = _mm512_cvtepu8_epi32(top_bytes);
let base = _mm512_sub_epi32(_mm512_add_epi32(left_vec, top), topleft_vec);
let ldiff = _mm512_abs_epi32(_mm512_sub_epi32(left_vec, base));
let tdiff = _mm512_abs_epi32(_mm512_sub_epi32(top, base));
let tldiff = _mm512_abs_epi32(_mm512_sub_epi32(topleft_vec, base));
// Only a "greater than" mask compare is used; negating it yields "less or equal".
let ld_le_td = !_mm512_cmpgt_epi32_mask(ldiff, tdiff);
let ld_le_tld = !_mm512_cmpgt_epi32_mask(ldiff, tldiff);
let td_le_tld = !_mm512_cmpgt_epi32_mask(tdiff, tldiff);
let use_left = ld_le_td & ld_le_tld;
let use_top = !use_left & td_le_tld;
// mask_blend picks its second vector where the mask bit is set:
// inner blend chooses top over topleft, outer blend chooses left over that.
let result = _mm512_mask_blend_epi32(
use_left,
_mm512_mask_blend_epi32(use_top, topleft_vec, top),
left_vec,
);
// Clamp negatives to zero, then narrow to bytes with unsigned saturation.
let clamped = _mm512_max_epi32(result, _mm512_setzero_si512());
let result_u8: __m128i = _mm512_cvtusepi32_epi8(clamped);
storeu_128!(&mut dst[row_off + x..row_off + x + 16], [u8; 16], result_u8);
x += 16;
}
// Scalar tail for the remaining (< 16) columns; same selection rule.
let row = &mut dst[row_off..][..width];
while x < width {
let top_val = topleft[tl_off + 1 + x] as i32;
let base = left_val + top_val - topleft_val;
let ldiff = (left_val - base).abs();
let tdiff = (top_val - base).abs();
let tldiff = (topleft_val - base).abs();
let result = if ldiff <= tdiff && ldiff <= tldiff {
left_val
} else if tdiff <= tldiff {
top_val
} else {
topleft_val
};
row[x] = result as u8;
x += 1;
}
}
}
/// Two-dimensional "smooth" intra prediction for 8-bit pixels (`Server64`
/// token variant).
///
/// Each output pixel is
/// `(w_v * top + (256 - w_v) * bottom + w_h * left + (256 - w_h) * right + 256) >> 9`
/// where `w_v = dav1d_sm_weights[height + y]`, `w_h = dav1d_sm_weights[width + x]`,
/// `top = topleft[tl_off + 1 + x]`, `left = topleft[tl_off - y - 1]`,
/// `right = topleft[tl_off + width]` and `bottom = topleft[tl_off - height]`.
///
/// Row `y` is written at byte index `dst_base + y * stride` of `dst`. Columns
/// are processed 16 at a time as 32-bit lanes; leftover columns use the
/// scalar form of the same formula.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_8bpc_avx512_inner(
_token: Server64,
dst: &mut [u8],
dst_base: usize,
stride: isize,
topleft: &[u8],
tl_off: usize,
width: usize,
height: usize,
) {
let mut dst = dst.flex_mut();
let topleft = topleft.flex();
// The weight table is indexed by block dimension: `n` weights starting at index `n`.
let weights_hor = &dav1d_sm_weights[width..][..width];
let weights_ver = &dav1d_sm_weights[height..][..height];
let right_val = topleft[tl_off + width] as i32;
let bottom_val = topleft[tl_off - height] as i32;
let right_vec = _mm512_set1_epi32(right_val);
let bottom_vec = _mm512_set1_epi32(bottom_val);
// `rounding` is half of the 512 divisor implied by the `>> 9` below;
// `c256` is the weight total used to form the complementary weights.
let rounding = _mm512_set1_epi32(256);
let c256 = _mm512_set1_epi32(256);
let zero_512 = _mm512_setzero_si512();
for y in 0..height {
let row_off = (dst_base as isize + y as isize * stride) as usize;
// Left neighbour and vertical weight are constant along a row.
let left_val = topleft[tl_off - y - 1] as i32;
let left_vec = _mm512_set1_epi32(left_val);
let w_v = weights_ver[y] as i32;
let w_v_vec = _mm512_set1_epi32(w_v);
let w_v_inv = _mm512_sub_epi32(c256, w_v_vec);
let mut x = 0;
while x + 16 <= width {
// Widen 16 top pixels and 16 horizontal weights to 32-bit lanes.
let top_bytes = loadu_128!(&topleft[tl_off + 1 + x..tl_off + 1 + x + 16], [u8; 16]);
let top = _mm512_cvtepu8_epi32(top_bytes);
let wh_bytes = loadu_128!(&weights_hor[x..x + 16], [u8; 16]);
let w_h = _mm512_cvtepu8_epi32(wh_bytes);
let w_h_inv = _mm512_sub_epi32(c256, w_h);
// Vertical blend: top against bottom.
let vert = _mm512_add_epi32(
_mm512_mullo_epi32(w_v_vec, top),
_mm512_mullo_epi32(w_v_inv, bottom_vec),
);
// Horizontal blend: left against right.
let hor = _mm512_add_epi32(
_mm512_mullo_epi32(w_h, left_vec),
_mm512_mullo_epi32(w_h_inv, right_vec),
);
let pred = _mm512_add_epi32(vert, hor);
let result = _mm512_srai_epi32::<9>(_mm512_add_epi32(pred, rounding));
// Clamp negatives to zero, then narrow to bytes with unsigned saturation.
let clamped = _mm512_max_epi32(result, zero_512);
let result_u8: __m128i = _mm512_cvtusepi32_epi8(clamped);
storeu_128!(&mut dst[row_off + x..row_off + x + 16], [u8; 16], result_u8);
x += 16;
}
// Scalar tail for the remaining (< 16) columns; same formula.
let row = &mut dst[row_off..][..width];
while x < width {
let top_val = topleft[tl_off + 1 + x] as i32;
let w_h = weights_hor[x] as i32;
let pred =
w_v * top_val + (256 - w_v) * bottom_val + w_h * left_val + (256 - w_h) * right_val;
row[x] = ((pred + 256) >> 9) as u8;
x += 1;
}
}
}
/// Vertical "smooth" intra prediction for 8-bit pixels (`Server64` token
/// variant).
///
/// Each output pixel is `(w * top + (256 - w) * bottom + 128) >> 8`, where
/// `w = dav1d_sm_weights[height + y]`, `top = topleft[tl_off + 1 + x]` and
/// `bottom = topleft[tl_off - height]`.
///
/// Row `y` is written at byte index `dst_base + y * stride` of `dst`. Columns
/// are processed 16 at a time as 32-bit lanes; leftover columns use the
/// scalar form of the same formula.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_v_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut out = dst.flex_mut();
    let edge = topleft.flex();
    let ver_weights = &dav1d_sm_weights[height..][..height];
    let bottom = edge[tl_off - height] as i32;
    let bottom_v = _mm512_set1_epi32(bottom);
    let half = _mm512_set1_epi32(128);
    let full = _mm512_set1_epi32(256);
    let zeros = _mm512_setzero_si512();
    for line in 0..height {
        let start = (dst_base as isize + line as isize * stride) as usize;
        // The vertical weight is constant along a row.
        let weight = ver_weights[line] as i32;
        let weight_v = _mm512_set1_epi32(weight);
        let inv_weight_v = _mm512_sub_epi32(full, weight_v);
        let mut col = 0;
        while col + 16 <= width {
            let top16 = loadu_128!(&edge[tl_off + 1 + col..tl_off + 1 + col + 16], [u8; 16]);
            let top_v = _mm512_cvtepu8_epi32(top16);
            let blended = _mm512_add_epi32(
                _mm512_mullo_epi32(weight_v, top_v),
                _mm512_mullo_epi32(inv_weight_v, bottom_v),
            );
            let shifted = _mm512_srai_epi32::<8>(_mm512_add_epi32(blended, half));
            // Clamp negatives to zero, then narrow to bytes with unsigned saturation.
            let non_negative = _mm512_max_epi32(shifted, zeros);
            let bytes: __m128i = _mm512_cvtusepi32_epi8(non_negative);
            storeu_128!(&mut out[start + col..start + col + 16], [u8; 16], bytes);
            col += 16;
        }
        // Scalar tail for the remaining (< 16) columns.
        let row = &mut out[start..][..width];
        for i in col..width {
            let top = edge[tl_off + 1 + i] as i32;
            let pred = weight * top + (256 - weight) * bottom;
            row[i] = ((pred + 128) >> 8) as u8;
        }
    }
}
/// Horizontal "smooth" intra prediction for 8-bit pixels (`Server64` token
/// variant).
///
/// Each output pixel is `(w * left + (256 - w) * right + 128) >> 8`, where
/// `w = dav1d_sm_weights[width + x]`, `left = topleft[tl_off - y - 1]` and
/// `right = topleft[tl_off + width]`.
///
/// Row `y` is written at byte index `dst_base + y * stride` of `dst`. Columns
/// are processed 16 at a time as 32-bit lanes; leftover columns use the
/// scalar form of the same formula.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_h_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut out = dst.flex_mut();
    let edge = topleft.flex();
    let hor_weights = &dav1d_sm_weights[width..][..width];
    let right = edge[tl_off + width] as i32;
    let right_v = _mm512_set1_epi32(right);
    let half = _mm512_set1_epi32(128);
    let full = _mm512_set1_epi32(256);
    let zeros = _mm512_setzero_si512();
    for line in 0..height {
        let start = (dst_base as isize + line as isize * stride) as usize;
        // The left neighbour is constant along a row.
        let left = edge[tl_off - line - 1] as i32;
        let left_v = _mm512_set1_epi32(left);
        let mut col = 0;
        while col + 16 <= width {
            let weights16 = loadu_128!(&hor_weights[col..col + 16], [u8; 16]);
            let weight_v = _mm512_cvtepu8_epi32(weights16);
            let inv_weight_v = _mm512_sub_epi32(full, weight_v);
            let blended = _mm512_add_epi32(
                _mm512_mullo_epi32(weight_v, left_v),
                _mm512_mullo_epi32(inv_weight_v, right_v),
            );
            let shifted = _mm512_srai_epi32::<8>(_mm512_add_epi32(blended, half));
            // Clamp negatives to zero, then narrow to bytes with unsigned saturation.
            let non_negative = _mm512_max_epi32(shifted, zeros);
            let bytes: __m128i = _mm512_cvtusepi32_epi8(non_negative);
            storeu_128!(&mut out[start + col..start + col + 16], [u8; 16], bytes);
            col += 16;
        }
        // Scalar tail for the remaining (< 16) columns.
        let row = &mut out[start..][..width];
        for i in col..width {
            let weight = hor_weights[i] as i32;
            let pred = weight * left + (256 - weight) * right;
            row[i] = ((pred + 128) >> 8) as u8;
        }
    }
}
/// Paeth intra prediction for 8-bit pixels (`Desktop64` token variant).
///
/// For every output pixel the predictor forms `base = left + top - topleft`
/// and outputs whichever of `left`, `top`, `topleft` is closest to `base`,
/// preferring `left`, then `top`, on ties. `topleft` is `topleft[tl_off]`,
/// `top` is `topleft[tl_off + 1 + x]` and `left` is `topleft[tl_off - y - 1]`.
///
/// Row `y` is written at byte index `dst_base + y * stride` of `dst`. Columns
/// are processed 8 at a time as 32-bit lanes; leftover columns use the scalar
/// form of the same selection.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_paeth_8bpc_inner(
_token: Desktop64,
dst: &mut [u8],
dst_base: usize,
stride: isize,
topleft: &[u8],
tl_off: usize,
width: usize,
height: usize,
) {
let mut dst = dst.flex_mut();
let topleft = topleft.flex();
// The corner pixel is the same for the whole block.
let topleft_val = topleft[tl_off] as i32;
let topleft_vec = _mm256_set1_epi32(topleft_val);
for y in 0..height {
let row_off = (dst_base as isize + y as isize * stride) as usize;
// The left neighbour is constant along a row.
let left_val = topleft[tl_off - y - 1] as i32;
let left_vec = _mm256_set1_epi32(left_val);
let mut x = 0;
while x + 8 <= width {
// Load 8 top pixels and widen them to 32-bit lanes.
let top_bytes = partial_simd::mm_loadl_epi64::<[u8; 8]>(
(&topleft[tl_off + 1 + x..tl_off + 1 + x + 8])
.try_into()
.unwrap(),
);
let top_lo = _mm256_cvtepu8_epi32(top_bytes);
let base = _mm256_sub_epi32(_mm256_add_epi32(left_vec, top_lo), topleft_vec);
let ldiff = _mm256_abs_epi32(_mm256_sub_epi32(left_vec, base));
let tdiff = _mm256_abs_epi32(_mm256_sub_epi32(top_lo, base));
let tldiff = _mm256_abs_epi32(_mm256_sub_epi32(topleft_vec, base));
// "a <= b" is built as "(b > a) | (a == b)" from the available compares.
let ld_le_td = _mm256_or_si256(
_mm256_cmpgt_epi32(tdiff, ldiff),
_mm256_cmpeq_epi32(ldiff, tdiff),
);
let ld_le_tld = _mm256_or_si256(
_mm256_cmpgt_epi32(tldiff, ldiff),
_mm256_cmpeq_epi32(ldiff, tldiff),
);
let td_le_tld = _mm256_or_si256(
_mm256_cmpgt_epi32(tldiff, tdiff),
_mm256_cmpeq_epi32(tdiff, tldiff),
);
let use_left = _mm256_and_si256(ld_le_td, ld_le_tld);
// andnot computes (!use_left) & td_le_tld.
let use_top = _mm256_andnot_si256(use_left, td_le_tld);
// blendv picks its second operand where the mask is set:
// inner blend chooses top over topleft, outer blend chooses left over that.
let result = _mm256_blendv_epi8(
_mm256_blendv_epi8(topleft_vec, top_lo, use_top),
left_vec,
use_left,
);
// Within each 128-bit lane, gather the low byte of every 32-bit element
// into the lane's first four bytes (index -1 zeroes the byte).
let packed = _mm256_shuffle_epi8(
result,
_mm256_setr_epi8(
0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
),
);
// Join the four bytes of the low lane with the four of the high lane
// to form the 8 output pixels in the low 64 bits.
let lo = _mm256_castsi256_si128(packed);
let hi = _mm256_extracti128_si256::<1>(packed);
let combined = _mm_unpacklo_epi32(lo, hi);
partial_simd::mm_storel_epi64::<[u8; 8]>(
(&mut dst[row_off + x..row_off + x + 8]).try_into().unwrap(),
combined,
);
x += 8;
}
// Scalar tail for the remaining (< 8) columns; same selection rule.
let row = &mut dst[row_off..][..width];
while x < width {
let top_val = topleft[tl_off + 1 + x] as i32;
let base = left_val + top_val - topleft_val;
let ldiff = (left_val - base).abs();
let tdiff = (top_val - base).abs();
let tldiff = (topleft_val - base).abs();
let result = if ldiff <= tdiff && ldiff <= tldiff {
left_val
} else if tdiff <= tldiff {
top_val
} else {
topleft_val
};
row[x] = result as u8;
x += 1;
}
}
}
/// C-ABI entry point for the AVX2 Paeth predictor on 8-bit pixels.
///
/// Reinterprets `dst_ptr` as a byte buffer whose length comes from
/// `compute_ipred_buf_len(stride, width, height)`, obtains the neighbour
/// slice and offset from `compute_topleft_slice`, and forwards to
/// `ipred_paeth_8bpc_inner` with a base offset of 0. All underscore-prefixed
/// parameters are ignored.
///
/// # Safety
///
/// * The CPU must support the features required by this function's
///   `target_feature` attribute and by the forged `Desktop64` token.
/// * `dst_ptr` must be valid for reads and writes of the number of bytes
///   returned by `compute_ipred_buf_len`, and must not be aliased during the
///   call.
/// * NOTE(review): `topleft` is passed to `compute_topleft_slice`, whose
///   pointer-validity requirements are not visible here — confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_paeth_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let w = width as usize;
    let h = height as usize;
    let row_stride = stride as isize;
    // SAFETY: the caller guarantees the required CPU features are present.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let len = compute_ipred_buf_len(row_stride, w, h);
    // SAFETY: the caller guarantees `dst_ptr` is valid for `len` bytes.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) };
    let (edge, edge_off) = compute_topleft_slice(topleft as *const u8, w, h);
    ipred_paeth_8bpc_inner(token, dst_bytes, 0, row_stride, edge, edge_off, w, h);
}
use crate::src::tables::dav1d_sm_weights;
/// Two-dimensional "smooth" intra prediction for 8-bit pixels (`Desktop64`
/// token variant).
///
/// Each output pixel is
/// `(w_v * top + (256 - w_v) * bottom + w_h * left + (256 - w_h) * right + 256) >> 9`
/// where `w_v = dav1d_sm_weights[height + y]`, `w_h = dav1d_sm_weights[width + x]`,
/// `top = topleft[tl_off + 1 + x]`, `left = topleft[tl_off - y - 1]`,
/// `right = topleft[tl_off + width]` and `bottom = topleft[tl_off - height]`.
///
/// Row `y` is written at byte index `dst_base + y * stride` of `dst`. Columns
/// are processed 8 at a time as 32-bit lanes; leftover columns use the scalar
/// form of the same formula.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_8bpc_inner(
_token: Desktop64,
dst: &mut [u8],
dst_base: usize,
stride: isize,
topleft: &[u8],
tl_off: usize,
width: usize,
height: usize,
) {
let mut dst = dst.flex_mut();
let topleft = topleft.flex();
// The weight table is indexed by block dimension: `n` weights starting at index `n`.
let weights_hor = &dav1d_sm_weights[width..][..width];
let weights_ver = &dav1d_sm_weights[height..][..height];
let right_val = topleft[tl_off + width] as i32;
let bottom_val = topleft[tl_off - height] as i32;
let right_vec = _mm256_set1_epi32(right_val);
let bottom_vec = _mm256_set1_epi32(bottom_val);
// `rounding` is half of the 512 divisor implied by the `>> 9` below;
// `c256` is the weight total used to form the complementary weights.
let rounding = _mm256_set1_epi32(256);
let c256 = _mm256_set1_epi32(256);
for y in 0..height {
let row_off = (dst_base as isize + y as isize * stride) as usize;
// Left neighbour and vertical weight are constant along a row.
let left_val = topleft[tl_off - y - 1] as i32;
let left_vec = _mm256_set1_epi32(left_val);
let w_v = weights_ver[y] as i32;
let w_v_vec = _mm256_set1_epi32(w_v);
let w_v_inv = _mm256_sub_epi32(c256, w_v_vec);
let mut x = 0;
while x + 8 <= width {
// Load 8 top pixels and 8 horizontal weights, widened to 32-bit lanes.
let top_bytes = partial_simd::mm_loadl_epi64::<[u8; 8]>(
(&topleft[tl_off + 1 + x..tl_off + 1 + x + 8])
.try_into()
.unwrap(),
);
let top = _mm256_cvtepu8_epi32(top_bytes);
let w_h_bytes = partial_simd::mm_loadl_epi64::<[u8; 8]>(
(&weights_hor[x..x + 8]).try_into().unwrap(),
);
let w_h = _mm256_cvtepu8_epi32(w_h_bytes);
let w_h_inv = _mm256_sub_epi32(c256, w_h);
// Vertical blend: top against bottom.
let vert = _mm256_add_epi32(
_mm256_mullo_epi32(w_v_vec, top),
_mm256_mullo_epi32(w_v_inv, bottom_vec),
);
// Horizontal blend: left against right.
let hor = _mm256_add_epi32(
_mm256_mullo_epi32(w_h, left_vec),
_mm256_mullo_epi32(w_h_inv, right_vec),
);
let pred = _mm256_add_epi32(vert, hor);
let result = _mm256_srai_epi32::<9>(_mm256_add_epi32(pred, rounding));
// Within each 128-bit lane, gather the low byte of every 32-bit element
// into the lane's first four bytes (index -1 zeroes the byte).
let packed = _mm256_shuffle_epi8(
result,
_mm256_setr_epi8(
0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
),
);
// Join the four bytes of the low lane with the four of the high lane
// to form the 8 output pixels in the low 64 bits.
let lo = _mm256_castsi256_si128(packed);
let hi = _mm256_extracti128_si256::<1>(packed);
let combined = _mm_unpacklo_epi32(lo, hi);
partial_simd::mm_storel_epi64::<[u8; 8]>(
(&mut dst[row_off + x..row_off + x + 8]).try_into().unwrap(),
combined,
);
x += 8;
}
// Scalar tail for the remaining (< 8) columns; same formula.
let row = &mut dst[row_off..][..width];
while x < width {
let top_val = topleft[tl_off + 1 + x] as i32;
let w_h = weights_hor[x] as i32;
let pred =
w_v * top_val + (256 - w_v) * bottom_val + w_h * left_val + (256 - w_h) * right_val;
row[x] = ((pred + 256) >> 9) as u8;
x += 1;
}
}
}
/// C-ABI entry point for the AVX2 two-dimensional smooth predictor on 8-bit
/// pixels.
///
/// Reinterprets `dst_ptr` as a byte buffer whose length comes from
/// `compute_ipred_buf_len(stride, width, height)`, obtains the neighbour
/// slice and offset from `compute_topleft_slice`, and forwards to
/// `ipred_smooth_8bpc_inner` with a base offset of 0. All underscore-prefixed
/// parameters are ignored.
///
/// # Safety
///
/// * The CPU must support the features required by this function's
///   `target_feature` attribute and by the forged `Desktop64` token.
/// * `dst_ptr` must be valid for reads and writes of the number of bytes
///   returned by `compute_ipred_buf_len`, and must not be aliased during the
///   call.
/// * NOTE(review): `topleft` is passed to `compute_topleft_slice`, whose
///   pointer-validity requirements are not visible here — confirm.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_smooth_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let w = width as usize;
    let h = height as usize;
    let row_stride = stride as isize;
    // SAFETY: the caller guarantees the required CPU features are present.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let len = compute_ipred_buf_len(row_stride, w, h);
    // SAFETY: the caller guarantees `dst_ptr` is valid for `len` bytes.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, len) };
    let (edge, edge_off) = compute_topleft_slice(topleft as *const u8, w, h);
    ipred_smooth_8bpc_inner(token, dst_bytes, 0, row_stride, edge, edge_off, w, h);
}
/// Vertical "smooth" intra prediction for 8-bit pixels (`Desktop64` token
/// variant).
///
/// Each output pixel is `(w * top + (256 - w) * bottom + 128) >> 8`, where
/// `w = dav1d_sm_weights[height + y]`, `top = topleft[tl_off + 1 + x]` and
/// `bottom = topleft[tl_off - height]`.
///
/// Row `y` is written at byte index `dst_base + y * stride` of `dst`. Columns
/// are processed 8 at a time as 32-bit lanes; leftover columns use the scalar
/// form of the same formula.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_v_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut out = dst.flex_mut();
    let edge = topleft.flex();
    let ver_weights = &dav1d_sm_weights[height..][..height];
    let bottom = edge[tl_off - height] as i32;
    let bottom_v = _mm256_set1_epi32(bottom);
    let half = _mm256_set1_epi32(128);
    let full = _mm256_set1_epi32(256);
    // Within each 128-bit lane, gather the low byte of every 32-bit element
    // into the lane's first four bytes (index -1 zeroes the byte).
    let gather_low_bytes = _mm256_setr_epi8(
        0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1,
    );
    for line in 0..height {
        let start = (dst_base as isize + line as isize * stride) as usize;
        // The vertical weight is constant along a row.
        let weight = ver_weights[line] as i32;
        let weight_v = _mm256_set1_epi32(weight);
        let inv_weight_v = _mm256_sub_epi32(full, weight_v);
        let mut col = 0;
        while col + 8 <= width {
            let top8 = partial_simd::mm_loadl_epi64::<[u8; 8]>(
                (&edge[tl_off + 1 + col..tl_off + 1 + col + 8])
                    .try_into()
                    .unwrap(),
            );
            let top_v = _mm256_cvtepu8_epi32(top8);
            let blended = _mm256_add_epi32(
                _mm256_mullo_epi32(weight_v, top_v),
                _mm256_mullo_epi32(inv_weight_v, bottom_v),
            );
            let shifted = _mm256_srai_epi32::<8>(_mm256_add_epi32(blended, half));
            let lanes = _mm256_shuffle_epi8(shifted, gather_low_bytes);
            // Join the four bytes of each lane into the low 64 bits.
            let low = _mm256_castsi256_si128(lanes);
            let high = _mm256_extracti128_si256::<1>(lanes);
            let eight = _mm_unpacklo_epi32(low, high);
            partial_simd::mm_storel_epi64::<[u8; 8]>(
                (&mut out[start + col..start + col + 8]).try_into().unwrap(),
                eight,
            );
            col += 8;
        }
        // Scalar tail for the remaining (< 8) columns.
        let row = &mut out[start..][..width];
        for i in col..width {
            let top = edge[tl_off + 1 + i] as i32;
            let pred = weight * top + (256 - weight) * bottom;
            row[i] = ((pred + 128) >> 8) as u8;
        }
    }
}
/// C-ABI entry point for 8-bit vertical SMOOTH intra prediction (AVX2 build).
///
/// Wraps the destination block and the edge buffer in byte slices and hands
/// them to `ipred_smooth_v_8bpc_inner` with a destination base offset of 0.
/// Parameters whose names start with `_` are ignored.
///
/// # Safety
///
/// * `dst_ptr` must be valid for writes of
///   `compute_ipred_buf_len(stride, width, height)` bytes.
/// * `topleft` must satisfy the contract of `compute_topleft_slice` for the
///   given `width` and `height`.
/// * The function is compiled with AVX2 enabled, so the CPU must support it.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_smooth_v_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let w = width as usize;
    let h = height as usize;
    let pitch = stride as isize;
    // SAFETY: the token is forged, not detected; CPU support is the caller's
    // obligation (see `# Safety`).
    let simd = unsafe { Desktop64::forge_token_dangerously() };
    let out_len = compute_ipred_buf_len(pitch, w, h);
    // SAFETY: the caller guarantees `dst_ptr` is writable for `out_len` bytes.
    let out = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, out_len) };
    // SAFETY: the caller guarantees the edge buffer around `topleft` is readable.
    let (edge, edge_origin) = unsafe { compute_topleft_slice(topleft as *const u8, w, h) };
    ipred_smooth_v_8bpc_inner(simd, out, 0, pitch, edge, edge_origin, w, h);
}
/// Horizontal SMOOTH intra prediction for 8-bit pixels.
///
/// Each output pixel blends the left-edge pixel of its row
/// (`topleft[tl_off - y - 1]`) with the edge pixel `topleft[tl_off + width]`,
/// using the per-column weight `dav1d_sm_weights[width + x]`:
/// `(w * left + (256 - w) * right + 128) >> 8`.
///
/// * `dst`, `dst_base`, `stride` - output buffer, index of the first row and
///   signed distance between consecutive rows.
/// * `topleft`, `tl_off` - edge buffer and the index of the top-left pixel in it.
/// * `width`, `height` - block size in pixels.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_h_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let col_weights = &dav1d_sm_weights[width..][..width];
    let right = topleft[tl_off + width] as i32;
    let right_x8 = _mm256_set1_epi32(right);
    let round_x8 = _mm256_set1_epi32(128);
    let scale_x8 = _mm256_set1_epi32(256);
    // Picks the low byte of each 32-bit lane into the first four bytes of
    // every 128-bit half; the remaining bytes are zeroed.
    let low_bytes = _mm256_setr_epi8(
        0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    );
    for y in 0..height {
        let line = (dst_base as isize + y as isize * stride) as usize;
        let left = topleft[tl_off - y - 1] as i32;
        let left_x8 = _mm256_set1_epi32(left);

        // Eight pixels per iteration.
        let mut col = 0usize;
        while col + 8 <= width {
            let weights = _mm256_cvtepu8_epi32(partial_simd::mm_loadl_epi64::<[u8; 8]>(
                (&col_weights[col..col + 8]).try_into().unwrap(),
            ));
            let anti = _mm256_sub_epi32(scale_x8, weights);
            let blended = _mm256_add_epi32(
                _mm256_mullo_epi32(weights, left_x8),
                _mm256_mullo_epi32(anti, right_x8),
            );
            let shifted = _mm256_srai_epi32::<8>(_mm256_add_epi32(blended, round_x8));
            let bytes = _mm256_shuffle_epi8(shifted, low_bytes);
            let joined = _mm_unpacklo_epi32(
                _mm256_castsi256_si128(bytes),
                _mm256_extracti128_si256::<1>(bytes),
            );
            partial_simd::mm_storel_epi64::<[u8; 8]>(
                (&mut dst[line + col..line + col + 8]).try_into().unwrap(),
                joined,
            );
            col += 8;
        }

        // Scalar tail for widths that are not a multiple of eight.
        let out_row = &mut dst[line..][..width];
        for rest in col..width {
            let weight = col_weights[rest] as i32;
            let blended = weight * left + (256 - weight) * right;
            out_row[rest] = ((blended + 128) >> 8) as u8;
        }
    }
}
/// C-ABI entry point for 8-bit horizontal SMOOTH intra prediction (AVX2 build).
///
/// Wraps the destination block and the edge buffer in byte slices and hands
/// them to `ipred_smooth_h_8bpc_inner` with a destination base offset of 0.
/// Parameters whose names start with `_` are ignored.
///
/// # Safety
///
/// * `dst_ptr` must be valid for writes of
///   `compute_ipred_buf_len(stride, width, height)` bytes.
/// * `topleft` must satisfy the contract of `compute_topleft_slice` for the
///   given `width` and `height`.
/// * The function is compiled with AVX2 enabled, so the CPU must support it.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_smooth_h_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let w = width as usize;
    let h = height as usize;
    let pitch = stride as isize;
    // SAFETY: the token is forged, not detected; CPU support is the caller's
    // obligation (see `# Safety`).
    let simd = unsafe { Desktop64::forge_token_dangerously() };
    let out_len = compute_ipred_buf_len(pitch, w, h);
    // SAFETY: the caller guarantees `dst_ptr` is writable for `out_len` bytes.
    let out = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, out_len) };
    // SAFETY: the caller guarantees the edge buffer around `topleft` is readable.
    let (edge, edge_origin) = unsafe { compute_topleft_slice(topleft as *const u8, w, h) };
    ipred_smooth_h_8bpc_inner(simd, out, 0, pitch, edge, edge_origin, w, h);
}
use crate::src::tables::{FLT_INCR, dav1d_dr_intra_derivative, dav1d_filter_intra_taps, filter_fn};
/// Filter intra prediction for 8-bit pixels.
///
/// The block is produced in tiles of 4 columns x 2 rows. Every tile is
/// computed from seven neighbouring pixels `p0..p6` (one top-left, four above,
/// two to the left) passed to `filter_fn` together with a window of the tap
/// table selected by `filt_idx`. Tiles depend on previously written tiles, so
/// the traversal order (rows top to bottom, columns left to right) matters.
///
/// * `dst`, `dst_base`, `stride` - output buffer, index of the first row and
///   signed distance between consecutive rows.
/// * `topleft`, `tl_off`, `topleft_off` - edge buffer; edge pixels are
///   addressed at `tl_off + topleft_off + k`.
/// * `width`, `height` - block size in pixels; `width` is rounded down to a
///   multiple of 4 and rows are processed in pairs.
/// * `filt_idx` - index into `dav1d_filter_intra_taps` (masked with 511).
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_filter_8bpc_inner(
_token: Desktop64,
dst: &mut [u8],
dst_base: usize,
stride: isize,
topleft: &[u8],
tl_off: usize,
width: usize,
height: usize,
filt_idx: i32,
topleft_off: usize,
) {
let mut dst = dst.flex_mut();
let topleft = topleft.flex();
// Only whole 4-pixel tiles are produced; the index is masked to its low 9 bits.
let width = (width / 4) * 4; let filt_idx = (filt_idx as usize) & 511;
let filter = &dav1d_filter_intra_taps[filt_idx];
for y in (0..height).step_by(2) {
// Top-left neighbour of the first tile in this row pair comes from the edge buffer.
let cur_tl_off = topleft_off - y;
let mut tl_pixel = topleft[tl_off.wrapping_add(cur_tl_off)] as i32;
let row0_off = (dst_base as isize + y as isize * stride) as usize;
let row1_off = (dst_base as isize + (y + 1) as isize * stride) as usize;
for x in (0..width).step_by(4) {
// p1..p4: the four pixels above the tile - edge buffer for the first row
// pair, otherwise the previously written output row y - 1.
let (p1, p2, p3, p4) = if y == 0 {
let top_base = tl_off.wrapping_add(topleft_off + 1 + x);
(
topleft[top_base] as i32,
topleft[top_base + 1] as i32,
topleft[top_base + 2] as i32,
topleft[top_base + 3] as i32,
)
} else {
let top_row = (dst_base as isize + (y as isize - 1) * stride) as usize;
(
dst[top_row + x] as i32,
dst[top_row + x + 1] as i32,
dst[top_row + x + 2] as i32,
dst[top_row + x + 3] as i32,
)
};
// p5/p6: the two pixels left of the tile - edge buffer for the first
// column, otherwise the last column of the tile written just before.
let (p5, p6) = if x == 0 {
let left_base = tl_off.wrapping_add(cur_tl_off.wrapping_sub(1));
(
topleft[left_base] as i32,
topleft[left_base.wrapping_sub(1)] as i32,
)
} else {
(dst[row0_off + x - 1] as i32, dst[row1_off + x - 1] as i32)
};
let p0 = tl_pixel;
let p = [p0, p1, p2, p3, p4, p5, p6];
let flt = filter.as_slice();
// The tap window advances by FLT_INCR for each of the 8 output pixels:
// first the four pixels of row y, then the four pixels of row y + 1.
let mut flt_offset = 0;
for xx in 0..4 {
let acc = filter_fn(&flt[flt_offset..], p);
// Round, scale down by 16 and clamp to the 8-bit range.
let val = ((acc + 8) >> 4).clamp(0, 255) as u8;
dst[row0_off + x + xx] = val;
flt_offset += FLT_INCR;
}
for xx in 0..4 {
let acc = filter_fn(&flt[flt_offset..], p);
let val = ((acc + 8) >> 4).clamp(0, 255) as u8;
dst[row1_off + x + xx] = val;
flt_offset += FLT_INCR;
}
// The top-right neighbour of this tile is the top-left of the next one.
tl_pixel = p4;
}
}
}
/// C-ABI entry point for 8-bit filter intra prediction (AVX2 build).
///
/// Wraps the destination block and the edge buffer in byte slices and hands
/// them to `ipred_filter_8bpc_inner` with a destination base offset of 0,
/// forwarding `filt_idx` and `topleft_off` unchanged. Parameters whose names
/// start with `_` are ignored.
///
/// # Safety
///
/// * `dst_ptr` must be valid for writes of
///   `compute_ipred_buf_len(stride, width, height)` bytes.
/// * `topleft` must satisfy the contract of `compute_topleft_slice` for the
///   given `width` and `height`.
/// * The function is compiled with AVX2 enabled, so the CPU must support it.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_filter_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    filt_idx: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let w = width as usize;
    let h = height as usize;
    let pitch = stride as isize;
    // SAFETY: the token is forged, not detected; CPU support is the caller's
    // obligation (see `# Safety`).
    let simd = unsafe { Desktop64::forge_token_dangerously() };
    let out_len = compute_ipred_buf_len(pitch, w, h);
    // SAFETY: the caller guarantees `dst_ptr` is writable for `out_len` bytes.
    let out = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, out_len) };
    // SAFETY: the caller guarantees the edge buffer around `topleft` is readable.
    let (edge, edge_origin) = unsafe { compute_topleft_slice(topleft as *const u8, w, h) };
    ipred_filter_8bpc_inner(
        simd,
        out,
        0,
        pitch,
        edge,
        edge_origin,
        w,
        h,
        filt_idx as i32,
        topleft_off,
    );
}
/// Directional intra prediction "Z1" for 8-bit pixels: every output pixel is
/// interpolated between two neighbouring pixels of the top edge.
///
/// `angle` is a packed value: the low 9 bits are the prediction angle, bit 9
/// is the `is_sm` flag and any bit from 10 upwards enables intra edge
/// filtering / upsampling.
///
/// * `dst`, `dst_base`, `stride` - output buffer, index of the first row and
///   signed distance between consecutive rows.
/// * `topleft`, `tl_off` - edge buffer and the index of the top-left pixel;
///   the top edge starts at `tl_off + 1`.
/// * `width`, `height` - block size in pixels.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_z1_8bpc_inner(
_token: Desktop64,
dst: &mut [u8],
dst_base: usize,
stride: isize,
topleft: &[u8],
tl_off: usize,
width: usize,
height: usize,
angle: i32,
) {
let mut dst = dst.flex_mut();
let width_i = width as i32;
let height_i = height as i32;
// Unpack the flag bits, then keep only the angle itself.
let is_sm = (angle >> 9) & 1 != 0;
let enable_intra_edge_filter = (angle >> 10) != 0;
let angle = angle & 511;
// Horizontal step per row, in 1/64 pixel units.
let mut dx = dav1d_dr_intra_derivative[(angle >> 1) as usize] as i32;
// Upsampling is used only for small blocks and angles within 40 of 90.
let upsample_above = enable_intra_edge_filter
&& (90 - angle) < 40
&& ((width_i + height_i) as usize) <= (16 >> is_sm as usize);
// Scratch buffer for the upsampled / filtered top edge.
let mut top_out = [0u8; 64 + 64];
let (top, max_base_x, base_inc);
if upsample_above {
upsample_edge_8bpc(
&mut top_out,
width_i + height_i,
topleft,
tl_off + 1,
-1,
width_i + std::cmp::min(width_i, height_i),
);
// The edge now has twice the resolution, so steps double as well.
dx <<= 1;
top = top_out.as_slice();
max_base_x = (2 * (width_i + height_i) - 2) as usize;
base_inc = 2usize;
} else {
let filter_strength = if enable_intra_edge_filter {
get_filter_strength_simple(width_i + height_i, 90 - angle, is_sm)
} else {
0
};
if filter_strength != 0 {
filter_edge_8bpc(
&mut top_out,
width_i + height_i,
0,
width_i + height_i,
topleft,
tl_off + 1,
-1,
width_i + std::cmp::min(width_i, height_i),
filter_strength,
);
top = top_out.as_slice();
max_base_x = (width_i + height_i - 1) as usize;
} else {
// No preprocessing: read straight from the caller's edge buffer.
top = &topleft[tl_off + 1..];
max_base_x = width + std::cmp::min(width, height) - 1;
}
base_inc = 1;
};
let top = top.flex();
let rounding = _mm256_set1_epi16(32);
for y in 0..height_i {
// Position along the top edge for this row; the low 6 bits (bit 0
// dropped) are the blend fraction, the rest is the integer index.
let xpos = (y + 1) * dx;
let frac = (xpos & 0x3e) as i16;
let inv_frac = (64 - frac) as i16;
let frac_vec = _mm256_set1_epi16(frac);
let inv_frac_vec = _mm256_set1_epi16(inv_frac);
let row_off = (dst_base as isize + y as isize * stride) as usize;
let base0 = (xpos >> 6) as usize;
let mut x = 0usize;
// Vector path (non-upsampled edge only): 16 pixels at a time while all
// lanes stay strictly below `max_base_x`.
if base_inc == 1 {
while x + 16 <= width && base0 + x + 16 < max_base_x {
let base = base0 + x;
let t0 = loadu_128!((&top[base..base + 16]), [u8; 16]);
let t1 = loadu_128!((&top[base + 1..base + 17]), [u8; 16]);
let t0_w = _mm256_cvtepu8_epi16(t0);
let t1_w = _mm256_cvtepu8_epi16(t1);
let prod0 = _mm256_mullo_epi16(t0_w, inv_frac_vec);
let prod1 = _mm256_mullo_epi16(t1_w, frac_vec);
let sum = _mm256_add_epi16(_mm256_add_epi16(prod0, prod1), rounding);
let result = _mm256_srai_epi16::<6>(sum);
// Pack to bytes per 128-bit half, then join the two halves' low 8 bytes.
let packed = _mm256_packus_epi16(result, result);
let lo = _mm256_castsi256_si128(packed);
let hi = _mm256_extracti128_si256::<1>(packed);
let combined = _mm_unpacklo_epi64(lo, hi);
storeu_128!(
(&mut dst[row_off + x..row_off + x + 16]),
[u8; 16],
combined
);
x += 16;
}
}
// Scalar path: remaining pixels, and everything when upsampling.
while x < width {
let base = base0 + base_inc * x;
if base < max_base_x {
let t0 = top[base] as i32;
let t1 = top[base + 1] as i32;
let v = t0 * inv_frac as i32 + t1 * frac as i32;
dst[row_off + x] = ((v + 32) >> 6) as u8;
} else {
// Past the end of the edge: replicate its last pixel for the rest of the row.
let fill_val = top[max_base_x];
for xx in x..width {
dst[row_off + xx] = fill_val;
}
break;
}
x += 1;
}
}
}
/// C-ABI entry point for 8-bit directional (Z1) intra prediction (AVX2 build).
///
/// Wraps the destination block and the edge buffer in byte slices and hands
/// them to `ipred_z1_8bpc_inner` with a destination base offset of 0,
/// forwarding the packed `angle`. Parameters whose names start with `_` are
/// ignored.
///
/// # Safety
///
/// * `dst_ptr` must be valid for writes of
///   `compute_ipred_buf_len(stride, width, height)` bytes.
/// * `topleft` must satisfy the contract of `compute_topleft_slice` for the
///   given `width` and `height`.
/// * The function is compiled with AVX2 enabled, so the CPU must support it.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_z1_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let w = width as usize;
    let h = height as usize;
    let pitch = stride as isize;
    // SAFETY: the token is forged, not detected; CPU support is the caller's
    // obligation (see `# Safety`).
    let simd = unsafe { Desktop64::forge_token_dangerously() };
    let out_len = compute_ipred_buf_len(pitch, w, h);
    // SAFETY: the caller guarantees `dst_ptr` is writable for `out_len` bytes.
    let out = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, out_len) };
    // SAFETY: the caller guarantees the edge buffer around `topleft` is readable.
    let (edge, edge_origin) = unsafe { compute_topleft_slice(topleft as *const u8, w, h) };
    ipred_z1_8bpc_inner(simd, out, 0, pitch, edge, edge_origin, w, h, angle as i32);
}
/// Selects the intra edge filter strength (0 = no filtering, otherwise 1..=3).
///
/// * `wh` - sum of block width and height.
/// * `angle` - angular distance passed by the caller (e.g. `90 - angle`).
/// * `is_sm` - selects the alternative threshold table.
///
/// Blocks are bucketed by `wh` (`<= 8`, `<= 16`, `<= 24`, then `<= 32` for the
/// non-`is_sm` table, and everything larger); inside a bucket the first
/// threshold that `angle` reaches decides the strength.
#[inline]
fn get_filter_strength_simple(wh: i32, angle: i32, is_sm: bool) -> i32 {
    if is_sm {
        if wh <= 8 {
            if angle >= 64 {
                2
            } else if angle >= 40 {
                1
            } else {
                0
            }
        } else if wh <= 16 {
            if angle >= 48 {
                2
            } else if angle >= 20 {
                1
            } else {
                0
            }
        } else if wh <= 24 {
            if angle >= 4 {
                3
            } else {
                0
            }
        } else {
            3
        }
    } else if wh <= 8 {
        if angle >= 56 {
            1
        } else {
            0
        }
    } else if wh <= 16 {
        if angle >= 40 {
            1
        } else {
            0
        }
    } else if wh <= 24 {
        if angle >= 32 {
            3
        } else if angle >= 16 {
            2
        } else if angle >= 8 {
            1
        } else {
            0
        }
    } else if wh <= 32 {
        if angle >= 32 {
            3
        } else if angle >= 4 {
            2
        } else {
            1
        }
    } else {
        3
    }
}
/// Smooths an edge of 8-bit pixels with a 5-tap kernel.
///
/// Writes `sz` pixels to `out`. Output positions in `[lim_from, lim_to)`
/// (intersected with `[0, sz)`) are filtered; every other position is a plain
/// copy of the input. Input pixel `k` is read from `inp[in_off + k]`, with `k`
/// clamped to `[from, to - 1]` first.
///
/// * `strength` - 1, 2 or 3; selects the kernel row and is only consulted when
///   at least one pixel is filtered.
///
/// Panics on out-of-range indices, or if `from > to - 1`.
fn filter_edge_8bpc(
    out: &mut [u8],
    sz: i32,
    lim_from: i32,
    lim_to: i32,
    inp: &[u8],
    in_off: usize,
    from: i32,
    to: i32,
    strength: i32,
) {
    static KERNEL: [[u8; 5]; 3] = [[0, 4, 8, 4, 0], [0, 5, 6, 5, 0], [2, 4, 4, 4, 2]];

    // Fetches input pixel `k` after clamping it into the readable range.
    let sample = |k: i32| inp[in_off.wrapping_add_signed(k.clamp(from, to - 1) as isize)];

    // Unfiltered head: [0, min(sz, lim_from)).
    let head_end = std::cmp::min(sz, lim_from);
    for pos in 0..head_end {
        out[pos as usize] = sample(pos);
    }

    // Filtered middle: continues from the head up to min(lim_to, sz).
    let mid_start = std::cmp::max(head_end, 0);
    let mid_end = std::cmp::min(lim_to, sz);
    for pos in mid_start..mid_end {
        let acc: i32 = (0..5i32)
            .map(|tap| {
                sample(pos - 2 + tap) as i32 * KERNEL[(strength - 1) as usize][tap as usize] as i32
            })
            .sum();
        out[pos as usize] = ((acc + 8) >> 4) as u8;
    }

    // Unfiltered tail: whatever is left up to `sz`.
    let tail_start = std::cmp::max(mid_start, mid_end);
    for pos in tail_start..sz {
        out[pos as usize] = sample(pos);
    }
}
/// Doubles the resolution of an edge of 8-bit pixels.
///
/// Produces `2 * hsz - 1` output pixels: even positions are copies of the
/// input, odd positions are interpolated from four neighbouring inputs with
/// the kernel `[-1, 9, 9, -1] / 16` and clamped to `0..=255`. Input pixel `k`
/// is read from `inp[in_off + k]`, with `k` clamped to `[from, to - 1]` first.
///
/// Panics on out-of-range indices, or if `from > to - 1`.
fn upsample_edge_8bpc(out: &mut [u8], hsz: i32, inp: &[u8], in_off: usize, from: i32, to: i32) {
    let kernel: [i8; 4] = [-1, 9, 9, -1];

    // Fetches input pixel `k` after clamping it into the readable range.
    let sample = |k: i32| inp[in_off.wrapping_add_signed(k.clamp(from, to - 1) as isize)];

    let last = hsz - 1;
    for pos in 0..last {
        out[(pos * 2) as usize] = sample(pos);
        let acc: i32 = (0..4i32)
            .map(|tap| sample(pos + tap - 1) as i32 * kernel[tap as usize] as i32)
            .sum();
        out[(pos * 2 + 1) as usize] = ((acc + 8) >> 4).clamp(0, 255) as u8;
    }
    // The final input pixel has no right-hand neighbour to interpolate with.
    out[(last * 2) as usize] = sample(last);
}
/// Directional intra prediction "Z2" for 8-bit pixels: each output pixel is
/// interpolated either from the top edge or from the left edge, depending on
/// where its projection lands.
///
/// `angle` is a packed value: the low 9 bits are the prediction angle, bit 9
/// is the `is_sm` flag and any bit from 10 upwards enables intra edge
/// filtering / upsampling.
///
/// * `dst`, `dst_base`, `stride` - output buffer, index of the first row and
///   signed distance between consecutive rows.
/// * `topleft`, `tl_off` - edge buffer and the index of the top-left pixel;
///   the top edge follows it, the left edge precedes it.
/// * `width`, `height` - block size in pixels.
/// * `max_width`, `max_height` - limits forwarded to `filter_edge_8bpc` that
///   bound which edge pixels are filtered.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_z2_8bpc_inner(
_token: Desktop64,
dst: &mut [u8],
dst_base: usize,
stride: isize,
topleft: &[u8],
tl_off: usize,
width: usize,
height: usize,
angle: i32,
max_width: i32,
max_height: i32,
) {
let mut dst = dst.flex_mut();
let width_i = width as i32;
let height_i = height as i32;
// Unpack the flag bits, then keep only the angle itself.
let is_sm = (angle >> 9) & 1 != 0;
let enable_intra_edge_filter = (angle >> 10) != 0;
let angle = angle & 511;
// Steps along the left (dy) and top (dx) edges, in 1/64 pixel units.
let mut dy = dav1d_dr_intra_derivative[((angle - 90) >> 1) as usize] as i32;
let mut dx = dav1d_dr_intra_derivative[((180 - angle) >> 1) as usize] as i32;
let upsample_left = enable_intra_edge_filter
&& (180 - angle) < 40
&& (width_i + height_i) <= (16 >> is_sm as i32);
let upsample_above = enable_intra_edge_filter
&& (angle - 90) < 40
&& (width_i + height_i) <= (16 >> is_sm as i32);
// Local copy of both edges: the top-left pixel sits at index `edge_tl`,
// the top edge grows upwards from it and the left edge downwards.
let mut edge = [0u8; 64 + 64 + 1];
let edge_tl = 64usize;
// --- top edge: upsample, filter, or plain copy ---
if upsample_above {
upsample_edge_8bpc(
&mut edge[edge_tl..],
width_i + 1,
topleft,
tl_off,
0,
width_i + 1,
);
dx <<= 1;
} else {
let filter_strength = if enable_intra_edge_filter {
get_filter_strength_simple(width_i + height_i, angle - 90, is_sm)
} else {
0
};
if filter_strength != 0 {
filter_edge_8bpc(
&mut edge[edge_tl + 1..],
width_i,
0,
max_width,
topleft,
tl_off + 1,
-1,
width_i,
filter_strength,
);
} else {
edge[edge_tl + 1..edge_tl + 1 + width]
.copy_from_slice(&topleft[tl_off + 1..tl_off + 1 + width]);
}
}
// --- left edge: upsample, filter, or plain copy ---
if upsample_left {
upsample_edge_8bpc(
&mut edge[edge_tl - height * 2..],
height_i + 1,
topleft,
tl_off.wrapping_sub(height),
0,
height_i + 1,
);
dy <<= 1;
} else {
let filter_strength = if enable_intra_edge_filter {
get_filter_strength_simple(width_i + height_i, 180 - angle, is_sm)
} else {
0
};
if filter_strength != 0 {
filter_edge_8bpc(
&mut edge[edge_tl - height..],
height_i,
height_i - max_height,
height_i,
topleft,
tl_off.wrapping_sub(height),
0,
height_i + 1,
filter_strength,
);
} else {
edge[edge_tl - height..edge_tl].copy_from_slice(&topleft[tl_off - height..tl_off]);
}
}
// The corner pixel is always taken unmodified from the input.
edge[edge_tl] = topleft[tl_off];
let edge = edge.as_slice().flex();
let base_inc_x = 1 + upsample_above as usize;
// Index of the first left-edge pixel (one or two below the corner).
let left = edge_tl - (1 + upsample_left as usize);
let rounding = _mm256_set1_epi16(32);
for y in 0..height_i {
// Projection of the row start onto the top edge; a negative integer
// part means the leading pixels of the row project onto the left edge.
let xpos = ((1 + upsample_above as i32) << 6) - dx * (y + 1);
let base_x0 = xpos >> 6;
let frac_x = (xpos & 0x3e) as i16;
let inv_frac_x = (64 - frac_x) as i16;
let row_off = (dst_base as isize + y as isize * stride) as usize;
// Number of leading pixels whose top-edge index is still negative.
let left_count = if base_x0 >= 0 {
0usize
} else {
let needed = (-base_x0) as usize;
needed.div_ceil(base_inc_x).min(width)
};
let mut x = 0usize;
// Part 1: pixels predicted from the left edge (scalar).
while x < left_count {
let ypos = (y << (6 + upsample_left as i32)) - dy * (x as i32 + 1);
let base_y = ypos >> 6;
let frac_y = ypos & 0x3e;
let inv_frac_y = 64 - frac_y;
let l0_idx = left.wrapping_add_signed(-base_y as isize);
let l1_idx = left.wrapping_add_signed(-(base_y + 1) as isize);
let l0 = edge[l0_idx] as i32;
let l1 = edge[l1_idx] as i32;
let v = l0 * inv_frac_y + l1 * frac_y;
dst[row_off + x] = ((v + 32) >> 6) as u8;
x += 1;
}
// Part 2: pixels predicted from the top edge, 16 at a time (only when
// the top edge is not upsampled).
if base_inc_x == 1 {
while x + 16 <= width {
let base_x = (base_x0 + x as i32) as usize;
let idx = edge_tl + base_x;
// Stop before the 17-byte read window would leave the edge array.
if idx + 17 > edge.len() {
break;
}
let t0 = loadu_128!((&edge[idx..idx + 16]), [u8; 16]);
let t1 = loadu_128!((&edge[idx + 1..idx + 17]), [u8; 16]);
let t0_w = _mm256_cvtepu8_epi16(t0);
let t1_w = _mm256_cvtepu8_epi16(t1);
let frac_vec = _mm256_set1_epi16(frac_x);
let inv_frac_vec = _mm256_set1_epi16(inv_frac_x);
let prod0 = _mm256_mullo_epi16(t0_w, inv_frac_vec);
let prod1 = _mm256_mullo_epi16(t1_w, frac_vec);
let sum = _mm256_add_epi16(_mm256_add_epi16(prod0, prod1), rounding);
let result = _mm256_srai_epi16::<6>(sum);
// Pack to bytes per 128-bit half, then join the two halves' low 8 bytes.
let packed = _mm256_packus_epi16(result, result);
let lo = _mm256_castsi256_si128(packed);
let hi = _mm256_extracti128_si256::<1>(packed);
let combined = _mm_unpacklo_epi64(lo, hi);
storeu_128!(
(&mut dst[row_off + x..row_off + x + 16]),
[u8; 16],
combined
);
x += 16;
}
}
// Part 3: remaining top-edge pixels (scalar).
while x < width {
let base_x = (base_x0 + (base_inc_x * x) as i32) as usize;
let idx = edge_tl + base_x;
// Pixels whose source would fall outside the edge array are left untouched.
if idx + 2 > edge.len() {
break;
}
let t0 = edge[idx] as i32;
let t1 = edge[idx + 1] as i32;
let v = t0 * inv_frac_x as i32 + t1 * frac_x as i32;
dst[row_off + x] = ((v + 32) >> 6) as u8;
x += 1;
}
}
}
/// C-ABI entry point for 8-bit directional (Z2) intra prediction (AVX2 build).
///
/// Wraps the destination block and the edge buffer in byte slices and hands
/// them to `ipred_z2_8bpc_inner` with a destination base offset of 0,
/// forwarding the packed `angle`, `max_width` and `max_height`. Parameters
/// whose names start with `_` are ignored.
///
/// # Safety
///
/// * `dst_ptr` must be valid for writes of
///   `compute_ipred_buf_len(stride, width, height)` bytes.
/// * `topleft` must satisfy the contract of `compute_topleft_slice` for the
///   given `width` and `height`.
/// * The function is compiled with AVX2 enabled, so the CPU must support it.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_z2_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    angle: c_int,
    max_width: c_int,
    max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let w = width as usize;
    let h = height as usize;
    let pitch = stride as isize;
    // SAFETY: the token is forged, not detected; CPU support is the caller's
    // obligation (see `# Safety`).
    let simd = unsafe { Desktop64::forge_token_dangerously() };
    let out_len = compute_ipred_buf_len(pitch, w, h);
    // SAFETY: the caller guarantees `dst_ptr` is writable for `out_len` bytes.
    let out = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, out_len) };
    // SAFETY: the caller guarantees the edge buffer around `topleft` is readable.
    let (edge, edge_origin) = unsafe { compute_topleft_slice(topleft as *const u8, w, h) };
    ipred_z2_8bpc_inner(
        simd,
        out,
        0,
        pitch,
        edge,
        edge_origin,
        w,
        h,
        angle as i32,
        max_width as i32,
        max_height as i32,
    );
}
/// Directional intra prediction "Z3" for 8-bit pixels: every output pixel is
/// interpolated between two neighbouring pixels of the left edge. The block is
/// produced column by column with scalar code.
///
/// `angle` is a packed value: the low 9 bits are the prediction angle, bit 9
/// is the `is_sm` flag and any bit from 10 upwards enables intra edge
/// filtering / upsampling.
///
/// * `dst`, `dst_base`, `stride` - output buffer, index of the first row and
///   signed distance between consecutive rows.
/// * `topleft`, `tl_off` - edge buffer and the index of the top-left pixel;
///   the left edge precedes it. The code evaluates
///   `tl_off - (width + height)`, so `tl_off` must be at least
///   `width + height` whenever upsampling or filtering is selected.
/// * `width`, `height` - block size in pixels.
///
/// NOTE(review): `compute_topleft_slice` returns `tl_off = height + 2`, which
/// is smaller than `width + height` for any `width > 2` - confirm that
/// callers going through it cannot reach the upsample/filter branches, or
/// widen the slice.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_z3_8bpc_inner(
_token: Desktop64,
dst: &mut [u8],
dst_base: usize,
stride: isize,
topleft: &[u8],
tl_off: usize,
width: usize,
height: usize,
angle: i32,
) {
let mut dst = dst.flex_mut();
let width_i = width as i32;
let height_i = height as i32;
// Unpack the flag bits, then keep only the angle itself.
let is_sm = (angle >> 9) & 1 != 0;
let enable_intra_edge_filter = (angle >> 10) != 0;
let angle = angle & 511;
// Vertical step per column, in 1/64 pixel units.
let mut dy = dav1d_dr_intra_derivative[((270 - angle) >> 1) as usize] as usize;
let upsample_left = enable_intra_edge_filter
&& (angle - 180) < 40
&& (width_i + height_i) <= (16 >> is_sm as i32);
// Scratch buffer for the upsampled / filtered left edge.
let mut left_out = [0u8; 64 + 64];
// `left[left_off]` is the first left-edge pixel; larger `base` values walk
// towards lower indices.
let (left, left_off, max_base_y, base_inc);
if upsample_left {
upsample_edge_8bpc(
&mut left_out,
width_i + height_i,
topleft,
tl_off - (width + height),
std::cmp::max(width_i - height_i, 0),
width_i + height_i + 1,
);
left_off = (2 * (width_i + height_i) - 2) as usize;
max_base_y = left_off;
// The edge now has twice the resolution, so steps double as well.
dy <<= 1;
base_inc = 2usize;
left = left_out.as_slice();
} else {
let filter_strength = if enable_intra_edge_filter {
get_filter_strength_simple(width_i + height_i, angle - 180, is_sm)
} else {
0
};
if filter_strength != 0 {
filter_edge_8bpc(
&mut left_out,
width_i + height_i,
0,
width_i + height_i,
topleft,
tl_off - (width + height),
std::cmp::max(width_i - height_i, 0),
width_i + height_i + 1,
filter_strength,
);
left_off = (width_i + height_i - 1) as usize;
max_base_y = left_off;
left = left_out.as_slice();
} else {
// No preprocessing: read straight from the caller's edge buffer.
left = topleft;
left_off = tl_off - 1;
max_base_y = height + std::cmp::min(width, height) - 1;
}
base_inc = 1;
};
let left = left.flex();
for x in 0..width {
// Position along the left edge for this column; the low 6 bits (bit 0
// dropped) are the blend fraction, the rest is the integer index.
let ypos = dy * (x + 1);
let frac = (ypos & 0x3e) as i32;
let inv_frac = 64 - frac;
for y in 0..height_i {
let base = (ypos >> 6) + base_inc * y as usize;
if base < max_base_y {
let l0 = left[left_off - base] as i32;
let l1 = left[left_off - base - 1] as i32;
let v = l0 * inv_frac + l1 * frac;
let pixel_off = (dst_base as isize + y as isize * stride) as usize + x;
dst[pixel_off] = ((v + 32) >> 6) as u8;
} else {
// Past the end of the edge: replicate its last pixel down the rest of the column.
let fill_val = left[left_off - max_base_y];
for yy in y..height_i {
let pixel_off = (dst_base as isize + yy as isize * stride) as usize + x;
dst[pixel_off] = fill_val;
}
break;
}
}
}
}
/// C-ABI entry point for 8-bit directional (Z3) intra prediction (AVX2 build).
///
/// Wraps the destination block and the edge buffer in byte slices and hands
/// them to `ipred_z3_8bpc_inner` with a destination base offset of 0,
/// forwarding the packed `angle`. Parameters whose names start with `_` are
/// ignored.
///
/// # Safety
///
/// * `dst_ptr` must be valid for writes of
///   `compute_ipred_buf_len(stride, width, height)` bytes.
/// * `topleft` must satisfy the contract of `compute_topleft_slice` for the
///   given `width` and `height`.
/// * The function is compiled with AVX2 enabled, so the CPU must support it.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_z3_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let w = width as usize;
    let h = height as usize;
    let pitch = stride as isize;
    // SAFETY: the token is forged, not detected; CPU support is the caller's
    // obligation (see `# Safety`).
    let simd = unsafe { Desktop64::forge_token_dangerously() };
    let out_len = compute_ipred_buf_len(pitch, w, h);
    // SAFETY: the caller guarantees `dst_ptr` is writable for `out_len` bytes.
    let out = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, out_len) };
    // SAFETY: the caller guarantees the edge buffer around `topleft` is readable.
    let (edge, edge_origin) = unsafe { compute_topleft_slice(topleft as *const u8, w, h) };
    ipred_z3_8bpc_inner(simd, out, 0, pitch, edge, edge_origin, w, h, angle as i32);
}
/// Number of bytes spanned by a block of `height` rows of `width` bytes whose
/// rows are `|stride|` bytes apart: all rows but the last contribute a full
/// stride, the last one only `width`. With `height == 0` the result is `width`.
#[cfg(target_arch = "x86_64")]
fn compute_ipred_buf_len(stride: isize, width: usize, height: usize) -> usize {
    let pitch = stride.unsigned_abs();
    let rows_before_last = height.saturating_sub(1);
    rows_before_last * pitch + width
}
/// Builds a byte slice around the top-left edge pointer `tl_ptr`.
///
/// The returned slice starts `height + 2` bytes before `tl_ptr` and is
/// `2 * height + width + 4` bytes long; the second tuple element
/// (`height + 2`) is the index of `*tl_ptr` inside that slice. `width` and
/// `height` are used as byte counts, so 16-bit callers pass doubled values.
///
/// # Safety
///
/// The whole range `tl_ptr - (height + 2) .. tl_ptr + (width + height + 2)`
/// must lie inside one allocation, be readable, and stay valid and unmodified
/// for the caller-chosen lifetime `'a`.
///
/// NOTE(review): code that indexes further left than `height + 2` bytes from
/// the returned offset (e.g. `tl_off - (width + height)` in
/// `ipred_z3_8bpc_inner`) falls outside this slice - confirm the intended reach.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
unsafe fn compute_topleft_slice<'a>(
tl_ptr: *const u8,
width: usize,
height: usize,
) -> (&'a [u8], usize) {
// Bytes exposed before the top-left pixel.
let neg_reach = height + 2;
// Bytes exposed from the top-left pixel onwards.
let pos_reach = width + height + 2;
let total = neg_reach + pos_reach;
// SAFETY: per the contract above, `neg_reach` bytes before `tl_ptr` belong
// to the same allocation.
let base = unsafe { tl_ptr.sub(neg_reach) };
(
// SAFETY: per the contract above, `total` bytes starting at `base` are readable.
unsafe { std::slice::from_raw_parts(base, total) },
neg_reach,
)
}
/// DC_128 intra prediction for 16-bit samples: fills the block with the
/// constant `(bitdepth_max + 1) / 2`.
///
/// `dst` is addressed in bytes and every sample is written as a native-endian
/// `u16`; `dst_base` is the byte offset of the first row and `stride` the
/// signed byte distance between rows. `width` and `height` are in pixels.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_128_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    width: usize,
    height: usize,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let grey = ((bitdepth_max + 1) / 2) as u16;
    let grey_x16 = _mm256_set1_epi16(grey as i16);
    let row_bytes = width * 2;
    for y in 0..height {
        let row_start = (dst_base as isize + y as isize * stride) as usize;
        // `done` counts the bytes of this row that are already written.
        let mut done = 0usize;
        // 16 pixels (32 bytes) per store.
        while done + 32 <= row_bytes {
            let at = row_start + done;
            storeu_256!((&mut dst[at..at + 32]), [u8; 32], grey_x16);
            done += 32;
        }
        // 8 pixels (16 bytes) per store.
        while done + 16 <= row_bytes {
            let at = row_start + done;
            storeu_128!(
                (&mut dst[at..at + 16]),
                [u8; 16],
                _mm256_castsi256_si128(grey_x16)
            );
            done += 16;
        }
        // Remaining pixels one at a time.
        while done < row_bytes {
            let at = row_start + done;
            dst[at..at + 2].copy_from_slice(&grey.to_ne_bytes());
            done += 2;
        }
    }
}
/// C-ABI entry point for 16-bit DC_128 intra prediction (AVX2 build).
///
/// Wraps the destination block in a byte slice (two bytes per pixel) and hands
/// it to `ipred_dc_128_16bpc_inner` with a destination base offset of 0.
/// Parameters whose names start with `_` are ignored.
///
/// # Safety
///
/// * `dst_ptr` must be valid for writes of
///   `compute_ipred_buf_len(stride, 2 * width, height)` bytes.
/// * The function is compiled with AVX2 enabled, so the CPU must support it.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_dc_128_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    _topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let w = width as usize;
    let h = height as usize;
    let pitch = stride as isize;
    // SAFETY: the token is forged, not detected; CPU support is the caller's
    // obligation (see `# Safety`).
    let simd = unsafe { Desktop64::forge_token_dangerously() };
    // Rows are `2 * w` bytes wide because every pixel occupies two bytes.
    let out_len = compute_ipred_buf_len(pitch, w * 2, h);
    // SAFETY: the caller guarantees `dst_ptr` is writable for `out_len` bytes.
    let out = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, out_len) };
    ipred_dc_128_16bpc_inner(simd, out, 0, pitch, w, h, bitdepth_max as i32);
}
/// Vertical intra prediction for 16-bit samples: copies the row above the
/// block into every output row.
///
/// Both buffers are addressed in bytes (two per pixel). The top row starts at
/// `topleft[tl_off + 2]`; `dst_base` is the byte offset of the first output
/// row and `stride` the signed byte distance between rows. `width` and
/// `height` are in pixels.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_v_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let above = tl_off + 2;
    // Pixel counts covered by 256-bit and 128-bit copies respectively.
    let end16 = width - width % 16;
    let end8 = width - width % 8;
    for y in 0..height {
        let row_start = (dst_base as isize + y as isize * stride) as usize;
        for px in (0..end16).step_by(16) {
            let from = above + px * 2;
            let chunk = loadu_256!((&topleft[from..from + 32]), [u8; 32]);
            let to = row_start + px * 2;
            storeu_256!((&mut dst[to..to + 32]), [u8; 32], chunk);
        }
        for px in (end16..end8).step_by(8) {
            let from = above + px * 2;
            let chunk = loadu_128!((&topleft[from..from + 16]), [u8; 16]);
            let to = row_start + px * 2;
            storeu_128!((&mut dst[to..to + 16]), [u8; 16], chunk);
        }
        for px in end8..width {
            let from = above + px * 2;
            let to = row_start + px * 2;
            dst[to..to + 2].copy_from_slice(&topleft[from..from + 2]);
        }
    }
}
/// C-ABI entry point for 16-bit vertical intra prediction (AVX2 build).
///
/// Wraps the destination block and the edge buffer in byte slices (two bytes
/// per pixel) and hands them to `ipred_v_16bpc_inner` with a destination base
/// offset of 0. Parameters whose names start with `_` are ignored.
///
/// # Safety
///
/// * `dst_ptr` must be valid for writes of
///   `compute_ipred_buf_len(stride, 2 * width, height)` bytes.
/// * `topleft` must satisfy the contract of `compute_topleft_slice` for
///   `2 * width` and `2 * height`.
/// * The function is compiled with AVX2 enabled, so the CPU must support it.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_v_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let w = width as usize;
    let h = height as usize;
    let pitch = stride as isize;
    // SAFETY: the token is forged, not detected; CPU support is the caller's
    // obligation (see `# Safety`).
    let simd = unsafe { Desktop64::forge_token_dangerously() };
    // Rows are `2 * w` bytes wide because every pixel occupies two bytes.
    let out_len = compute_ipred_buf_len(pitch, w * 2, h);
    // SAFETY: the caller guarantees `dst_ptr` is writable for `out_len` bytes.
    let out = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, out_len) };
    // SAFETY: the caller guarantees the edge buffer around `topleft` is readable.
    let (edge, edge_origin) =
        unsafe { compute_topleft_slice(topleft as *const u8, w * 2, h * 2) };
    ipred_v_16bpc_inner(simd, out, 0, pitch, edge, edge_origin, w, h);
}
/// Horizontal intra prediction for 16-bit samples: fills every output row
/// with the left-edge pixel of that row.
///
/// Both buffers are addressed in bytes (two per pixel, native endian). The
/// left pixel of row `y` is read from `topleft[tl_off - 2 * (y + 1)]`;
/// `dst_base` is the byte offset of the first output row and `stride` the
/// signed byte distance between rows. `width` and `height` are in pixels.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_h_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    // Pixel counts covered by 256-bit and 128-bit stores respectively.
    let end16 = width - width % 16;
    let end8 = width - width % 8;
    for y in 0..height {
        let row_start = (dst_base as isize + y as isize * stride) as usize;
        let left_at = tl_off - (y + 1) * 2;
        let left_px = u16::from_ne_bytes(topleft[left_at..left_at + 2].try_into().unwrap());
        let left_x16 = _mm256_set1_epi16(left_px as i16);
        for px in (0..end16).step_by(16) {
            let at = row_start + px * 2;
            storeu_256!((&mut dst[at..at + 32]), [u8; 32], left_x16);
        }
        for px in (end16..end8).step_by(8) {
            let at = row_start + px * 2;
            storeu_128!(
                (&mut dst[at..at + 16]),
                [u8; 16],
                _mm256_castsi256_si128(left_x16)
            );
        }
        for px in end8..width {
            let at = row_start + px * 2;
            dst[at..at + 2].copy_from_slice(&left_px.to_ne_bytes());
        }
    }
}
/// C-ABI entry point for 16-bit horizontal intra prediction (AVX2 build).
///
/// Wraps the destination block and the edge buffer in byte slices (two bytes
/// per pixel) and hands them to `ipred_h_16bpc_inner` with a destination base
/// offset of 0. Parameters whose names start with `_` are ignored.
///
/// # Safety
///
/// * `dst_ptr` must be valid for writes of
///   `compute_ipred_buf_len(stride, 2 * width, height)` bytes.
/// * `topleft` must satisfy the contract of `compute_topleft_slice` for
///   `2 * width` and `2 * height`.
/// * The function is compiled with AVX2 enabled, so the CPU must support it.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_h_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let w = width as usize;
    let h = height as usize;
    let pitch = stride as isize;
    // SAFETY: the token is forged, not detected; CPU support is the caller's
    // obligation (see `# Safety`).
    let simd = unsafe { Desktop64::forge_token_dangerously() };
    // Rows are `2 * w` bytes wide because every pixel occupies two bytes.
    let out_len = compute_ipred_buf_len(pitch, w * 2, h);
    // SAFETY: the caller guarantees `dst_ptr` is writable for `out_len` bytes.
    let out = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, out_len) };
    // SAFETY: the caller guarantees the edge buffer around `topleft` is readable.
    let (edge, edge_origin) =
        unsafe { compute_topleft_slice(topleft as *const u8, w * 2, h * 2) };
    ipred_h_16bpc_inner(simd, out, 0, pitch, edge, edge_origin, w, h);
}
/// DC_128 intra prediction for 16-bit samples (512-bit variant): fills the
/// block with the constant `(bitdepth_max + 1) / 2`.
///
/// `dst` is addressed in bytes and every sample is written as a native-endian
/// `u16`; `dst_base` is the byte offset of the first row and `stride` the
/// signed byte distance between rows. `width` and `height` are in pixels.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_128_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    width: usize,
    height: usize,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let grey = ((bitdepth_max + 1) / 2) as u16;
    let grey_x32 = _mm512_set1_epi16(grey as i16);
    let grey_x16 = _mm256_set1_epi16(grey as i16);
    // Pixel counts covered by 512-, 256- and 128-bit stores respectively.
    let end32 = width - width % 32;
    let end16 = width - width % 16;
    let end8 = width - width % 8;
    for y in 0..height {
        let row_start = (dst_base as isize + y as isize * stride) as usize;
        for px in (0..end32).step_by(32) {
            let at = row_start + px * 2;
            storeu_512!((&mut dst[at..at + 64]), [u8; 64], grey_x32);
        }
        for px in (end32..end16).step_by(16) {
            let at = row_start + px * 2;
            storeu_256!((&mut dst[at..at + 32]), [u8; 32], grey_x16);
        }
        for px in (end16..end8).step_by(8) {
            let at = row_start + px * 2;
            storeu_128!(
                (&mut dst[at..at + 16]),
                [u8; 16],
                _mm256_castsi256_si128(grey_x16)
            );
        }
        for px in end8..width {
            let at = row_start + px * 2;
            dst[at..at + 2].copy_from_slice(&grey.to_ne_bytes());
        }
    }
}
/// Vertical intra prediction for 16-bit samples (512-bit variant): copies the
/// row above the block into every output row.
///
/// Both buffers are addressed in bytes (two per pixel). The top row starts at
/// `topleft[tl_off + 2]`; `dst_base` is the byte offset of the first output
/// row and `stride` the signed byte distance between rows. `width` and
/// `height` are in pixels.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_v_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let above = tl_off + 2;
    // Pixel counts covered by 512-, 256- and 128-bit copies respectively.
    let end32 = width - width % 32;
    let end16 = width - width % 16;
    let end8 = width - width % 8;
    for y in 0..height {
        let row_start = (dst_base as isize + y as isize * stride) as usize;
        for px in (0..end32).step_by(32) {
            let from = above + px * 2;
            let chunk = loadu_512!((&topleft[from..from + 64]), [u8; 64]);
            let to = row_start + px * 2;
            storeu_512!((&mut dst[to..to + 64]), [u8; 64], chunk);
        }
        for px in (end32..end16).step_by(16) {
            let from = above + px * 2;
            let chunk = loadu_256!((&topleft[from..from + 32]), [u8; 32]);
            let to = row_start + px * 2;
            storeu_256!((&mut dst[to..to + 32]), [u8; 32], chunk);
        }
        for px in (end16..end8).step_by(8) {
            let from = above + px * 2;
            let chunk = loadu_128!((&topleft[from..from + 16]), [u8; 16]);
            let to = row_start + px * 2;
            storeu_128!((&mut dst[to..to + 16]), [u8; 16], chunk);
        }
        for px in end8..width {
            let from = above + px * 2;
            let to = row_start + px * 2;
            dst[to..to + 2].copy_from_slice(&topleft[from..from + 2]);
        }
    }
}
/// Horizontal intra prediction for 16-bit samples (512-bit variant): fills
/// every output row with the left-edge pixel of that row.
///
/// Both buffers are addressed in bytes (two per pixel, native endian). The
/// left pixel of row `y` is read from `topleft[tl_off - 2 * (y + 1)]`;
/// `dst_base` is the byte offset of the first output row and `stride` the
/// signed byte distance between rows. `width` and `height` are in pixels.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_h_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    // Pixel counts covered by 512-, 256- and 128-bit stores respectively.
    let end32 = width - width % 32;
    let end16 = width - width % 16;
    let end8 = width - width % 8;
    for y in 0..height {
        let row_start = (dst_base as isize + y as isize * stride) as usize;
        let left_at = tl_off - (y + 1) * 2;
        let left_px = u16::from_ne_bytes(topleft[left_at..left_at + 2].try_into().unwrap());
        let left_x32 = _mm512_set1_epi16(left_px as i16);
        let left_x16 = _mm256_set1_epi16(left_px as i16);
        for px in (0..end32).step_by(32) {
            let at = row_start + px * 2;
            storeu_512!((&mut dst[at..at + 64]), [u8; 64], left_x32);
        }
        for px in (end32..end16).step_by(16) {
            let at = row_start + px * 2;
            storeu_256!((&mut dst[at..at + 32]), [u8; 32], left_x16);
        }
        for px in (end16..end8).step_by(8) {
            let at = row_start + px * 2;
            storeu_128!(
                (&mut dst[at..at + 16]),
                [u8; 16],
                _mm256_castsi256_si128(left_x16)
            );
        }
        for px in end8..width {
            let at = row_start + px * 2;
            dst[at..at + 2].copy_from_slice(&left_px.to_ne_bytes());
        }
    }
}
/// DC intra prediction for 16-bit samples (512-bit variant): fills the block
/// with the rounded average of the `width` pixels above it and the `height`
/// pixels to its left.
///
/// Both buffers are addressed in bytes (two per pixel, native endian). Top
/// pixel `i` (1-based) is read at `topleft[tl_off + 2 * i]`, left pixel `i` at
/// `topleft[tl_off - 2 * i]`. `dst_base` is the byte offset of the first
/// output row and `stride` the signed byte distance between rows.
///
/// Panics (division by zero) when `width + height == 0`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();

    // Accumulate the top edge first, then the left edge.
    let mut total = 0u32;
    for i in 1..=width {
        let at = tl_off + i * 2;
        total += u16::from_ne_bytes(topleft[at..at + 2].try_into().unwrap()) as u32;
    }
    for i in 1..=height {
        let at = tl_off - i * 2;
        total += u16::from_ne_bytes(topleft[at..at + 2].try_into().unwrap()) as u32;
    }
    let samples = (width + height) as u32;
    let dc = ((total + samples / 2) / samples) as u16;

    let dc_x32 = _mm512_set1_epi16(dc as i16);
    let dc_x16 = _mm256_set1_epi16(dc as i16);
    // Pixel counts covered by 512-, 256- and 128-bit stores respectively.
    let end32 = width - width % 32;
    let end16 = width - width % 16;
    let end8 = width - width % 8;
    for y in 0..height {
        let row_start = (dst_base as isize + y as isize * stride) as usize;
        for px in (0..end32).step_by(32) {
            let at = row_start + px * 2;
            storeu_512!((&mut dst[at..at + 64]), [u8; 64], dc_x32);
        }
        for px in (end32..end16).step_by(16) {
            let at = row_start + px * 2;
            storeu_256!((&mut dst[at..at + 32]), [u8; 32], dc_x16);
        }
        for px in (end16..end8).step_by(8) {
            let at = row_start + px * 2;
            storeu_128!(
                (&mut dst[at..at + 16]),
                [u8; 16],
                _mm256_castsi256_si128(dc_x16)
            );
        }
        for px in end8..width {
            let at = row_start + px * 2;
            dst[at..at + 2].copy_from_slice(&dc.to_ne_bytes());
        }
    }
}
/// DC_TOP intra prediction for 16-bit samples (512-bit variant): fills the
/// block with the rounded average of the `width` pixels above it.
///
/// Both buffers are addressed in bytes (two per pixel, native endian). Top
/// pixel `i` (1-based) is read at `topleft[tl_off + 2 * i]`. `dst_base` is the
/// byte offset of the first output row and `stride` the signed byte distance
/// between rows.
///
/// Panics (division by zero) when `width == 0`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_top_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();

    let mut total = 0u32;
    for i in 1..=width {
        let at = tl_off + i * 2;
        total += u16::from_ne_bytes(topleft[at..at + 2].try_into().unwrap()) as u32;
    }
    let samples = width as u32;
    let dc = ((total + samples / 2) / samples) as u16;

    let dc_x32 = _mm512_set1_epi16(dc as i16);
    let dc_x16 = _mm256_set1_epi16(dc as i16);
    // Pixel counts covered by 512-, 256- and 128-bit stores respectively.
    let end32 = width - width % 32;
    let end16 = width - width % 16;
    let end8 = width - width % 8;
    for y in 0..height {
        let row_start = (dst_base as isize + y as isize * stride) as usize;
        for px in (0..end32).step_by(32) {
            let at = row_start + px * 2;
            storeu_512!((&mut dst[at..at + 64]), [u8; 64], dc_x32);
        }
        for px in (end32..end16).step_by(16) {
            let at = row_start + px * 2;
            storeu_256!((&mut dst[at..at + 32]), [u8; 32], dc_x16);
        }
        for px in (end16..end8).step_by(8) {
            let at = row_start + px * 2;
            storeu_128!(
                (&mut dst[at..at + 16]),
                [u8; 16],
                _mm256_castsi256_si128(dc_x16)
            );
        }
        for px in end8..width {
            let at = row_start + px * 2;
            dst[at..at + 2].copy_from_slice(&dc.to_ne_bytes());
        }
    }
}
/// DC predictor for 16-bit samples that averages only the samples stored before
/// `tl_off` (`Server64` variant, up to 512-bit stores).
///
/// The `height` samples at byte offsets `tl_off - 2 * i` (`i` in `1..=height`)
/// of `topleft` are averaged with round-to-nearest and the result is written to
/// every pixel of the `width` x `height` block in `dst`.
///
/// * `dst`, `topleft` - byte buffers holding native-endian `u16` samples.
/// * `dst_base` - byte offset of the first output row inside `dst`.
/// * `stride` - signed byte distance between consecutive output rows.
/// * `height` - also the divisor of the mean, so it must not be zero.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_left_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let total: u32 = (1..=height)
        .map(|i| {
            let at = tl_off - i * 2;
            u16::from_ne_bytes(topleft[at..at + 2].try_into().unwrap()) as u32
        })
        .sum();
    let n = height as u32;
    let dc = ((total + n / 2) / n) as u16;

    let splat_512 = _mm512_set1_epi16(dc as i16);
    let splat_256 = _mm256_set1_epi16(dc as i16);
    let splat_128 = _mm256_castsi256_si128(splat_256);
    let dc_bytes = dc.to_ne_bytes();
    for row in 0..height {
        // Byte cursor into `dst` and the number of pixels still to write.
        let mut pos = (dst_base as isize + row as isize * stride) as usize;
        let mut remaining = width;
        while remaining >= 32 {
            storeu_512!((&mut dst[pos..pos + 64]), [u8; 64], splat_512);
            pos += 64;
            remaining -= 32;
        }
        // After the 32-pixel loop each narrower store can fire at most once.
        if remaining >= 16 {
            storeu_256!((&mut dst[pos..pos + 32]), [u8; 32], splat_256);
            pos += 32;
            remaining -= 16;
        }
        if remaining >= 8 {
            storeu_128!((&mut dst[pos..pos + 16]), [u8; 16], splat_128);
            pos += 16;
            remaining -= 8;
        }
        for _ in 0..remaining {
            dst[pos..pos + 2].copy_from_slice(&dc_bytes);
            pos += 2;
        }
    }
}
/// DC intra predictor for 16-bit samples (`Desktop64` variant, 256-bit stores).
///
/// Reads the `width` samples at byte offsets `tl_off + 2 * i` and the `height`
/// samples at byte offsets `tl_off - 2 * i` (`i` starting at 1) from `topleft`,
/// computes their mean rounded to nearest, and writes that value to every pixel
/// of the `width` x `height` block in `dst`.
///
/// * `dst`, `topleft` - byte buffers holding native-endian `u16` samples.
/// * `dst_base` - byte offset of the first output row inside `dst`.
/// * `stride` - signed byte distance between consecutive output rows.
/// * `width`, `height` - block size in pixels; `width + height` is the divisor
///   of the mean and therefore must not be zero.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    // One 16-bit sample at byte offset `at`, widened for accumulation.
    let sample = |at: usize| -> u32 {
        u16::from_ne_bytes(topleft[at..at + 2].try_into().unwrap()) as u32
    };
    // Samples after `tl_off` first, then the ones before it.
    let total: u32 = (1..=width)
        .map(|i| sample(tl_off + i * 2))
        .chain((1..=height).map(|i| sample(tl_off - i * 2)))
        .sum();
    let n = (width + height) as u32;
    let dc = ((total + n / 2) / n) as u16;

    let splat_256 = _mm256_set1_epi16(dc as i16);
    let splat_128 = _mm256_castsi256_si128(splat_256);
    let dc_bytes = dc.to_ne_bytes();
    for row in 0..height {
        // Byte cursor into `dst` and the number of pixels still to write.
        let mut pos = (dst_base as isize + row as isize * stride) as usize;
        let mut remaining = width;
        while remaining >= 16 {
            storeu_256!((&mut dst[pos..pos + 32]), [u8; 32], splat_256);
            pos += 32;
            remaining -= 16;
        }
        // Fewer than 16 pixels remain, so the 128-bit store runs at most once.
        if remaining >= 8 {
            storeu_128!((&mut dst[pos..pos + 16]), [u8; 16], splat_128);
            pos += 16;
            remaining -= 8;
        }
        for _ in 0..remaining {
            dst[pos..pos + 2].copy_from_slice(&dc_bytes);
            pos += 2;
        }
    }
}
/// C-ABI shim that runs `ipred_dc_16bpc_inner` on raw pointers.
///
/// `width` and `height` are pixel counts and are doubled wherever a byte count
/// is passed on. The destination is viewed as a byte slice whose length comes
/// from `compute_ipred_buf_len`; the edge slice and the offset into it come
/// from `compute_topleft_slice`; the block is written from byte offset 0 of the
/// destination slice. `_angle`, `_max_width`, `_max_height`, `_bitdepth_max`,
/// `_topleft_off` and `_dst` are not read.
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of the byte length returned
///   by `compute_ipred_buf_len` for these arguments, and that memory must not
///   be accessed through any other reference during the call.
/// * `topleft` must satisfy whatever `compute_topleft_slice` requires of its
///   pointer argument.
/// * NOTE(review): the `Desktop64` token is forged without a runtime check, so
///   callers presumably must only dispatch here on CPUs that provide every
///   feature the token stands for - confirm against `archmage`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_dc_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let w = width as usize;
    let h = height as usize;
    // SAFETY: see the NOTE(review) item in the `# Safety` section above.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let dst_len = compute_ipred_buf_len(stride as isize, w * 2, h);
    // SAFETY: the caller guarantees `dst_ptr` covers `dst_len` exclusive bytes.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    let (edge, edge_off) = compute_topleft_slice(topleft as *const u8, w * 2, h * 2);
    ipred_dc_16bpc_inner(token, dst_bytes, 0, stride as isize, edge, edge_off, w, h);
}
/// DC predictor for 16-bit samples that averages only the samples stored after
/// `tl_off` (`Desktop64` variant, 256-bit stores).
///
/// The `width` samples at byte offsets `tl_off + 2 * i` (`i` in `1..=width`) of
/// `topleft` are averaged with round-to-nearest and the result is written to
/// every pixel of the `width` x `height` block in `dst`.
///
/// * `dst`, `topleft` - byte buffers holding native-endian `u16` samples.
/// * `dst_base` - byte offset of the first output row inside `dst`.
/// * `stride` - signed byte distance between consecutive output rows.
/// * `width` - also the divisor of the mean, so it must not be zero.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_top_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let total: u32 = (1..=width)
        .map(|i| {
            let at = tl_off + i * 2;
            u16::from_ne_bytes(topleft[at..at + 2].try_into().unwrap()) as u32
        })
        .sum();
    let n = width as u32;
    let dc = ((total + n / 2) / n) as u16;

    let splat_256 = _mm256_set1_epi16(dc as i16);
    let splat_128 = _mm256_castsi256_si128(splat_256);
    let dc_bytes = dc.to_ne_bytes();
    for row in 0..height {
        // Byte cursor into `dst` and the number of pixels still to write.
        let mut pos = (dst_base as isize + row as isize * stride) as usize;
        let mut remaining = width;
        while remaining >= 16 {
            storeu_256!((&mut dst[pos..pos + 32]), [u8; 32], splat_256);
            pos += 32;
            remaining -= 16;
        }
        // Fewer than 16 pixels remain, so the 128-bit store runs at most once.
        if remaining >= 8 {
            storeu_128!((&mut dst[pos..pos + 16]), [u8; 16], splat_128);
            pos += 16;
            remaining -= 8;
        }
        for _ in 0..remaining {
            dst[pos..pos + 2].copy_from_slice(&dc_bytes);
            pos += 2;
        }
    }
}
/// C-ABI shim that runs `ipred_dc_top_16bpc_inner` on raw pointers.
///
/// `width` and `height` are pixel counts and are doubled wherever a byte count
/// is passed on. The destination is viewed as a byte slice whose length comes
/// from `compute_ipred_buf_len`; the edge slice and the offset into it come
/// from `compute_topleft_slice`; the block is written from byte offset 0 of the
/// destination slice. `_angle`, `_max_width`, `_max_height`, `_bitdepth_max`,
/// `_topleft_off` and `_dst` are not read.
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of the byte length returned
///   by `compute_ipred_buf_len` for these arguments, and that memory must not
///   be accessed through any other reference during the call.
/// * `topleft` must satisfy whatever `compute_topleft_slice` requires of its
///   pointer argument.
/// * NOTE(review): the `Desktop64` token is forged without a runtime check, so
///   callers presumably must only dispatch here on CPUs that provide every
///   feature the token stands for - confirm against `archmage`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_dc_top_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let w = width as usize;
    let h = height as usize;
    // SAFETY: see the NOTE(review) item in the `# Safety` section above.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let dst_len = compute_ipred_buf_len(stride as isize, w * 2, h);
    // SAFETY: the caller guarantees `dst_ptr` covers `dst_len` exclusive bytes.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    let (edge, edge_off) = compute_topleft_slice(topleft as *const u8, w * 2, h * 2);
    ipred_dc_top_16bpc_inner(token, dst_bytes, 0, stride as isize, edge, edge_off, w, h);
}
/// DC predictor for 16-bit samples that averages only the samples stored before
/// `tl_off` (`Desktop64` variant, 256-bit stores).
///
/// The `height` samples at byte offsets `tl_off - 2 * i` (`i` in `1..=height`)
/// of `topleft` are averaged with round-to-nearest and the result is written to
/// every pixel of the `width` x `height` block in `dst`.
///
/// * `dst`, `topleft` - byte buffers holding native-endian `u16` samples.
/// * `dst_base` - byte offset of the first output row inside `dst`.
/// * `stride` - signed byte distance between consecutive output rows.
/// * `height` - also the divisor of the mean, so it must not be zero.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_left_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let total: u32 = (1..=height)
        .map(|i| {
            let at = tl_off - i * 2;
            u16::from_ne_bytes(topleft[at..at + 2].try_into().unwrap()) as u32
        })
        .sum();
    let n = height as u32;
    let dc = ((total + n / 2) / n) as u16;

    let splat_256 = _mm256_set1_epi16(dc as i16);
    let splat_128 = _mm256_castsi256_si128(splat_256);
    let dc_bytes = dc.to_ne_bytes();
    for row in 0..height {
        // Byte cursor into `dst` and the number of pixels still to write.
        let mut pos = (dst_base as isize + row as isize * stride) as usize;
        let mut remaining = width;
        while remaining >= 16 {
            storeu_256!((&mut dst[pos..pos + 32]), [u8; 32], splat_256);
            pos += 32;
            remaining -= 16;
        }
        // Fewer than 16 pixels remain, so the 128-bit store runs at most once.
        if remaining >= 8 {
            storeu_128!((&mut dst[pos..pos + 16]), [u8; 16], splat_128);
            pos += 16;
            remaining -= 8;
        }
        for _ in 0..remaining {
            dst[pos..pos + 2].copy_from_slice(&dc_bytes);
            pos += 2;
        }
    }
}
/// C-ABI shim that runs `ipred_dc_left_16bpc_inner` on raw pointers.
///
/// `width` and `height` are pixel counts and are doubled wherever a byte count
/// is passed on. The destination is viewed as a byte slice whose length comes
/// from `compute_ipred_buf_len`; the edge slice and the offset into it come
/// from `compute_topleft_slice`; the block is written from byte offset 0 of the
/// destination slice. `_angle`, `_max_width`, `_max_height`, `_bitdepth_max`,
/// `_topleft_off` and `_dst` are not read.
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of the byte length returned
///   by `compute_ipred_buf_len` for these arguments, and that memory must not
///   be accessed through any other reference during the call.
/// * `topleft` must satisfy whatever `compute_topleft_slice` requires of its
///   pointer argument.
/// * NOTE(review): the `Desktop64` token is forged without a runtime check, so
///   callers presumably must only dispatch here on CPUs that provide every
///   feature the token stands for - confirm against `archmage`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_dc_left_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let w = width as usize;
    let h = height as usize;
    // SAFETY: see the NOTE(review) item in the `# Safety` section above.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let dst_len = compute_ipred_buf_len(stride as isize, w * 2, h);
    // SAFETY: the caller guarantees `dst_ptr` covers `dst_len` exclusive bytes.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    let (edge, edge_off) = compute_topleft_slice(topleft as *const u8, w * 2, h * 2);
    ipred_dc_left_16bpc_inner(token, dst_bytes, 0, stride as isize, edge, edge_off, w, h);
}
/// Paeth intra predictor for 16-bit samples (`Server64` variant, 16 pixels per
/// vector iteration).
///
/// For every output pixel the three neighbours are: `left` (the sample at byte
/// offset `tl_off - 2 * (y + 1)`), `above` (the sample at byte offset
/// `tl_off + 2 * (x + 1)`) and `corner` (the sample at `tl_off`). The output is
/// whichever neighbour is closest to `left + above - corner`, preferring `left`,
/// then `above`, then `corner` on ties.
///
/// * `dst`, `topleft` - byte buffers holding native-endian `u16` samples.
/// * `dst_base` - byte offset of the first output row inside `dst`.
/// * `stride` - signed byte distance between consecutive output rows.
/// * `width`, `height` - block size in pixels.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_paeth_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let corner = u16::from_ne_bytes(topleft[tl_off..tl_off + 2].try_into().unwrap()) as i32;
    let corner_v = _mm512_set1_epi32(corner);
    // Number of leading columns that are handled 16 at a time.
    let vec_cols = width - width % 16;
    for row in 0..height {
        let row_base = (dst_base as isize + row as isize * stride) as usize;
        let left_at = tl_off - (row + 1) * 2;
        let left = u16::from_ne_bytes(topleft[left_at..left_at + 2].try_into().unwrap()) as i32;
        let left_v = _mm512_set1_epi32(left);

        for col in (0..vec_cols).step_by(16) {
            let above_at = tl_off + (col + 1) * 2;
            let above_raw = loadu_256!(&topleft[above_at..above_at + 32], [u8; 32]);
            let above = _mm512_cvtepu16_epi32(above_raw);
            let grad = _mm512_sub_epi32(_mm512_add_epi32(left_v, above), corner_v);
            let d_left = _mm512_abs_epi32(_mm512_sub_epi32(left_v, grad));
            let d_above = _mm512_abs_epi32(_mm512_sub_epi32(above, grad));
            let d_corner = _mm512_abs_epi32(_mm512_sub_epi32(corner_v, grad));
            // `left` wins unless it is strictly farther than either other candidate.
            let left_loses = _mm512_cmpgt_epi32_mask(d_left, d_above)
                | _mm512_cmpgt_epi32_mask(d_left, d_corner);
            let pick_left = !left_loses;
            let pick_above = left_loses & !_mm512_cmpgt_epi32_mask(d_above, d_corner);
            let not_left = _mm512_mask_blend_epi32(pick_above, corner_v, above);
            let chosen = _mm512_mask_blend_epi32(pick_left, not_left, left_v);
            let non_neg = _mm512_max_epi32(chosen, _mm512_setzero_si512());
            let packed: __m256i = _mm512_cvtusepi32_epi16(non_neg);
            let out_at = row_base + col * 2;
            storeu_256!(&mut dst[out_at..out_at + 32], [u8; 32], packed);
        }

        // Scalar tail for the columns that do not fill a whole vector.
        for col in vec_cols..width {
            let above_at = tl_off + (col + 1) * 2;
            let above =
                u16::from_ne_bytes(topleft[above_at..above_at + 2].try_into().unwrap()) as i32;
            let grad = left + above - corner;
            let d_left = (left - grad).abs();
            let d_above = (above - grad).abs();
            let d_corner = (corner - grad).abs();
            let chosen = if d_left > d_above || d_left > d_corner {
                if d_above <= d_corner {
                    above
                } else {
                    corner
                }
            } else {
                left
            };
            let out_at = row_base + col * 2;
            dst[out_at..out_at + 2].copy_from_slice(&(chosen as u16).to_ne_bytes());
        }
    }
}
/// Two-dimensional smooth intra predictor for 16-bit samples (`Server64`
/// variant, 16 pixels per vector iteration).
///
/// Each output pixel is
/// `(w_v * above + (256 - w_v) * bottom + w_h * left + (256 - w_h) * right + 256) >> 9`
/// where `w_v = dav1d_sm_weights[height + y]`, `w_h = dav1d_sm_weights[width + x]`,
/// `above` is the sample at byte offset `tl_off + 2 * (x + 1)`, `left` the sample
/// at `tl_off - 2 * (y + 1)`, `right` the sample at `tl_off + 2 * width` and
/// `bottom` the sample at `tl_off - 2 * height`.
///
/// * `dst`, `topleft` - byte buffers holding native-endian `u16` samples.
/// * `dst_base` - byte offset of the first output row inside `dst`.
/// * `stride` - signed byte distance between consecutive output rows.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let weights_hor = &dav1d_sm_weights[width..][..width];
    let weights_ver = &dav1d_sm_weights[height..][..height];
    let right_at = tl_off + width * 2;
    let right = u16::from_ne_bytes(topleft[right_at..right_at + 2].try_into().unwrap()) as i32;
    let bottom_at = tl_off - height * 2;
    let bottom = u16::from_ne_bytes(topleft[bottom_at..bottom_at + 2].try_into().unwrap()) as i32;
    let right_v = _mm512_set1_epi32(right);
    let bottom_v = _mm512_set1_epi32(bottom);
    // 256 doubles as the weight scale and as the rounding term of the 9-bit shift.
    let scale_v = _mm512_set1_epi32(256);
    let round_v = _mm512_set1_epi32(256);
    let zero_v = _mm512_setzero_si512();
    // Number of leading columns that are handled 16 at a time.
    let vec_cols = width - width % 16;
    for row in 0..height {
        let row_base = (dst_base as isize + row as isize * stride) as usize;
        let left_at = tl_off - (row + 1) * 2;
        let left = u16::from_ne_bytes(topleft[left_at..left_at + 2].try_into().unwrap()) as i32;
        let left_v = _mm512_set1_epi32(left);
        let w_v = weights_ver[row] as i32;
        let w_v_v = _mm512_set1_epi32(w_v);
        // The bottom contribution is constant along the row.
        let bottom_term = _mm512_mullo_epi32(_mm512_sub_epi32(scale_v, w_v_v), bottom_v);

        for col in (0..vec_cols).step_by(16) {
            let above_at = tl_off + (col + 1) * 2;
            let above_raw = loadu_256!(&topleft[above_at..above_at + 32], [u8; 32]);
            let above = _mm512_cvtepu16_epi32(above_raw);
            let w_h_raw = loadu_128!(&weights_hor[col..col + 16], [u8; 16]);
            let w_h = _mm512_cvtepu8_epi32(w_h_raw);
            let w_h_inv = _mm512_sub_epi32(scale_v, w_h);
            let vertical = _mm512_add_epi32(_mm512_mullo_epi32(w_v_v, above), bottom_term);
            let horizontal = _mm512_add_epi32(
                _mm512_mullo_epi32(w_h, left_v),
                _mm512_mullo_epi32(w_h_inv, right_v),
            );
            let blended = _mm512_add_epi32(vertical, horizontal);
            let shifted = _mm512_srai_epi32::<9>(_mm512_add_epi32(blended, round_v));
            let packed: __m256i = _mm512_cvtusepi32_epi16(_mm512_max_epi32(shifted, zero_v));
            let out_at = row_base + col * 2;
            storeu_256!(&mut dst[out_at..out_at + 32], [u8; 32], packed);
        }

        // Scalar tail for the columns that do not fill a whole vector.
        for col in vec_cols..width {
            let above_at = tl_off + (1 + col) * 2;
            let above =
                u16::from_ne_bytes(topleft[above_at..above_at + 2].try_into().unwrap()) as i32;
            let w_h = weights_hor[col] as i32;
            let vertical = w_v * above + (256 - w_v) * bottom;
            let horizontal = w_h * left + (256 - w_h) * right;
            let value = ((vertical + horizontal + 256) >> 9) as u16;
            let out_at = row_base + col * 2;
            dst[out_at..out_at + 2].copy_from_slice(&value.to_ne_bytes());
        }
    }
}
/// Vertical smooth intra predictor for 16-bit samples (`Server64` variant,
/// 16 pixels per vector iteration).
///
/// Each output pixel is `(w_v * above + (256 - w_v) * bottom + 128) >> 8` where
/// `w_v = dav1d_sm_weights[height + y]`, `above` is the sample at byte offset
/// `tl_off + 2 * (x + 1)` and `bottom` the sample at `tl_off - 2 * height`.
///
/// * `dst`, `topleft` - byte buffers holding native-endian `u16` samples.
/// * `dst_base` - byte offset of the first output row inside `dst`.
/// * `stride` - signed byte distance between consecutive output rows.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_v_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let weights_ver = &dav1d_sm_weights[height..][..height];
    let bottom_at = tl_off - height * 2;
    let bottom = u16::from_ne_bytes(topleft[bottom_at..bottom_at + 2].try_into().unwrap()) as i32;
    let bottom_v = _mm512_set1_epi32(bottom);
    let round_v = _mm512_set1_epi32(128);
    let scale_v = _mm512_set1_epi32(256);
    let zero_v = _mm512_setzero_si512();
    // Number of leading columns that are handled 16 at a time.
    let vec_cols = width - width % 16;
    for row in 0..height {
        let row_base = (dst_base as isize + row as isize * stride) as usize;
        let w_v = weights_ver[row] as i32;
        let w_v_v = _mm512_set1_epi32(w_v);
        // The bottom contribution is constant along the row.
        let bottom_term = _mm512_mullo_epi32(_mm512_sub_epi32(scale_v, w_v_v), bottom_v);

        for col in (0..vec_cols).step_by(16) {
            let above_at = tl_off + (col + 1) * 2;
            let above_raw = loadu_256!(&topleft[above_at..above_at + 32], [u8; 32]);
            let above = _mm512_cvtepu16_epi32(above_raw);
            let blended = _mm512_add_epi32(_mm512_mullo_epi32(w_v_v, above), bottom_term);
            let shifted = _mm512_srai_epi32::<8>(_mm512_add_epi32(blended, round_v));
            let packed: __m256i = _mm512_cvtusepi32_epi16(_mm512_max_epi32(shifted, zero_v));
            let out_at = row_base + col * 2;
            storeu_256!(&mut dst[out_at..out_at + 32], [u8; 32], packed);
        }

        // Scalar tail for the columns that do not fill a whole vector.
        for col in vec_cols..width {
            let above_at = tl_off + (1 + col) * 2;
            let above =
                u16::from_ne_bytes(topleft[above_at..above_at + 2].try_into().unwrap()) as i32;
            let value = ((w_v * above + (256 - w_v) * bottom + 128) >> 8) as u16;
            let out_at = row_base + col * 2;
            dst[out_at..out_at + 2].copy_from_slice(&value.to_ne_bytes());
        }
    }
}
/// Horizontal smooth intra predictor for 16-bit samples (`Server64` variant,
/// 16 pixels per vector iteration).
///
/// Each output pixel is `(w_h * left + (256 - w_h) * right + 128) >> 8` where
/// `w_h = dav1d_sm_weights[width + x]`, `left` is the sample at byte offset
/// `tl_off - 2 * (y + 1)` and `right` the sample at `tl_off + 2 * width`.
///
/// * `dst`, `topleft` - byte buffers holding native-endian `u16` samples.
/// * `dst_base` - byte offset of the first output row inside `dst`.
/// * `stride` - signed byte distance between consecutive output rows.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_h_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let weights_hor = &dav1d_sm_weights[width..][..width];
    let right_at = tl_off + width * 2;
    let right = u16::from_ne_bytes(topleft[right_at..right_at + 2].try_into().unwrap()) as i32;
    let right_v = _mm512_set1_epi32(right);
    let round_v = _mm512_set1_epi32(128);
    let scale_v = _mm512_set1_epi32(256);
    let zero_v = _mm512_setzero_si512();
    // Number of leading columns that are handled 16 at a time.
    let vec_cols = width - width % 16;
    for row in 0..height {
        let row_base = (dst_base as isize + row as isize * stride) as usize;
        let left_at = tl_off - (row + 1) * 2;
        let left = u16::from_ne_bytes(topleft[left_at..left_at + 2].try_into().unwrap()) as i32;
        let left_v = _mm512_set1_epi32(left);

        for col in (0..vec_cols).step_by(16) {
            let w_h_raw = loadu_128!(&weights_hor[col..col + 16], [u8; 16]);
            let w_h = _mm512_cvtepu8_epi32(w_h_raw);
            let w_h_inv = _mm512_sub_epi32(scale_v, w_h);
            let blended = _mm512_add_epi32(
                _mm512_mullo_epi32(w_h, left_v),
                _mm512_mullo_epi32(w_h_inv, right_v),
            );
            let shifted = _mm512_srai_epi32::<8>(_mm512_add_epi32(blended, round_v));
            let packed: __m256i = _mm512_cvtusepi32_epi16(_mm512_max_epi32(shifted, zero_v));
            let out_at = row_base + col * 2;
            storeu_256!(&mut dst[out_at..out_at + 32], [u8; 32], packed);
        }

        // Scalar tail for the columns that do not fill a whole vector.
        for col in vec_cols..width {
            let w_h = weights_hor[col] as i32;
            let value = ((w_h * left + (256 - w_h) * right + 128) >> 8) as u16;
            let out_at = row_base + col * 2;
            dst[out_at..out_at + 2].copy_from_slice(&value.to_ne_bytes());
        }
    }
}
/// Paeth intra predictor for 16-bit samples (`Desktop64` variant; the body is
/// plain scalar code).
///
/// For every output pixel the three neighbours are: `left` (the sample at byte
/// offset `tl_off - 2 * (y + 1)`), `above` (the sample at byte offset
/// `tl_off + 2 * (x + 1)`) and `corner` (the sample at `tl_off`). The output is
/// whichever neighbour is closest to `left + above - corner`, preferring `left`,
/// then `above`, then `corner` on ties.
///
/// * `dst`, `topleft` - byte buffers holding native-endian `u16` samples.
/// * `dst_base` - byte offset of the first output row inside `dst`.
/// * `stride` - signed byte distance between consecutive output rows.
/// * `width`, `height` - block size in pixels.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_paeth_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    // One 16-bit sample at byte offset `at`, widened to a signed working type.
    let sample = |at: usize| -> i32 {
        u16::from_ne_bytes(topleft[at..at + 2].try_into().unwrap()) as i32
    };
    let corner = sample(tl_off);
    for row in 0..height {
        let row_base = (dst_base as isize + row as isize * stride) as usize;
        let left = sample(tl_off - (row + 1) * 2);
        for col in 0..width {
            let above = sample(tl_off + (col + 1) * 2);
            let grad = left + above - corner;
            let d_left = (left - grad).abs();
            let d_above = (above - grad).abs();
            let d_corner = (corner - grad).abs();
            // `left` wins unless it is strictly farther than either other candidate.
            let chosen = if d_left > d_above || d_left > d_corner {
                if d_above <= d_corner {
                    above
                } else {
                    corner
                }
            } else {
                left
            };
            let out_at = row_base + col * 2;
            dst[out_at..out_at + 2].copy_from_slice(&(chosen as u16).to_ne_bytes());
        }
    }
}
/// C-ABI shim that runs `ipred_paeth_16bpc_inner` on raw pointers.
///
/// `width` and `height` are pixel counts and are doubled wherever a byte count
/// is passed on. The destination is viewed as a byte slice whose length comes
/// from `compute_ipred_buf_len`; the edge slice and the offset into it come
/// from `compute_topleft_slice`; the block is written from byte offset 0 of the
/// destination slice. `_angle`, `_max_width`, `_max_height`, `_bitdepth_max`,
/// `_topleft_off` and `_dst` are not read.
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of the byte length returned
///   by `compute_ipred_buf_len` for these arguments, and that memory must not
///   be accessed through any other reference during the call.
/// * `topleft` must satisfy whatever `compute_topleft_slice` requires of its
///   pointer argument.
/// * NOTE(review): the `Desktop64` token is forged without a runtime check, so
///   callers presumably must only dispatch here on CPUs that provide every
///   feature the token stands for - confirm against `archmage`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_paeth_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let w = width as usize;
    let h = height as usize;
    // SAFETY: see the NOTE(review) item in the `# Safety` section above.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let dst_len = compute_ipred_buf_len(stride as isize, w * 2, h);
    // SAFETY: the caller guarantees `dst_ptr` covers `dst_len` exclusive bytes.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    let (edge, edge_off) = compute_topleft_slice(topleft as *const u8, w * 2, h * 2);
    ipred_paeth_16bpc_inner(token, dst_bytes, 0, stride as isize, edge, edge_off, w, h);
}
/// Two-dimensional smooth intra predictor for 16-bit samples (`Desktop64`
/// variant; the body is plain scalar code).
///
/// Each output pixel is
/// `(w_v * above + (256 - w_v) * bottom + w_h * left + (256 - w_h) * right + 256) >> 9`
/// where `w_v = dav1d_sm_weights[height + y]`, `w_h = dav1d_sm_weights[width + x]`,
/// `above` is the sample at byte offset `tl_off + 2 * (x + 1)`, `left` the sample
/// at `tl_off - 2 * (y + 1)`, `right` the sample at `tl_off + 2 * width` and
/// `bottom` the sample at `tl_off - 2 * height`.
///
/// * `dst`, `topleft` - byte buffers holding native-endian `u16` samples.
/// * `dst_base` - byte offset of the first output row inside `dst`.
/// * `stride` - signed byte distance between consecutive output rows.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let weights_hor = &dav1d_sm_weights[width..][..width];
    let weights_ver = &dav1d_sm_weights[height..][..height];
    // One 16-bit sample at byte offset `at`, widened to a signed working type.
    let sample = |at: usize| -> i32 {
        u16::from_ne_bytes(topleft[at..at + 2].try_into().unwrap()) as i32
    };
    let right = sample(tl_off + width * 2);
    let bottom = sample(tl_off - height * 2);
    for row in 0..height {
        let row_base = (dst_base as isize + row as isize * stride) as usize;
        let left = sample(tl_off - (row + 1) * 2);
        let w_v = weights_ver[row] as i32;
        for col in 0..width {
            let above = sample(tl_off + (1 + col) * 2);
            let w_h = weights_hor[col] as i32;
            let vertical = w_v * above + (256 - w_v) * bottom;
            let horizontal = w_h * left + (256 - w_h) * right;
            let value = ((vertical + horizontal + 256) >> 9) as u16;
            let out_at = row_base + col * 2;
            dst[out_at..out_at + 2].copy_from_slice(&value.to_ne_bytes());
        }
    }
}
/// C-ABI shim that runs `ipred_smooth_16bpc_inner` on raw pointers.
///
/// `width` and `height` are pixel counts and are doubled wherever a byte count
/// is passed on. The destination is viewed as a byte slice whose length comes
/// from `compute_ipred_buf_len`; the edge slice and the offset into it come
/// from `compute_topleft_slice`; the block is written from byte offset 0 of the
/// destination slice. `_angle`, `_max_width`, `_max_height`, `_bitdepth_max`,
/// `_topleft_off` and `_dst` are not read.
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of the byte length returned
///   by `compute_ipred_buf_len` for these arguments, and that memory must not
///   be accessed through any other reference during the call.
/// * `topleft` must satisfy whatever `compute_topleft_slice` requires of its
///   pointer argument.
/// * NOTE(review): the `Desktop64` token is forged without a runtime check, so
///   callers presumably must only dispatch here on CPUs that provide every
///   feature the token stands for - confirm against `archmage`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_smooth_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let w = width as usize;
    let h = height as usize;
    // SAFETY: see the NOTE(review) item in the `# Safety` section above.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let dst_len = compute_ipred_buf_len(stride as isize, w * 2, h);
    // SAFETY: the caller guarantees `dst_ptr` covers `dst_len` exclusive bytes.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    let (edge, edge_off) = compute_topleft_slice(topleft as *const u8, w * 2, h * 2);
    ipred_smooth_16bpc_inner(token, dst_bytes, 0, stride as isize, edge, edge_off, w, h);
}
/// Vertical smooth intra predictor for 16-bit samples (`Desktop64` variant; the
/// body is plain scalar code).
///
/// Each output pixel is `(w_v * above + (256 - w_v) * bottom + 128) >> 8` where
/// `w_v = dav1d_sm_weights[height + y]`, `above` is the sample at byte offset
/// `tl_off + 2 * (x + 1)` and `bottom` the sample at `tl_off - 2 * height`.
///
/// * `dst`, `topleft` - byte buffers holding native-endian `u16` samples.
/// * `dst_base` - byte offset of the first output row inside `dst`.
/// * `stride` - signed byte distance between consecutive output rows.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_v_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let weights_ver = &dav1d_sm_weights[height..][..height];
    // One 16-bit sample at byte offset `at`, widened to a signed working type.
    let sample = |at: usize| -> i32 {
        u16::from_ne_bytes(topleft[at..at + 2].try_into().unwrap()) as i32
    };
    let bottom = sample(tl_off - height * 2);
    for row in 0..height {
        let row_base = (dst_base as isize + row as isize * stride) as usize;
        let w_v = weights_ver[row] as i32;
        for col in 0..width {
            let above = sample(tl_off + (1 + col) * 2);
            let value = ((w_v * above + (256 - w_v) * bottom + 128) >> 8) as u16;
            let out_at = row_base + col * 2;
            dst[out_at..out_at + 2].copy_from_slice(&value.to_ne_bytes());
        }
    }
}
/// C-ABI shim that runs `ipred_smooth_v_16bpc_inner` on raw pointers.
///
/// `width` and `height` are pixel counts and are doubled wherever a byte count
/// is passed on. The destination is viewed as a byte slice whose length comes
/// from `compute_ipred_buf_len`; the edge slice and the offset into it come
/// from `compute_topleft_slice`; the block is written from byte offset 0 of the
/// destination slice. `_angle`, `_max_width`, `_max_height`, `_bitdepth_max`,
/// `_topleft_off` and `_dst` are not read.
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of the byte length returned
///   by `compute_ipred_buf_len` for these arguments, and that memory must not
///   be accessed through any other reference during the call.
/// * `topleft` must satisfy whatever `compute_topleft_slice` requires of its
///   pointer argument.
/// * NOTE(review): the `Desktop64` token is forged without a runtime check, so
///   callers presumably must only dispatch here on CPUs that provide every
///   feature the token stands for - confirm against `archmage`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_smooth_v_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let w = width as usize;
    let h = height as usize;
    // SAFETY: see the NOTE(review) item in the `# Safety` section above.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let dst_len = compute_ipred_buf_len(stride as isize, w * 2, h);
    // SAFETY: the caller guarantees `dst_ptr` covers `dst_len` exclusive bytes.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    let (edge, edge_off) = compute_topleft_slice(topleft as *const u8, w * 2, h * 2);
    ipred_smooth_v_16bpc_inner(token, dst_bytes, 0, stride as isize, edge, edge_off, w, h);
}
/// Horizontal smooth intra predictor for 16-bit samples (`Desktop64` variant;
/// the body is plain scalar code).
///
/// Each output pixel is `(w_h * left + (256 - w_h) * right + 128) >> 8` where
/// `w_h = dav1d_sm_weights[width + x]`, `left` is the sample at byte offset
/// `tl_off - 2 * (y + 1)` and `right` the sample at `tl_off + 2 * width`.
///
/// * `dst`, `topleft` - byte buffers holding native-endian `u16` samples.
/// * `dst_base` - byte offset of the first output row inside `dst`.
/// * `stride` - signed byte distance between consecutive output rows.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_h_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let weights_hor = &dav1d_sm_weights[width..][..width];
    // One 16-bit sample at byte offset `at`, widened to a signed working type.
    let sample = |at: usize| -> i32 {
        u16::from_ne_bytes(topleft[at..at + 2].try_into().unwrap()) as i32
    };
    let right = sample(tl_off + width * 2);
    for row in 0..height {
        let row_base = (dst_base as isize + row as isize * stride) as usize;
        let left = sample(tl_off - (row + 1) * 2);
        for col in 0..width {
            let w_h = weights_hor[col] as i32;
            let value = ((w_h * left + (256 - w_h) * right + 128) >> 8) as u16;
            let out_at = row_base + col * 2;
            dst[out_at..out_at + 2].copy_from_slice(&value.to_ne_bytes());
        }
    }
}
/// C-ABI shim that runs `ipred_smooth_h_16bpc_inner` on raw pointers.
///
/// `width` and `height` are pixel counts and are doubled wherever a byte count
/// is passed on. The destination is viewed as a byte slice whose length comes
/// from `compute_ipred_buf_len`; the edge slice and the offset into it come
/// from `compute_topleft_slice`; the block is written from byte offset 0 of the
/// destination slice. `_angle`, `_max_width`, `_max_height`, `_bitdepth_max`,
/// `_topleft_off` and `_dst` are not read.
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of the byte length returned
///   by `compute_ipred_buf_len` for these arguments, and that memory must not
///   be accessed through any other reference during the call.
/// * `topleft` must satisfy whatever `compute_topleft_slice` requires of its
///   pointer argument.
/// * NOTE(review): the `Desktop64` token is forged without a runtime check, so
///   callers presumably must only dispatch here on CPUs that provide every
///   feature the token stands for - confirm against `archmage`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_smooth_h_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let w = width as usize;
    let h = height as usize;
    // SAFETY: see the NOTE(review) item in the `# Safety` section above.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let dst_len = compute_ipred_buf_len(stride as isize, w * 2, h);
    // SAFETY: the caller guarantees `dst_ptr` covers `dst_len` exclusive bytes.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    let (edge, edge_off) = compute_topleft_slice(topleft as *const u8, w * 2, h * 2);
    ipred_smooth_h_16bpc_inner(token, dst_bytes, 0, stride as isize, edge, edge_off, w, h);
}
/// Directional intra predictor for 16-bit samples that interpolates along the
/// edge samples stored after `tl_off` in `topleft`.
///
/// Steps:
/// 1. Unpack `angle`: bits 0-8 are the angle, bit 9 is the `is_sm` flag and any
///    bit from 10 upwards enables edge filtering.
/// 2. Build a local edge `top_px` either by 2x upsampling, by applying a 5-tap
///    smoothing kernel, or by copying the samples unchanged.
/// 3. For each row, step along the edge by `dx` (1/64 sample units) and write
///    the linear interpolation of the two neighbouring edge samples; once the
///    position reaches `max_base_x` the rest of the row is filled with
///    `top_px[max_base_x]`.
///
/// * `dst`, `topleft` - byte buffers holding native-endian `u16` samples.
/// * `dst_base` - byte offset of the first output row inside `dst`.
/// * `stride` - signed byte distance between consecutive output rows.
/// * `tl_off` - byte offset in `topleft`; the edge starts one sample after it.
/// * `bitdepth_max` - upper clamp applied to the upsampling filter output.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_z1_16bpc_inner(
_token: Desktop64,
dst: &mut [u8],
dst_base: usize,
stride: isize,
topleft: &[u8],
tl_off: usize,
width: usize,
height: usize,
angle: i32,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let width_i = width as i32;
let height_i = height as i32;
// Split the packed `angle` argument into its flag bits and the 9-bit angle.
let is_sm = (angle >> 9) & 1 != 0;
let enable_intra_edge_filter = (angle >> 10) != 0;
let angle = angle & 511;
// Per-row step along the edge, looked up by half the angle.
let mut dx = dav1d_dr_intra_derivative[(angle >> 1) as usize] as i32;
// `tl_off` is in bytes; `rd` below addresses `topleft` in whole samples.
let tl_pix = tl_off / 2;
let rd = |off: usize| -> u16 {
let b = off * 2;
u16::from_ne_bytes(topleft[b..b + 2].try_into().unwrap())
};
let upsample_above = enable_intra_edge_filter
&& (90 - angle) < 40
&& (width_i + height_i) <= (16 >> is_sm as i32);
// Local copy of the (possibly filtered or upsampled) edge.
let mut top_px = [0u16; 64 + 64];
let (max_base_x, base_inc);
if upsample_above {
// 2x upsampling: even outputs copy the input sample, odd outputs are the
// 4-tap filtered value between two inputs, clamped to [0, bitdepth_max].
let kernel: [i8; 4] = [-1, 9, 9, -1];
let hsz = width_i + height_i;
let in_off = tl_pix + 1;
// Input indices are clamped to [from, to - 1] relative to `in_off`; index -1
// is the sample at `tl_pix`.
let from = -1i32;
let to = width_i + std::cmp::min(width_i, height_i);
for i in 0..hsz - 1 {
top_px[(i * 2) as usize] =
rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
let mut s = 0i32;
for j in 0..4i32 {
s += rd(in_off.wrapping_add_signed((i + j - 1).clamp(from, to - 1) as isize))
as i32
* kernel[j as usize] as i32;
}
top_px[(i * 2 + 1) as usize] = ((s + 8) >> 4).clamp(0, bitdepth_max) as u16;
}
// The last input sample has no following neighbour, so only its copy is stored.
let i = hsz - 1;
top_px[(i * 2) as usize] = rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
// The edge now has twice the resolution, so both the step and the
// per-pixel increment double.
dx <<= 1;
max_base_x = (2 * (width_i + height_i) - 2) as usize;
base_inc = 2usize;
} else {
let filter_strength = if enable_intra_edge_filter {
get_filter_strength_simple(width_i + height_i, 90 - angle, is_sm)
} else {
0
};
if filter_strength != 0 {
// 5-tap smoothing kernels, selected by `filter_strength - 1`; each row sums to 16.
static KERNEL: [[u8; 5]; 3] = [[0, 4, 8, 4, 0], [0, 5, 6, 5, 0], [2, 4, 4, 4, 2]];
let in_off = tl_pix + 1;
let from = -1i32;
let to = width_i + std::cmp::min(width_i, height_i);
let lim_from = 0i32;
let lim_to = width_i + height_i;
let mut i = 0i32;
// With `lim_from == 0` this unfiltered prefix loop does not execute.
while i < std::cmp::min(width_i + height_i, lim_from) {
top_px[i as usize] = rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
i += 1;
}
// Filtered range: taps are centred on `i`, with input indices clamped to
// [from, to - 1].
while i < std::cmp::min(lim_to, width_i + height_i) {
let mut s = 0i32;
for j in 0..5i32 {
s += rd(in_off.wrapping_add_signed((i - 2 + j).clamp(from, to - 1) as isize))
as i32
* KERNEL[(filter_strength - 1) as usize][j as usize] as i32;
}
top_px[i as usize] = ((s + 8) >> 4) as u16;
i += 1;
}
// With `lim_to == width + height` this unfiltered suffix loop does not execute.
while i < width_i + height_i {
top_px[i as usize] = rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
i += 1;
}
max_base_x = (width_i + height_i - 1) as usize;
} else {
// No filtering: copy `width + min(width, height)` edge samples as they are.
for i in 0..width + std::cmp::min(width, height) {
top_px[i] = rd(tl_pix + 1 + i);
}
max_base_x = width + std::cmp::min(width, height) - 1;
}
base_inc = 1;
};
// Byte view of the local edge for the vector load macro.
let top_bytes: &[u8] = zerocopy::IntoBytes::as_bytes(&top_px[..]);
let top_bytes = top_bytes.flex();
let rounding = _mm256_set1_epi32(32);
for y in 0..height_i {
// Position along the edge in 1/64 sample units: the integer part selects the
// sample, the fraction (masked to even values) is the interpolation weight.
let xpos = (y + 1) * dx;
let frac = (xpos & 0x3e) as i32;
let inv_frac = 64 - frac;
let frac_vec = _mm256_set1_epi32(frac);
let inv_frac_vec = _mm256_set1_epi32(inv_frac);
let row_off = (dst_base as isize + y as isize * stride) as usize;
let base0 = (xpos >> 6) as usize;
let mut x = 0usize;
// Vector path only for the non-upsampled edge, where consecutive pixels read
// consecutive edge samples.
if base_inc == 1 {
// Process 8 pixels while all of them still read below `max_base_x`.
while x + 8 <= width && base0 + x + 8 < max_base_x {
let base = base0 + x;
let load0 = base * 2;
let load1 = (base + 1) * 2;
let t0 = loadu_128!((&top_bytes[load0..load0 + 16]), [u8; 16]);
let t1 = loadu_128!((&top_bytes[load1..load1 + 16]), [u8; 16]);
let t0_w = _mm256_cvtepu16_epi32(t0);
let t1_w = _mm256_cvtepu16_epi32(t1);
let prod0 = _mm256_mullo_epi32(t0_w, inv_frac_vec);
let prod1 = _mm256_mullo_epi32(t1_w, frac_vec);
let sum = _mm256_add_epi32(_mm256_add_epi32(prod0, prod1), rounding);
let result = _mm256_srai_epi32::<6>(sum);
// The pack works within each 128-bit half, so the low 64 bits of each half
// hold four results; join those two halves into one 8-pixel vector.
let packed = _mm256_packus_epi32(result, result);
let lo = _mm256_castsi256_si128(packed);
let hi = _mm256_extracti128_si256::<1>(packed);
let combined = _mm_unpacklo_epi64(lo, hi);
let store_off = row_off + x * 2;
storeu_128!((&mut dst[store_off..store_off + 16]), [u8; 16], combined);
x += 8;
}
}
// Scalar path: remaining pixels, and every pixel when the edge was upsampled.
while x < width {
let base = base0 + base_inc * x;
if base < max_base_x {
let t0 = top_px[base] as i32;
let t1 = top_px[base + 1] as i32;
let v = t0 * inv_frac + t1 * frac;
let off = row_off + x * 2;
dst[off..off + 2].copy_from_slice(&(((v + 32) >> 6) as u16).to_ne_bytes());
} else {
// Past the end of the edge: replicate the last edge sample to the end of the row.
let fill_val = top_px[max_base_x];
for xx in x..width {
let off = row_off + xx * 2;
dst[off..off + 2].copy_from_slice(&fill_val.to_ne_bytes());
}
break;
}
x += 1;
}
}
}
/// C-ABI shim that runs `ipred_z1_16bpc_inner` on raw pointers.
///
/// `width` and `height` are pixel counts and are doubled wherever a byte count
/// is passed on. The destination is viewed as a byte slice whose length comes
/// from `compute_ipred_buf_len`; the edge slice and the offset into it come
/// from `compute_topleft_slice`; the block is written from byte offset 0 of the
/// destination slice. `angle` and `_bitdepth_max` are forwarded unchanged
/// (despite its leading underscore, `_bitdepth_max` is used); `_max_width`,
/// `_max_height`, `_topleft_off` and `_dst` are not read.
///
/// # Safety
///
/// * `dst_ptr` must be valid for reads and writes of the byte length returned
///   by `compute_ipred_buf_len` for these arguments, and that memory must not
///   be accessed through any other reference during the call.
/// * `topleft` must satisfy whatever `compute_topleft_slice` requires of its
///   pointer argument.
/// * NOTE(review): the `Desktop64` token is forged without a runtime check, so
///   callers presumably must only dispatch here on CPUs that provide every
///   feature the token stands for - confirm against `archmage`.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_z1_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let w = width as usize;
    let h = height as usize;
    // SAFETY: see the NOTE(review) item in the `# Safety` section above.
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let dst_len = compute_ipred_buf_len(stride as isize, w * 2, h);
    // SAFETY: the caller guarantees `dst_ptr` covers `dst_len` exclusive bytes.
    let dst_bytes = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, dst_len) };
    let (edge, edge_off) = compute_topleft_slice(topleft as *const u8, w * 2, h * 2);
    ipred_z1_16bpc_inner(
        token,
        dst_bytes,
        0,
        stride as isize,
        edge,
        edge_off,
        w,
        h,
        angle as i32,
        _bitdepth_max as i32,
    );
}
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_z2_16bpc_inner(
_token: Desktop64,
dst: &mut [u8],
dst_base: usize,
stride: isize,
topleft: &[u8],
tl_off: usize,
width: usize,
height: usize,
angle: i32,
max_width: i32,
max_height: i32,
bitdepth_max: i32,
) {
let mut dst = dst.flex_mut();
let width_i = width as i32;
let height_i = height as i32;
let is_sm = (angle >> 9) & 1 != 0;
let enable_intra_edge_filter = (angle >> 10) != 0;
let angle = angle & 511;
let mut dy = dav1d_dr_intra_derivative[((angle - 90) >> 1) as usize] as i32;
let mut dx = dav1d_dr_intra_derivative[((180 - angle) >> 1) as usize] as i32;
let upsample_left = enable_intra_edge_filter
&& (180 - angle) < 40
&& (width_i + height_i) <= (16 >> is_sm as i32);
let upsample_above = enable_intra_edge_filter
&& (angle - 90) < 40
&& (width_i + height_i) <= (16 >> is_sm as i32);
let mut edge_px = [0u16; 64 + 64 + 1];
let edge_tl = 64usize;
let rd = |off: usize| -> u16 {
let b = off * 2;
u16::from_ne_bytes(topleft[b..b + 2].try_into().unwrap())
};
let tl_pix = tl_off / 2;
if upsample_above {
let kernel: [i8; 4] = [-1, 9, 9, -1];
let hsz = width_i + 1;
let in_off = tl_pix;
for i in 0..hsz - 1 {
edge_px[edge_tl + (i * 2) as usize] = rd(in_off + i.clamp(0, hsz - 1) as usize);
let mut s = 0i32;
for j in 0..4i32 {
s += rd(in_off + (i + j - 1).clamp(0, hsz - 1) as usize) as i32
* kernel[j as usize] as i32;
}
edge_px[edge_tl + (i * 2 + 1) as usize] = ((s + 8) >> 4).clamp(0, bitdepth_max) as u16;
}
let i = hsz - 1;
edge_px[edge_tl + (i * 2) as usize] = rd(in_off + i.clamp(0, hsz - 1) as usize);
dx <<= 1;
} else {
let filter_strength = if enable_intra_edge_filter {
get_filter_strength_simple(width_i + height_i, angle - 90, is_sm)
} else {
0
};
if filter_strength != 0 {
static KERNEL: [[u8; 5]; 3] = [[0, 4, 8, 4, 0], [0, 5, 6, 5, 0], [2, 4, 4, 4, 2]];
let in_off = tl_pix + 1;
let from = -1i32;
let to = width_i;
let lim_from = 0i32;
let lim_to = max_width;
let mut i = 0i32;
while i < std::cmp::min(width_i, lim_from) {
edge_px[edge_tl + 1 + i as usize] =
rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
i += 1;
}
while i < std::cmp::min(lim_to, width_i) {
let mut s = 0i32;
for j in 0..5i32 {
s += rd(in_off.wrapping_add_signed((i - 2 + j).clamp(from, to - 1) as isize))
as i32
* KERNEL[(filter_strength - 1) as usize][j as usize] as i32;
}
edge_px[edge_tl + 1 + i as usize] = ((s + 8) >> 4) as u16;
i += 1;
}
while i < width_i {
edge_px[edge_tl + 1 + i as usize] =
rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
i += 1;
}
} else {
for i in 0..width {
edge_px[edge_tl + 1 + i] = rd(tl_pix + 1 + i);
}
}
}
if upsample_left {
let kernel: [i8; 4] = [-1, 9, 9, -1];
let hsz = height_i + 1;
let in_off = tl_pix - height;
for i in 0..hsz - 1 {
edge_px[edge_tl - height * 2 + (i * 2) as usize] =
rd(in_off + i.clamp(0, hsz - 1) as usize);
let mut s = 0i32;
for j in 0..4i32 {
s += rd(in_off + (i + j - 1).clamp(0, hsz - 1) as usize) as i32
* kernel[j as usize] as i32;
}
edge_px[edge_tl - height * 2 + (i * 2 + 1) as usize] =
((s + 8) >> 4).clamp(0, bitdepth_max) as u16;
}
let i = hsz - 1;
edge_px[edge_tl - height * 2 + (i * 2) as usize] =
rd(in_off + i.clamp(0, hsz - 1) as usize);
dy <<= 1;
} else {
let filter_strength = if enable_intra_edge_filter {
get_filter_strength_simple(width_i + height_i, 180 - angle, is_sm)
} else {
0
};
if filter_strength != 0 {
static KERNEL: [[u8; 5]; 3] = [[0, 4, 8, 4, 0], [0, 5, 6, 5, 0], [2, 4, 4, 4, 2]];
let in_off = tl_pix - height;
let from = 0i32;
let to = height_i + 1;
let lim_from = height_i - max_height;
let lim_to = height_i;
let mut i = 0i32;
while i < std::cmp::min(height_i, lim_from) {
edge_px[edge_tl - height + i as usize] =
rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
i += 1;
}
while i < std::cmp::min(lim_to, height_i) {
let mut s = 0i32;
for j in 0..5i32 {
s += rd(in_off.wrapping_add_signed((i - 2 + j).clamp(from, to - 1) as isize))
as i32
* KERNEL[(filter_strength - 1) as usize][j as usize] as i32;
}
edge_px[edge_tl - height + i as usize] = ((s + 8) >> 4) as u16;
i += 1;
}
while i < height_i {
edge_px[edge_tl - height + i as usize] =
rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
i += 1;
}
} else {
for i in 0..height {
edge_px[edge_tl - height + i] = rd(tl_pix - height + i);
}
}
}
edge_px[edge_tl] = rd(tl_pix);
let edge_bytes: &[u8] = zerocopy::IntoBytes::as_bytes(edge_px.as_slice());
let edge = edge_bytes.flex();
let base_inc_x = 1 + upsample_above as usize;
let left = edge_tl - (1 + upsample_left as usize);
let rounding = _mm256_set1_epi32(32);
for y in 0..height_i {
let xpos = ((1 + upsample_above as i32) << 6) - dx * (y + 1);
let base_x0 = xpos >> 6;
let frac_x = (xpos & 0x3e) as i32;
let inv_frac_x = 64 - frac_x;
let row_off = (dst_base as isize + y as isize * stride) as usize;
let left_count = if base_x0 >= 0 {
0usize
} else {
let needed = (-base_x0) as usize;
needed.div_ceil(base_inc_x).min(width)
};
let mut x = 0usize;
while x < left_count {
let ypos = (y << (6 + upsample_left as i32)) - dy * (x as i32 + 1);
let base_y = ypos >> 6;
let frac_y = ypos & 0x3e;
let inv_frac_y = 64 - frac_y;
let l0_pix = left.wrapping_add_signed(-base_y as isize);
let l1_pix = left.wrapping_add_signed(-(base_y + 1) as isize);
let l0_off = l0_pix * 2;
let l1_off = l1_pix * 2;
let l0 = u16::from_ne_bytes(edge[l0_off..l0_off + 2].try_into().unwrap()) as i32;
let l1 = u16::from_ne_bytes(edge[l1_off..l1_off + 2].try_into().unwrap()) as i32;
let v = l0 * inv_frac_y + l1 * frac_y;
let off = row_off + x * 2;
dst[off..off + 2].copy_from_slice(&(((v + 32) >> 6) as u16).to_ne_bytes());
x += 1;
}
if base_inc_x == 1 {
while x + 8 <= width {
let base_x = (base_x0 + x as i32) as usize;
let load0 = (edge_tl + base_x) * 2;
let load1 = (edge_tl + base_x + 1) * 2;
if load1 + 16 > edge.len() {
break;
}
let t0 = loadu_128!((&edge[load0..load0 + 16]), [u8; 16]);
let t1 = loadu_128!((&edge[load1..load1 + 16]), [u8; 16]);
let t0_w = _mm256_cvtepu16_epi32(t0);
let t1_w = _mm256_cvtepu16_epi32(t1);
let frac_vec = _mm256_set1_epi32(frac_x);
let inv_frac_vec = _mm256_set1_epi32(inv_frac_x);
let prod0 = _mm256_mullo_epi32(t0_w, inv_frac_vec);
let prod1 = _mm256_mullo_epi32(t1_w, frac_vec);
let sum = _mm256_add_epi32(_mm256_add_epi32(prod0, prod1), rounding);
let result = _mm256_srai_epi32::<6>(sum);
let packed = _mm256_packus_epi32(result, result);
let lo = _mm256_castsi256_si128(packed);
let hi = _mm256_extracti128_si256::<1>(packed);
let combined = _mm_unpacklo_epi64(lo, hi);
let store_off = row_off + x * 2;
storeu_128!((&mut dst[store_off..store_off + 16]), [u8; 16], combined);
x += 8;
}
}
while x < width {
let base_x = (base_x0 + (base_inc_x * x) as i32) as usize;
let t0_off = (edge_tl + base_x) * 2;
let t1_off = (edge_tl + base_x + 1) * 2;
if t1_off + 2 > edge.len() {
break;
}
let t0 = u16::from_ne_bytes(edge[t0_off..t0_off + 2].try_into().unwrap()) as i32;
let t1 = u16::from_ne_bytes(edge[t1_off..t1_off + 2].try_into().unwrap()) as i32;
let v = t0 * inv_frac_x + t1 * frac_x;
let off = row_off + x * 2;
dst[off..off + 2].copy_from_slice(&(((v + 32) >> 6) as u16).to_ne_bytes());
x += 1;
}
}
}
/// C-ABI entry point for the 16 bpc Z2 directional intra predictor (AVX2 build).
///
/// Turns the raw `dst_ptr` / `topleft` pointers into byte slices and forwards
/// everything to [`ipred_z2_16bpc_inner`] with a destination base offset of 0.
/// `width` is doubled where a byte count is needed (two bytes per pixel).
/// `_topleft_off` and `_dst` are not used by this shim.
///
/// # Safety
///
/// NOTE(review): the exact contract lives in `compute_ipred_buf_len` and
/// `compute_topleft_slice`, which are defined elsewhere — confirm against them.
/// * `dst_ptr` must be valid for reads and writes of the byte length that
///   `compute_ipred_buf_len` derives from `stride`, `width * 2` and `height`.
/// * `topleft` must satisfy whatever `compute_topleft_slice` requires of it.
/// * The running CPU must provide the features `Desktop64` stands for; the
///   token is forged here rather than detected.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_z2_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    angle: c_int,
    max_width: c_int,
    max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let cols = width as usize;
    let rows = height as usize;
    let byte_stride = stride as isize;

    // SAFETY: the caller vouches for CPU support by calling this
    // `#[target_feature]` entry point (see `# Safety`).
    let token = unsafe { Desktop64::forge_token_dangerously() };

    let dst_bytes = dst_ptr as *mut u8;
    let buf_len = compute_ipred_buf_len(byte_stride, cols * 2, rows);
    // SAFETY: relies on the caller's guarantee that `dst_ptr` covers `buf_len`
    // bytes (see `# Safety`).
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_bytes, buf_len) };

    let (tl_sl, tl_off) = compute_topleft_slice(topleft as *const u8, cols * 2, rows * 2);

    ipred_z2_16bpc_inner(
        token,
        dst_sl,
        0,
        byte_stride,
        tl_sl,
        tl_off,
        cols,
        rows,
        angle as i32,
        max_width as i32,
        max_height as i32,
        // Despite the leading underscore this value is forwarded to the kernel.
        _bitdepth_max as i32,
    );
}
/// Z3 directional intra prediction for 16 bpc pixels (scalar body).
///
/// Predicts a `width` x `height` block from the edge samples stored *before*
/// the top-left pixel in `topleft`, writing native-endian `u16` pixels to `dst`.
///
/// * `dst` / `dst_base` / `stride` — destination bytes, byte offset of the
///   block's first pixel, and byte distance between rows.
/// * `topleft` / `tl_off` — edge buffer as bytes and the byte offset of the
///   top-left pixel inside it (`tl_off / 2` is its pixel index).
/// * `angle` — packed value: bits 0..=8 are the angle, bit 9 is the `is_sm`
///   flag, and any bit from 10 upwards enables intra edge filtering.
/// * `bitdepth_max` — upper clamp applied to upsampled edge samples.
///
/// Depending on the flags the edge is either upsampled 2x, smoothed with a
/// 5-tap kernel, or read directly from `topleft`.
///
/// # Panics
///
/// Panics if any computed index falls outside `topleft`, `dst`, the derivative
/// table or the 128-entry scratch edge.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_z3_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
    angle: i32,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let width_i = width as i32;
    let height_i = height as i32;

    // Unpack the flag bits, then keep only the 9-bit angle.
    let is_sm = (angle >> 9) & 1 != 0;
    let enable_intra_edge_filter = (angle >> 10) != 0;
    let angle = angle & 511;
    let mut dy = dav1d_dr_intra_derivative[((270 - angle) >> 1) as usize] as usize;

    // Pixel index of the top-left sample and a pixel-indexed reader for `topleft`.
    let tl_pix = tl_off / 2;
    let rd = |off: usize| -> u16 {
        let b = off * 2;
        u16::from_ne_bytes(topleft[b..b + 2].try_into().unwrap())
    };

    let span = width_i + height_i;
    let upsample_left =
        enable_intra_edge_filter && (angle - 180) < 40 && span <= (16 >> is_sm as i32);
    // The smoothing filter is only considered when upsampling is not used.
    let filter_strength = if enable_intra_edge_filter && !upsample_left {
        get_filter_strength_simple(span, angle - 180, is_sm)
    } else {
        0
    };
    let use_left_px = upsample_left || filter_strength != 0;

    // Scratch edge, filled only when the edge is upsampled or filtered.
    let mut left_px = [0u16; 64 + 64];
    if use_left_px {
        let in_off = tl_pix - (width + height);
        let lo = std::cmp::max(width_i - height_i, 0);
        let hi = span;
        // Edge sample `idx` relative to `in_off`, with the index clamped to `lo..=hi`.
        let edge = |idx: i32| -> i32 {
            rd(in_off.wrapping_add_signed(idx.clamp(lo, hi) as isize)) as i32
        };

        if upsample_left {
            // Even slots copy the source sample; odd slots hold the 4-tap
            // interpolation between neighbours.
            const KERNEL: [i32; 4] = [-1, 9, 9, -1];
            for i in 0..span - 1 {
                left_px[(i * 2) as usize] = edge(i) as u16;
                let s: i32 = (0i32..4)
                    .map(|j| edge(i + j - 1) * KERNEL[j as usize])
                    .sum();
                left_px[(i * 2 + 1) as usize] = ((s + 8) >> 4).clamp(0, bitdepth_max) as u16;
            }
            let last = span - 1;
            left_px[(last * 2) as usize] = edge(last) as u16;
            // The edge now has twice the resolution, so the step doubles too.
            dy <<= 1;
        } else {
            // Every one of the `span` samples is filtered: the unfiltered
            // prologue/epilogue ranges of the generic edge filter are empty here.
            const KERNEL: [[u8; 5]; 3] = [[0, 4, 8, 4, 0], [0, 5, 6, 5, 0], [2, 4, 4, 4, 2]];
            let taps = KERNEL[(filter_strength - 1) as usize];
            for i in 0..span {
                let s: i32 = (0i32..5)
                    .map(|j| edge(i - 2 + j) * taps[j as usize] as i32)
                    .sum();
                left_px[i as usize] = ((s + 8) >> 4) as u16;
            }
        }
    }

    // `left_off` is the scratch index matching base 0; `max_base_y` is the first
    // base for which the remaining rows are filled with a constant.
    let base_inc = 1 + upsample_left as usize;
    let (left_off, max_base_y) = if upsample_left {
        let last = (2 * span - 2) as usize;
        (last, last)
    } else if filter_strength != 0 {
        let last = (span - 1) as usize;
        (last, last)
    } else {
        (0, height + std::cmp::min(width, height) - 1)
    };

    // Byte offset of pixel (`row`, `col`) inside `dst`.
    let pixel_off = |row: i32, col: usize| -> usize {
        (dst_base as isize + row as isize * stride) as usize + col * 2
    };

    for x in 0..width {
        let ypos = dy * (x + 1);
        // Even part of the 6-bit fractional position.
        let frac = (ypos & 0x3e) as i32;
        let inv_frac = 64 - frac;
        for y in 0..height_i {
            let base = (ypos >> 6) + base_inc * y as usize;
            if base >= max_base_y {
                // Past the usable edge: fill the rest of this column and move on.
                let fill_val = if use_left_px {
                    left_px[left_off - max_base_y]
                } else {
                    rd(tl_pix - max_base_y - 1)
                };
                let fill = fill_val.to_ne_bytes();
                for yy in y..height_i {
                    let off = pixel_off(yy, x);
                    dst[off..off + 2].copy_from_slice(&fill);
                }
                break;
            }
            let (l0, l1) = if use_left_px {
                (
                    left_px[left_off - base] as i32,
                    left_px[left_off - base - 1] as i32,
                )
            } else {
                (rd(tl_pix - base - 1) as i32, rd(tl_pix - base - 2) as i32)
            };
            // Linear blend of two neighbouring edge samples, rounded.
            let v = l0 * inv_frac + l1 * frac;
            let off = pixel_off(y, x);
            dst[off..off + 2].copy_from_slice(&(((v + 32) >> 6) as u16).to_ne_bytes());
        }
    }
}
/// C-ABI entry point for the 16 bpc Z3 directional intra predictor (AVX2 build).
///
/// Turns the raw `dst_ptr` / `topleft` pointers into byte slices and forwards
/// to [`ipred_z3_16bpc_inner`] with a destination base offset of 0.
/// `_max_width`, `_max_height`, `_topleft_off` and `_dst` are not used.
///
/// NOTE(review): `ipred_z3_16bpc_inner` can read as far back as
/// `width + height` pixels before the top-left sample; confirm that
/// `compute_topleft_slice(ptr, width * 2, height * 2)` exposes that much.
///
/// # Safety
///
/// NOTE(review): the exact contract lives in `compute_ipred_buf_len` and
/// `compute_topleft_slice`, which are defined elsewhere — confirm against them.
/// * `dst_ptr` must be valid for reads and writes of the byte length that
///   `compute_ipred_buf_len` derives from `stride`, `width * 2` and `height`.
/// * `topleft` must satisfy whatever `compute_topleft_slice` requires of it.
/// * The running CPU must provide the features `Desktop64` stands for; the
///   token is forged here rather than detected.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_z3_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let cols = width as usize;
    let rows = height as usize;
    let byte_stride = stride as isize;

    // SAFETY: the caller vouches for CPU support by calling this
    // `#[target_feature]` entry point (see `# Safety`).
    let token = unsafe { Desktop64::forge_token_dangerously() };

    let dst_bytes = dst_ptr as *mut u8;
    let buf_len = compute_ipred_buf_len(byte_stride, cols * 2, rows);
    // SAFETY: relies on the caller's guarantee that `dst_ptr` covers `buf_len`
    // bytes (see `# Safety`).
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_bytes, buf_len) };

    let (tl_sl, tl_off) = compute_topleft_slice(topleft as *const u8, cols * 2, rows * 2);

    ipred_z3_16bpc_inner(
        token,
        dst_sl,
        0,
        byte_stride,
        tl_sl,
        tl_off,
        cols,
        rows,
        angle as i32,
        // Despite the leading underscore this value is forwarded to the kernel.
        _bitdepth_max as i32,
    );
}
/// Filter intra prediction for 16 bpc pixels (scalar body).
///
/// The block is produced in 4x2 tiles, left to right, two rows at a time.
/// Each tile is computed from seven neighbours: `p0` (top-left), `p1..=p4`
/// (the four pixels above) and `p5`, `p6` (the pixels left of the tile's two
/// rows). For the first row pair and first tile column these come from
/// `topleft`; otherwise they come from pixels already written to `dst`.
///
/// * `dst` / `dst_base` / `stride` — destination bytes, byte offset of the
///   block's first pixel, and byte distance between rows.
/// * `topleft` / `tl_off` / `topleft_off` — edge buffer as bytes, a byte offset
///   into it, and a pixel offset that is doubled and added to `tl_off`.
/// * `width` — rounded down to a multiple of 4 before use.
/// * `filt_idx` — masked with 511 and used to pick a row of
///   `dav1d_filter_intra_taps`.
/// * `bitdepth_max` — upper clamp for every output pixel.
///
/// # Panics
///
/// Panics if a computed index falls outside `topleft`, `dst` or the tap table.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_filter_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
    filt_idx: i32,
    bitdepth_max: i32,
    topleft_off: usize,
) {
    // Native-endian u16 at byte offset `$at` of `$buf`, widened to i32.
    macro_rules! px {
        ($buf:ident, $at:expr) => {{
            let at = $at;
            u16::from_ne_bytes($buf[at..at + 2].try_into().unwrap()) as i32
        }};
    }

    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let width = (width / 4) * 4;
    let filt_idx = (filt_idx as usize) & 511;
    let filter = &dav1d_filter_intra_taps[filt_idx];
    let flt = filter.as_slice();

    for y in (0..height).step_by(2) {
        // The top-left neighbour for this row pair sits `y` pixels before the
        // block's own top-left sample.
        let cur_tl_off = topleft_off - y;
        let mut tl_pixel = px!(topleft, tl_off.wrapping_add(cur_tl_off * 2));
        let row0_off = (dst_base as isize + y as isize * stride) as usize;
        let row1_off = (dst_base as isize + (y + 1) as isize * stride) as usize;

        for x in (0..width).step_by(4) {
            // Four pixels above the tile: edge buffer for the first row pair,
            // previously written output row otherwise.
            let [p1, p2, p3, p4] = if y == 0 {
                let top_base = tl_off.wrapping_add((topleft_off + 1 + x) * 2);
                [
                    px!(topleft, top_base),
                    px!(topleft, top_base + 2),
                    px!(topleft, top_base + 4),
                    px!(topleft, top_base + 6),
                ]
            } else {
                let top_row = (dst_base as isize + (y as isize - 1) * stride) as usize;
                let tb = top_row + x * 2;
                [
                    px!(dst, tb),
                    px!(dst, tb + 2),
                    px!(dst, tb + 4),
                    px!(dst, tb + 6),
                ]
            };

            // Two pixels left of the tile: edge buffer for the first tile,
            // last column of the previous tile otherwise.
            let (p5, p6) = if x == 0 {
                let left_base = tl_off.wrapping_add(cur_tl_off.wrapping_sub(1) * 2);
                let left_base2 = tl_off.wrapping_add(cur_tl_off.wrapping_sub(2) * 2);
                (px!(topleft, left_base), px!(topleft, left_base2))
            } else {
                let prev_col = (x - 1) * 2;
                (px!(dst, row0_off + prev_col), px!(dst, row1_off + prev_col))
            };

            let p0 = tl_pixel;
            let p = [p0, p1, p2, p3, p4, p5, p6];

            // Eight outputs per tile: four for the upper row, then four for the
            // lower row, each using the next `FLT_INCR`-spaced tap group.
            let mut flt_offset = 0;
            for row_off in [row0_off, row1_off] {
                for xx in 0..4 {
                    let acc = filter_fn(&flt[flt_offset..], p);
                    let val = ((acc + 8) >> 4).clamp(0, bitdepth_max) as u16;
                    let off = row_off + (x + xx) * 2;
                    dst[off..off + 2].copy_from_slice(&val.to_ne_bytes());
                    flt_offset += FLT_INCR;
                }
            }

            // The rightmost top pixel becomes the next tile's top-left neighbour.
            tl_pixel = p4;
        }
    }
}
/// C-ABI entry point for the 16 bpc filter intra predictor (AVX2 build).
///
/// Turns the raw `dst_ptr` / `topleft` pointers into byte slices and forwards
/// to [`ipred_filter_16bpc_inner`] with a destination base offset of 0.
/// `_max_width`, `_max_height` and `_dst` are not used.
///
/// NOTE(review): both the `tl_off` returned by `compute_topleft_slice` and the
/// caller's `topleft_off` are handed to the inner kernel, which adds
/// `topleft_off * 2` on top of `tl_off`. Confirm that `topleft` is not already
/// offset by `topleft_off` on the caller's side.
///
/// # Safety
///
/// NOTE(review): the exact contract lives in `compute_ipred_buf_len` and
/// `compute_topleft_slice`, which are defined elsewhere — confirm against them.
/// * `dst_ptr` must be valid for reads and writes of the byte length that
///   `compute_ipred_buf_len` derives from `stride`, `width * 2` and `height`.
/// * `topleft` must satisfy whatever `compute_topleft_slice` requires of it.
/// * The running CPU must provide the features `Desktop64` stands for; the
///   token is forged here rather than detected.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_filter_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    filt_idx: c_int,
    _max_width: c_int,
    _max_height: c_int,
    bitdepth_max: c_int,
    topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let cols = width as usize;
    let rows = height as usize;
    let byte_stride = stride as isize;

    // SAFETY: the caller vouches for CPU support by calling this
    // `#[target_feature]` entry point (see `# Safety`).
    let token = unsafe { Desktop64::forge_token_dangerously() };

    let dst_bytes = dst_ptr as *mut u8;
    let buf_len = compute_ipred_buf_len(byte_stride, cols * 2, rows);
    // SAFETY: relies on the caller's guarantee that `dst_ptr` covers `buf_len`
    // bytes (see `# Safety`).
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_bytes, buf_len) };

    let (tl_sl, tl_off) = compute_topleft_slice(topleft as *const u8, cols * 2, rows * 2);

    ipred_filter_16bpc_inner(
        token,
        dst_sl,
        0,
        byte_stride,
        tl_sl,
        tl_off,
        cols,
        rows,
        filt_idx as i32,
        bitdepth_max as i32,
        topleft_off,
    );
}
use crate::include::common::bitdepth::BitDepth;
use crate::src::internal::SCRATCH_EDGE_LEN;
/// Safe dispatcher from an intra prediction `mode` to the matching SIMD kernel.
///
/// Summons an AVX2 token (returning `false` immediately when that fails) and,
/// optionally, an AVX-512 token. The destination is then accessed through
/// `with_pixel_guard_mut` and the kernel selected by `(BD::BPC, mode)` is run
/// on it:
///
/// | mode | kernel family | AVX-512 variant used when available |
/// |------|---------------|-------------------------------------|
/// | 0    | `dc`          | yes |
/// | 1    | `v`           | yes |
/// | 2    | `h`           | yes |
/// | 3    | `dc_left`     | yes |
/// | 4    | `dc_top`      | yes |
/// | 5    | `dc_128`      | yes |
/// | 6    | `z1`          | no  |
/// | 7    | `z2`          | no  |
/// | 8    | `z3`          | no  |
/// | 9    | `smooth`      | yes |
/// | 10   | `smooth_v`    | yes |
/// | 11   | `smooth_h`    | yes |
/// | 12   | `paeth`       | yes |
/// | 13   | `filter`      | no  |
///
/// For 16 bpc the top-left offset handed to the kernels is `topleft_off * 2`
/// (a byte offset). The `filter` kernels instead get a byte offset of 0 plus
/// the pixel-based `topleft_off`, and receive `angle` as their filter index.
///
/// Returns the closure's result via `with_pixel_guard_mut`: `true` after a
/// kernel ran, `false` when no arm matches `mode` (no kernel is run then).
/// NOTE(review): assumes `with_pixel_guard_mut` passes the closure's value
/// through unchanged — confirm.
#[cfg(target_arch = "x86_64")]
pub fn intra_pred_dispatch<BD: BitDepth>(
    mode: usize,
    dst: PicOffset,
    topleft: &[BD::Pixel; SCRATCH_EDGE_LEN],
    topleft_off: usize,
    width: c_int,
    height: c_int,
    angle: c_int,
    max_width: c_int,
    max_height: c_int,
    bd: BD,
) -> bool {
    use crate::include::common::bitdepth::BPC;
    use zerocopy::IntoBytes;

    let Some(token) = crate::src::cpu::summon_avx2() else {
        return false;
    };
    // This function only exists on x86_64, so no non-x86 fallback is needed.
    let avx512_token = crate::src::cpu::summon_avx512();

    let w = width as usize;
    let h = height as usize;
    let bd_c = bd.into_c();
    let tl_bytes: &[u8] = topleft.as_bytes();

    crate::include::dav1d::picture::with_pixel_guard_mut::<BD, _>(
        &dst,
        w,
        h,
        |dst_bytes, dst_base_bytes, byte_stride| {
            // Runs an edge-based kernel, preferring its AVX-512 variant when a
            // token for it was summoned. Both variants share one argument list;
            // `$tl_off` is the top-left offset in the unit the kernel expects.
            macro_rules! edge_pred {
                ($avx512_fn:ident, $avx2_fn:ident, $tl_off:expr) => {{
                    let tl_off = $tl_off;
                    if let Some(t512) = avx512_token {
                        $avx512_fn(
                            t512,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            tl_off,
                            w,
                            h,
                        )
                    } else {
                        $avx2_fn(
                            token,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            tl_off,
                            w,
                            h,
                        )
                    }
                }};
            }

            match (BD::BPC, mode) {
                // ---- 8 bpc: offsets are already in bytes ----
                (BPC::BPC8, 0) => {
                    edge_pred!(ipred_dc_8bpc_avx512_inner, ipred_dc_8bpc_inner, topleft_off)
                }
                (BPC::BPC8, 1) => {
                    edge_pred!(ipred_v_8bpc_avx512_inner, ipred_v_8bpc_inner, topleft_off)
                }
                (BPC::BPC8, 2) => {
                    edge_pred!(ipred_h_8bpc_avx512_inner, ipred_h_8bpc_inner, topleft_off)
                }
                (BPC::BPC8, 3) => edge_pred!(
                    ipred_dc_left_8bpc_avx512_inner,
                    ipred_dc_left_8bpc_inner,
                    topleft_off
                ),
                (BPC::BPC8, 4) => edge_pred!(
                    ipred_dc_top_8bpc_avx512_inner,
                    ipred_dc_top_8bpc_inner,
                    topleft_off
                ),
                (BPC::BPC8, 5) => {
                    // DC_128 needs no edge pixels, hence the shorter argument list.
                    if let Some(t512) = avx512_token {
                        ipred_dc_128_8bpc_avx512_inner(
                            t512,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            w,
                            h,
                        )
                    } else {
                        ipred_dc_128_8bpc_inner(token, dst_bytes, dst_base_bytes, byte_stride, w, h)
                    }
                }
                (BPC::BPC8, 6) => ipred_z1_8bpc_inner(
                    token,
                    dst_bytes,
                    dst_base_bytes,
                    byte_stride,
                    tl_bytes,
                    topleft_off,
                    w,
                    h,
                    angle as i32,
                ),
                (BPC::BPC8, 7) => ipred_z2_8bpc_inner(
                    token,
                    dst_bytes,
                    dst_base_bytes,
                    byte_stride,
                    tl_bytes,
                    topleft_off,
                    w,
                    h,
                    angle as i32,
                    max_width,
                    max_height,
                ),
                (BPC::BPC8, 8) => ipred_z3_8bpc_inner(
                    token,
                    dst_bytes,
                    dst_base_bytes,
                    byte_stride,
                    tl_bytes,
                    topleft_off,
                    w,
                    h,
                    angle as i32,
                ),
                (BPC::BPC8, 9) => edge_pred!(
                    ipred_smooth_8bpc_avx512_inner,
                    ipred_smooth_8bpc_inner,
                    topleft_off
                ),
                (BPC::BPC8, 10) => edge_pred!(
                    ipred_smooth_v_8bpc_avx512_inner,
                    ipred_smooth_v_8bpc_inner,
                    topleft_off
                ),
                (BPC::BPC8, 11) => edge_pred!(
                    ipred_smooth_h_8bpc_avx512_inner,
                    ipred_smooth_h_8bpc_inner,
                    topleft_off
                ),
                (BPC::BPC8, 12) => edge_pred!(
                    ipred_paeth_8bpc_avx512_inner,
                    ipred_paeth_8bpc_inner,
                    topleft_off
                ),
                (BPC::BPC8, 13) => ipred_filter_8bpc_inner(
                    token,
                    dst_bytes,
                    dst_base_bytes,
                    byte_stride,
                    tl_bytes,
                    0,
                    w,
                    h,
                    // The filter kernels take `angle` as their filter index.
                    angle as i32,
                    topleft_off,
                ),

                // ---- 16 bpc: pixel offsets become byte offsets (x2) ----
                (BPC::BPC16, 0) => edge_pred!(
                    ipred_dc_16bpc_avx512_inner,
                    ipred_dc_16bpc_inner,
                    topleft_off * 2
                ),
                (BPC::BPC16, 1) => edge_pred!(
                    ipred_v_16bpc_avx512_inner,
                    ipred_v_16bpc_inner,
                    topleft_off * 2
                ),
                (BPC::BPC16, 2) => edge_pred!(
                    ipred_h_16bpc_avx512_inner,
                    ipred_h_16bpc_inner,
                    topleft_off * 2
                ),
                (BPC::BPC16, 3) => edge_pred!(
                    ipred_dc_left_16bpc_avx512_inner,
                    ipred_dc_left_16bpc_inner,
                    topleft_off * 2
                ),
                (BPC::BPC16, 4) => edge_pred!(
                    ipred_dc_top_16bpc_avx512_inner,
                    ipred_dc_top_16bpc_inner,
                    topleft_off * 2
                ),
                (BPC::BPC16, 5) => {
                    // DC_128 needs no edge pixels but does need the bit depth.
                    if let Some(t512) = avx512_token {
                        ipred_dc_128_16bpc_avx512_inner(
                            t512,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            w,
                            h,
                            bd_c as i32,
                        )
                    } else {
                        ipred_dc_128_16bpc_inner(
                            token,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            w,
                            h,
                            bd_c as i32,
                        )
                    }
                }
                (BPC::BPC16, 6) => {
                    let tl_off_bytes = topleft_off * 2;
                    ipred_z1_16bpc_inner(
                        token,
                        dst_bytes,
                        dst_base_bytes,
                        byte_stride,
                        tl_bytes,
                        tl_off_bytes,
                        w,
                        h,
                        angle as i32,
                        bd_c,
                    )
                }
                (BPC::BPC16, 7) => {
                    let tl_off_bytes = topleft_off * 2;
                    ipred_z2_16bpc_inner(
                        token,
                        dst_bytes,
                        dst_base_bytes,
                        byte_stride,
                        tl_bytes,
                        tl_off_bytes,
                        w,
                        h,
                        angle as i32,
                        max_width,
                        max_height,
                        bd_c,
                    )
                }
                (BPC::BPC16, 8) => {
                    let tl_off_bytes = topleft_off * 2;
                    ipred_z3_16bpc_inner(
                        token,
                        dst_bytes,
                        dst_base_bytes,
                        byte_stride,
                        tl_bytes,
                        tl_off_bytes,
                        w,
                        h,
                        angle as i32,
                        bd_c,
                    )
                }
                (BPC::BPC16, 9) => edge_pred!(
                    ipred_smooth_16bpc_avx512_inner,
                    ipred_smooth_16bpc_inner,
                    topleft_off * 2
                ),
                (BPC::BPC16, 10) => edge_pred!(
                    ipred_smooth_v_16bpc_avx512_inner,
                    ipred_smooth_v_16bpc_inner,
                    topleft_off * 2
                ),
                (BPC::BPC16, 11) => edge_pred!(
                    ipred_smooth_h_16bpc_avx512_inner,
                    ipred_smooth_h_16bpc_inner,
                    topleft_off * 2
                ),
                (BPC::BPC16, 12) => edge_pred!(
                    ipred_paeth_16bpc_avx512_inner,
                    ipred_paeth_16bpc_inner,
                    topleft_off * 2
                ),
                (BPC::BPC16, 13) => ipred_filter_16bpc_inner(
                    token,
                    dst_bytes,
                    dst_base_bytes,
                    byte_stride,
                    tl_bytes,
                    0,
                    w,
                    h,
                    // The filter kernels take `angle` as their filter index.
                    angle as i32,
                    bd_c as i32,
                    topleft_off,
                ),

                // No kernel is available for this mode.
                _ => return false,
            }
            true
        },
    )
}