#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#![cfg(feature = "x86")]
#![allow(
clippy::wildcard_imports,
clippy::cast_possible_truncation,
clippy::too_many_arguments,
clippy::inline_always,
clippy::doc_markdown
)]
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
/// A 256-bit value that can be viewed either as a single AVX register or as
/// sixteen signed 16-bit lanes.
///
/// The conversion routines in this file write the `mm256` field and then read
/// the result back lane by lane through `array`.
///
/// `#[repr(C)]` is required for that to be well defined: only the C
/// representation guarantees that every field of a union starts at offset 0,
/// the default representation leaves the layout unspecified.
#[repr(C)]
pub union YmmRegister
{
    // SIMD view, the form produced by the AVX2 intrinsics.
    mm256: __m256i,
    // Per-lane view of the same 32 bytes, used for scalar extraction.
    array: [i16; 16],
}
/// Convert 16 YCbCr samples to interleaved RGB using AVX2.
///
/// Safe entry point that forwards every argument unchanged to
/// `ycbcr_to_rgb_avx2_1`, which writes 48 bytes into `out` starting at
/// `*offset` and advances `*offset` by 48.
///
/// # Panics
/// Panics if `out` does not hold 48 bytes starting at `*offset`.
#[inline(always)]
pub fn ycbcr_to_rgb_avx2(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize,
)
{
    // SAFETY: the callee is compiled with AVX/AVX2 enabled and no CPU feature
    // detection happens here. NOTE(review): presumably the caller only selects
    // this function after verifying AVX2 support; confirm at the call site.
    unsafe { ycbcr_to_rgb_avx2_1(y, cb, cr, out, offset) }
}
/// AVX2 implementation behind `ycbcr_to_rgb_avx2`.
///
/// Converts the 16 samples with `ycbcr_to_rgb_baseline` (which clamps every
/// channel to `0..=255`) and stores them as 16 interleaved `R, G, B` triplets
/// in `out[*offset..*offset + 48]`, then advances `*offset` by 48.
///
/// # Panics
/// Panics if `out` does not hold 48 bytes starting at `*offset`.
///
/// # Safety
/// The executing CPU must support AVX and AVX2.
#[inline]
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx")]
unsafe fn ycbcr_to_rgb_avx2_1(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize,
)
{
    // Bounds are checked once, before any conversion work is done.
    let end = *offset + 48;
    let window: &mut [u8; 48] = out
        .get_mut(*offset..end)
        .expect("Slice to small cannot write")
        .try_into()
        .unwrap();

    let (red, green, blue) = ycbcr_to_rgb_baseline(y, cb, cr);
    // Take the lane view of each register once instead of on every iteration.
    let (reds, greens, blues) = (red.array, green.array, blue.array);

    let channels = reds.iter().zip(greens.iter()).zip(blues.iter());
    for (pixel, ((&r, &g), &b)) in window.chunks_exact_mut(3).zip(channels)
    {
        // The lanes were already clamped to 0..=255, so the casts are lossless.
        pixel[0] = r as u8;
        pixel[1] = g as u8;
        pixel[2] = b as u8;
    }

    *offset = end;
}
/// Convert 16 YCbCr samples to clamped R, G and B lanes.
///
/// All arithmetic is wrapping 16-bit fixed point, with `cb` and `cr` first
/// re-centred by subtracting 128:
///
/// * `R = Y + ((45 * Cr) >> 5)`
/// * `G = Y - ((11 * Cb + 23 * Cr) >> 5)`
/// * `B = Y + ((113 * Cb) >> 6)`
///
/// Each result is passed through `clamp_avx` and returned wrapped in a
/// `YmmRegister`, in `(r, g, b)` order.
///
/// # Safety
/// The executing CPU must support AVX and AVX2.
#[inline]
#[target_feature(enable = "avx2")]
#[target_feature(enable = "avx")]
unsafe fn ycbcr_to_rgb_baseline(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16],
) -> (YmmRegister, YmmRegister, YmmRegister)
{
    let bias = _mm256_set1_epi16(128);

    // Unaligned loads: the input arrays carry no 32-byte alignment guarantee.
    let luma = _mm256_loadu_si256(y.as_ptr().cast());
    let blue_diff = _mm256_sub_epi16(_mm256_loadu_si256(cb.as_ptr().cast()), bias);
    let red_diff = _mm256_sub_epi16(_mm256_loadu_si256(cr.as_ptr().cast()), bias);

    // Red: Y + 45/32 * Cr
    let red_term = _mm256_srai_epi16::<5>(_mm256_mullo_epi16(_mm256_set1_epi16(45), red_diff));
    let red = _mm256_add_epi16(luma, red_term);

    // Green: Y - (11 * Cb + 23 * Cr) / 32
    let green_sum = _mm256_add_epi16(
        _mm256_mullo_epi16(_mm256_set1_epi16(11), blue_diff),
        _mm256_mullo_epi16(_mm256_set1_epi16(23), red_diff),
    );
    let green = _mm256_sub_epi16(luma, _mm256_srai_epi16::<5>(green_sum));

    // Blue: Y + 113/64 * Cb
    let blue_term = _mm256_srai_epi16::<6>(_mm256_mullo_epi16(_mm256_set1_epi16(113), blue_diff));
    let blue = _mm256_add_epi16(blue_term, luma);

    (
        YmmRegister { mm256: clamp_avx(red) },
        YmmRegister { mm256: clamp_avx(green) },
        YmmRegister { mm256: clamp_avx(blue) },
    )
}
/// Convert 16 YCbCr samples to R, G and B lanes without clamping.
///
/// Uses wrapping 16-bit fixed-point arithmetic, with `cb` and `cr` first
/// re-centred by subtracting 128:
///
/// * `R = Y + ((45 * Cr) >> 5)`
/// * `G = Y - ((11 * Cb + 23 * Cr) >> 5)`
/// * `B = Y + ((113 * Cb) >> 6)`
///
/// The returned lanes may lie outside `0..=255`; the callers in this file
/// narrow them with a saturating pack.
///
/// # Safety
/// The executing CPU must support AVX2.
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn ycbcr_to_rgb_baseline_no_clamp(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16],
) -> (__m256i, __m256i, __m256i)
{
    let bias = _mm256_set1_epi16(128);

    // Unaligned loads: the input arrays carry no 32-byte alignment guarantee.
    let luma = _mm256_loadu_si256(y.as_ptr().cast());
    let blue_diff = _mm256_sub_epi16(_mm256_loadu_si256(cb.as_ptr().cast()), bias);
    let red_diff = _mm256_sub_epi16(_mm256_loadu_si256(cr.as_ptr().cast()), bias);

    // Red: Y + 45/32 * Cr
    let red_term = _mm256_srai_epi16::<5>(_mm256_mullo_epi16(_mm256_set1_epi16(45), red_diff));
    let red = _mm256_add_epi16(luma, red_term);

    // Green: Y - (11 * Cb + 23 * Cr) / 32
    let green_sum = _mm256_add_epi16(
        _mm256_mullo_epi16(_mm256_set1_epi16(11), blue_diff),
        _mm256_mullo_epi16(_mm256_set1_epi16(23), red_diff),
    );
    let green = _mm256_sub_epi16(luma, _mm256_srai_epi16::<5>(green_sum));

    // Blue: Y + 113/64 * Cb
    let blue_term = _mm256_srai_epi16::<6>(_mm256_mullo_epi16(_mm256_set1_epi16(113), blue_diff));
    let blue = _mm256_add_epi16(blue_term, luma);

    (red, green, blue)
}
/// Convert 16 YCbCr samples to interleaved RGBA using AVX2.
///
/// Safe entry point that forwards every argument unchanged to
/// `ycbcr_to_rgba_unsafe`, which writes 64 bytes into `out` starting at
/// `*offset` and advances `*offset` by 64.
///
/// # Panics
/// Panics if `out` does not hold 64 bytes starting at `*offset`.
#[inline(always)]
pub fn ycbcr_to_rgba_avx2(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize,
)
{
    // SAFETY: the callee is compiled with AVX2 enabled and no CPU feature
    // detection happens here. NOTE(review): presumably the caller only selects
    // this function after verifying AVX2 support; confirm at the call site.
    unsafe { ycbcr_to_rgba_unsafe(y, cb, cr, out, offset) }
}
/// AVX2 implementation behind `ycbcr_to_rgba_avx2`.
///
/// Converts 16 YCbCr samples and stores them as 16 interleaved
/// `R, G, B, 255` quadruplets in `out[*offset..*offset + 64]`, then advances
/// `*offset` by 64. Channel values are saturated to `0..=255` by the unsigned
/// pack, so no separate clamp is needed.
///
/// # Panics
/// Panics if `out` does not hold 64 bytes starting at `*offset`.
///
/// # Safety
/// The executing CPU must support AVX2.
#[inline]
#[target_feature(enable = "avx2")]
#[rustfmt::skip]
unsafe fn ycbcr_to_rgba_unsafe(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16],
    out: &mut [u8],
    offset: &mut usize,
)
{
    let tmp: &mut [u8; 64] = out.get_mut(*offset..*offset + 64).expect("Slice to small cannot write").try_into().unwrap();

    let (r, g, b) = ycbcr_to_rgb_baseline_no_clamp(y, cb, cr);

    // Saturating pack to u8. Every step below works per 128-bit lane, so the
    // low lane carries pixels 0..=7 and the high lane pixels 8..=15.
    //   rg = [r0..r7 g0..g7 | r8..r15 g8..g15]
    //   ba = [b0..b7 a x 8  | b8..b15 a x 8  ]   (a = 255)
    let rg = _mm256_packus_epi16(r, g);
    let ba = _mm256_packus_epi16(b, _mm256_set1_epi16(255));

    //   rb = [r0 b0 .. r7 b7 | r8 b8 .. r15 b15]
    //   ga = [g0 a  .. g7 a  | g8 a  .. g15 a  ]
    let rb = _mm256_unpacklo_epi8(rg, ba);
    let ga = _mm256_unpackhi_epi8(rg, ba);

    //   px_lo = [pixels 0..=3 | pixels 8..=11 ]
    //   px_hi = [pixels 4..=7 | pixels 12..=15]
    let px_lo = _mm256_unpacklo_epi8(rb, ga);
    let px_hi = _mm256_unpackhi_epi8(rb, ga);

    // Regroup the 128-bit halves into memory order with one cross-lane
    // permute per output register:
    //   imm 0x20 -> [px_lo.low  | px_hi.low ] = pixels 0..=7
    //   imm 0x31 -> [px_lo.high | px_hi.high] = pixels 8..=15
    let first = _mm256_permute2x128_si256::<{ shuffle(0, 2, 0, 0) }>(px_lo, px_hi);
    let second = _mm256_permute2x128_si256::<{ shuffle(0, 3, 0, 1) }>(px_lo, px_hi);

    _mm256_storeu_si256(tmp.as_mut_ptr().cast(), first);
    _mm256_storeu_si256(tmp[32..].as_mut_ptr().cast(), second);

    *offset += 64;
}
/// Convert 16 YCbCr samples to interleaved RGBX using AVX2.
///
/// Safe entry point that forwards every argument unchanged to
/// `ycbcr_to_rgbx_unsafe`, which writes 64 bytes into `out` starting at
/// `*offset` and advances `*offset` by 64.
///
/// # Panics
/// Panics if `out` does not hold 64 bytes starting at `*offset`.
#[inline(always)]
pub fn ycbcr_to_rgbx_avx2(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize,
)
{
    // SAFETY: the callee is compiled with AVX2 enabled and no CPU feature
    // detection happens here. NOTE(review): presumably the caller only selects
    // this function after verifying AVX2 support; confirm at the call site.
    unsafe { ycbcr_to_rgbx_unsafe(y, cb, cr, out, offset) }
}
/// AVX2 implementation behind `ycbcr_to_rgbx_avx2`.
///
/// Converts 16 YCbCr samples and stores them as 16 interleaved
/// `R, G, B, X` quadruplets in `out[*offset..*offset + 64]`, then advances
/// `*offset` by 64. R, G and B are saturated to `0..=255` by the unsigned
/// pack. The X byte comes from `_mm256_undefined_si256()`, so its value is
/// unspecified and consumers must ignore it.
///
/// # Panics
/// Panics, reporting the buffer size and the current position, if `out` does
/// not hold 64 bytes starting at `*offset`.
///
/// # Safety
/// The executing CPU must support AVX2.
#[inline]
#[allow(clippy::cast_possible_wrap)]
#[target_feature(enable = "avx2")]
#[rustfmt::skip]
unsafe fn ycbcr_to_rgbx_unsafe(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16],
    out: &mut [u8],
    offset: &mut usize,
)
{
    // Read the length up front: `out` is mutably borrowed while the panic
    // message is being built.
    let length = out.len();
    let tmp: &mut [u8; 64] = out.get_mut(*offset..*offset + 64).unwrap_or_else(|| panic!("Slice to small cannot write,size:{} position:{}",length,offset)).try_into().unwrap();

    let (r, g, b) = ycbcr_to_rgb_baseline_no_clamp(y, cb, cr);

    // Saturating pack to u8. Every step below works per 128-bit lane, so the
    // low lane carries pixels 0..=7 and the high lane pixels 8..=15.
    //   rg = [r0..r7 g0..g7 | r8..r15 g8..g15]
    //   bx = [b0..b7 x x 8  | b8..b15 x x 8  ]   (x = don't-care padding)
    let rg = _mm256_packus_epi16(r, g);
    let bx = _mm256_packus_epi16(b, _mm256_undefined_si256());

    //   rb = [r0 b0 .. r7 b7 | r8 b8 .. r15 b15]
    //   gx = [g0 x  .. g7 x  | g8 x  .. g15 x  ]
    let rb = _mm256_unpacklo_epi8(rg, bx);
    let gx = _mm256_unpackhi_epi8(rg, bx);

    //   px_lo = [pixels 0..=3 | pixels 8..=11 ]
    //   px_hi = [pixels 4..=7 | pixels 12..=15]
    let px_lo = _mm256_unpacklo_epi8(rb, gx);
    let px_hi = _mm256_unpackhi_epi8(rb, gx);

    // Regroup the 128-bit halves into memory order with one cross-lane
    // permute per output register:
    //   imm 0x20 -> [px_lo.low  | px_hi.low ] = pixels 0..=7
    //   imm 0x31 -> [px_lo.high | px_hi.high] = pixels 8..=15
    let first = _mm256_permute2x128_si256::<{ shuffle(0, 2, 0, 0) }>(px_lo, px_hi);
    let second = _mm256_permute2x128_si256::<{ shuffle(0, 3, 0, 1) }>(px_lo, px_hi);

    _mm256_storeu_si256(tmp.as_mut_ptr().cast(), first);
    _mm256_storeu_si256(tmp[32..].as_mut_ptr().cast(), second);

    *offset += 64;
}
/// Clamp each of the sixteen signed 16-bit lanes of `reg` to `0..=255`.
///
/// Lanes below 0 become 0 and lanes above 255 become 255; all other lanes
/// are returned unchanged.
///
/// # Safety
/// The executing CPU must support AVX2.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn clamp_avx(reg: __m256i) -> __m256i
{
    let floor = _mm256_set1_epi16(0);
    let ceiling = _mm256_set1_epi16(255);

    // Raise negatives to the floor first, then cap at the ceiling.
    let non_negative = _mm256_max_epi16(reg, floor);
    _mm256_min_epi16(non_negative, ceiling)
}
/// Build an 8-bit immediate from four 2-bit selectors.
///
/// The result is `(z << 6) | (y << 4) | (x << 2) | w`, the same bit layout as
/// the C `_MM_SHUFFLE(z, y, x, w)` macro. It is a `const fn` so it can be used
/// inside const-generic intrinsic arguments.
#[inline]
const fn shuffle(z: i32, y: i32, x: i32, w: i32) -> i32
{
    let upper = (z << 6) | (y << 4);
    let lower = (x << 2) | w;
    upper | lower
}