use alloc::vec::Vec;
use core::arch::x86_64::{
__m128i, __m256i, _mm256_add_epi32, _mm256_cvtepu8_epi32, _mm256_extracti128_si256,
_mm256_mullo_epi32, _mm256_set1_epi32, _mm256_srai_epi32, _mm256_sub_epi32, _mm_cvtsi128_si64,
_mm_loadl_epi64, _mm_packs_epi32, _mm_packus_epi16,
};
use core::cell::RefCell;
use crate::color::upsample::{
h2v2_fancy_sample, upsample_h2v2_fancy_row, upsample_h2v2_fancy_rows,
};
use crate::color::ycbcr::{FIX_0_34414, FIX_0_71414, FIX_1_40200, FIX_1_77200, ROUND};
use super::scalar;
/// Pixels converted per AVX2 chunk: `fill_chunk` loads eight bytes per plane
/// and widens them into the eight 32-bit lanes of one 256-bit register.
const LANES: usize = 8;
/// Pixels handled per iteration of the chunked gray/RGB interleave loops;
/// anything left over is handed to the scalar implementation.
const RGB_UNROLL: usize = 8;
/// Reusable full-width chroma buffers for converting a top/bottom row pair.
///
/// The buffers start out empty (via `Default`) and are sized on demand with
/// [`RowPairScratch::ensure_width`].
#[derive(Default)]
struct RowPairScratch {
    /// Cb samples for the top output row.
    cb_top: Vec<u8>,
    /// Cb samples for the bottom output row.
    cb_bottom: Vec<u8>,
    /// Cr samples for the top output row.
    cr_top: Vec<u8>,
    /// Cr samples for the bottom output row.
    cr_bottom: Vec<u8>,
}

impl RowPairScratch {
    /// Resizes all four buffers to exactly `width` bytes.
    ///
    /// Existing bytes within the new length are kept; bytes added by growth
    /// are zero. Shrinking truncates.
    fn ensure_width(&mut self, width: usize) {
        let buffers = [
            &mut self.cb_top,
            &mut self.cb_bottom,
            &mut self.cr_top,
            &mut self.cr_bottom,
        ];
        for buffer in buffers {
            buffer.resize(width, 0);
        }
    }
}
std::thread_local! {
static ROW_PAIR_SCRATCH: RefCell<RowPairScratch> = RefCell::new(RowPairScratch::default());
}
/// Expands a grayscale row into interleaved RGB by writing each sample to all
/// three channels of its pixel.
///
/// The number of pixels written is `min(gray_row.len(), dst.len() / 3)`;
/// bytes of `dst` beyond that are left untouched. Whole groups of
/// `RGB_UNROLL` pixels are handled here and any remaining pixels are passed
/// to `scalar::fill_rgb_row_from_gray`.
pub(crate) fn fill_rgb_row_from_gray(gray_row: &[u8], dst: &mut [u8]) {
    let pixels = gray_row.len().min(dst.len() / 3);
    let luma = &gray_row[..pixels];
    let rgb = &mut dst[..pixels * 3];
    debug_assert_eq!(rgb.len(), luma.len() * 3);

    // Split both rows at the last whole-group boundary.
    let bulk = pixels - pixels % RGB_UNROLL;
    let (luma_bulk, luma_tail) = luma.split_at(bulk);
    let (rgb_bulk, rgb_tail) = rgb.split_at_mut(bulk * 3);

    let groups = luma_bulk
        .chunks_exact(RGB_UNROLL)
        .zip(rgb_bulk.chunks_exact_mut(RGB_UNROLL * 3));
    for (gray_group, rgb_group) in groups {
        for (&value, pixel) in gray_group.iter().zip(rgb_group.chunks_exact_mut(3)) {
            pixel.fill(value);
        }
    }

    if !luma_tail.is_empty() {
        scalar::fill_rgb_row_from_gray(luma_tail, rgb_tail);
    }
}
/// Interleaves three planar rows (`r_row`, `g_row`, `b_row`) into packed RGB.
///
/// The number of pixels written is the smallest of the three plane lengths
/// and `dst.len() / 3`; bytes of `dst` beyond that are left untouched. Whole
/// groups of `RGB_UNROLL` pixels are handled here and any remaining pixels
/// are passed to `scalar::fill_rgb_row_from_rgb`.
pub(crate) fn fill_rgb_row_from_rgb(r_row: &[u8], g_row: &[u8], b_row: &[u8], dst: &mut [u8]) {
    let pixels = [r_row.len(), g_row.len(), b_row.len(), dst.len() / 3]
        .iter()
        .copied()
        .min()
        .unwrap_or(0);
    let (red, green, blue) = (&r_row[..pixels], &g_row[..pixels], &b_row[..pixels]);
    let rgb = &mut dst[..pixels * 3];
    debug_assert!(red.len() == green.len() && red.len() == blue.len());
    debug_assert_eq!(rgb.len(), red.len() * 3);

    // Split every row at the last whole-group boundary.
    let bulk = pixels - pixels % RGB_UNROLL;
    let (red_bulk, red_tail) = red.split_at(bulk);
    let (green_bulk, green_tail) = green.split_at(bulk);
    let (blue_bulk, blue_tail) = blue.split_at(bulk);
    let (rgb_bulk, rgb_tail) = rgb.split_at_mut(bulk * 3);

    let planes = red_bulk
        .chunks_exact(RGB_UNROLL)
        .zip(green_bulk.chunks_exact(RGB_UNROLL))
        .zip(blue_bulk.chunks_exact(RGB_UNROLL));
    for (((reds, greens), blues), rgb_group) in
        planes.zip(rgb_bulk.chunks_exact_mut(RGB_UNROLL * 3))
    {
        for (lane, pixel) in rgb_group.chunks_exact_mut(3).enumerate() {
            pixel.copy_from_slice(&[reds[lane], greens[lane], blues[lane]]);
        }
    }

    if !red_tail.is_empty() {
        scalar::fill_rgb_row_from_rgb(red_tail, green_tail, blue_tail, rgb_tail);
    }
}
/// Converts one row of full-resolution Y, Cb and Cr samples to packed RGB
/// using the AVX2 kernel.
///
/// All inputs are truncated to a common pixel count: the smallest of the
/// three plane lengths and `dst.len() / 3`. Bytes of `dst` beyond that are
/// left untouched.
pub(crate) fn fill_rgb_row_from_ycbcr(y_row: &[u8], cb_row: &[u8], cr_row: &[u8], dst: &mut [u8]) {
    let pixels = [y_row.len(), cb_row.len(), cr_row.len(), dst.len() / 3]
        .iter()
        .copied()
        .min()
        .unwrap_or(0);
    let (luma, cb, cr) = (&y_row[..pixels], &cb_row[..pixels], &cr_row[..pixels]);
    let rgb = &mut dst[..pixels * 3];
    debug_assert!(luma.len() == cb.len() && luma.len() == cr.len());
    debug_assert_eq!(rgb.len(), luma.len() * 3);

    // SAFETY: the three planes were just truncated to the same length and
    // `rgb` holds exactly three bytes per pixel, which is what the kernel's
    // raw loads rely on.
    // NOTE(review): nothing here checks that the CPU supports AVX2;
    // presumably the code that selects this module does — confirm.
    unsafe {
        fill_rgb_row_from_ycbcr_avx2(luma, cb, cr, rgb);
    }
}
/// Converts a top luma row, and optionally the bottom row of the same pair,
/// to packed RGB using chroma upsampled from three neighbouring
/// half-resolution chroma rows per component.
///
/// The output width is clamped to the smallest of: `y_top.len()`,
/// `dst_top.len() / 3`, twice the shortest chroma row, and (only when both
/// `y_bottom` and `dst_bottom` are present) the bottom row's own limits.
/// Nothing is written when that width is zero. The bottom row is produced
/// only when both its luma and its destination are supplied and long enough.
///
/// Chroma is upsampled into the thread-local `ROW_PAIR_SCRATCH` buffers.
#[allow(clippy::too_many_arguments)]
pub(crate) fn fill_rgb_row_pair_from_420(
    y_top: &[u8],
    y_bottom: Option<&[u8]>,
    prev_cb: &[u8],
    curr_cb: &[u8],
    next_cb: &[u8],
    prev_cr: &[u8],
    curr_cr: &[u8],
    next_cr: &[u8],
    dst_top: &mut [u8],
    dst_bottom: Option<&mut [u8]>,
) {
    let chroma_rows = [prev_cb, curr_cb, next_cb, prev_cr, curr_cr, next_cr];
    let chroma_width = chroma_rows
        .iter()
        .map(|row| row.len())
        .min()
        .unwrap_or(0);

    // The bottom row only constrains the width when it can actually be written.
    let bottom_limit = match (y_bottom, dst_bottom.as_deref()) {
        (Some(luma), Some(out)) => luma.len().min(out.len() / 3),
        _ => usize::MAX,
    };
    let limits = [
        y_top.len(),
        dst_top.len() / 3,
        bottom_limit,
        chroma_width.saturating_mul(2),
    ];
    let width = limits.iter().copied().min().unwrap_or(0);
    if width == 0 {
        return;
    }

    let [prev_cb, curr_cb, next_cb, prev_cr, curr_cr, next_cr] =
        chroma_rows.map(|row| &row[..chroma_width]);
    let y_top = &y_top[..width];
    let y_bottom = y_bottom.and_then(|row| row.get(..width));
    let dst_top = &mut dst_top[..width * 3];
    let dst_bottom = dst_bottom.and_then(|row| row.get_mut(..width * 3));

    debug_assert_eq!(dst_top.len(), y_top.len() * 3);
    debug_assert!(y_bottom.is_none_or(|row| row.len() == width));
    debug_assert!(dst_bottom
        .as_ref()
        .is_none_or(|row| row.len() == width * 3));
    debug_assert!(prev_cb.len() == curr_cb.len() && prev_cb.len() == next_cb.len());
    debug_assert!(prev_cr.len() == curr_cr.len() && prev_cr.len() == next_cr.len());

    ROW_PAIR_SCRATCH.with(|cell| {
        let mut guard = cell.borrow_mut();
        guard.ensure_width(width);
        // SAFETY: every scratch buffer now holds `width` bytes and the luma
        // and destination slices were truncated to `width` pixels above.
        // NOTE(review): AVX2 availability is not checked here; presumably
        // the caller selecting this module guarantees it — confirm.
        unsafe {
            fill_rgb_row_pair_from_420_avx2(
                y_top,
                y_bottom,
                prev_cb,
                curr_cb,
                next_cb,
                prev_cr,
                curr_cr,
                next_cr,
                dst_top,
                dst_bottom,
                &mut guard,
            );
        }
    });
}
/// Cropped variant of the row-pair conversion: produces RGB only for the
/// pixels in `crop_start .. crop_start + crop_width` of the source rows and
/// writes them starting at the beginning of `dst_top` / `dst_bottom`.
///
/// The effective width is `crop_width` clamped by what remains of `y_top`
/// and of the doubled chroma width after `crop_start`, by `dst_top.len() / 3`,
/// and (only when both `y_bottom` and `dst_bottom` are present) by the bottom
/// row's own limits. Nothing is written when that width is zero or the crop
/// window cannot be formed.
///
/// Chroma for each output pixel is sampled individually through
/// `fill_cropped_h2v2_row` into the thread-local `ROW_PAIR_SCRATCH` buffers:
/// the top row pairs `curr_*` with `prev_*`, the bottom row pairs `curr_*`
/// with `next_*`.
#[allow(clippy::too_many_arguments)]
pub(crate) fn fill_rgb_row_pair_from_420_cropped(
    y_top: &[u8],
    y_bottom: Option<&[u8]>,
    prev_cb: &[u8],
    curr_cb: &[u8],
    next_cb: &[u8],
    prev_cr: &[u8],
    curr_cr: &[u8],
    next_cr: &[u8],
    crop_start: usize,
    crop_width: usize,
    dst_top: &mut [u8],
    dst_bottom: Option<&mut [u8]>,
) {
    let chroma_rows = [prev_cb, curr_cb, next_cb, prev_cr, curr_cr, next_cr];
    let chroma_width = chroma_rows
        .iter()
        .map(|row| row.len())
        .min()
        .unwrap_or(0);

    // The bottom row only constrains the width when it can actually be written.
    let bottom_limit = match (y_bottom, dst_bottom.as_deref()) {
        (Some(luma), Some(out)) => luma.len().saturating_sub(crop_start).min(out.len() / 3),
        _ => usize::MAX,
    };
    let limits = [
        crop_width,
        y_top.len().saturating_sub(crop_start),
        dst_top.len() / 3,
        bottom_limit,
        chroma_width.saturating_mul(2).saturating_sub(crop_start),
    ];
    let width = limits.iter().copied().min().unwrap_or(0);
    if width == 0 {
        return;
    }

    let crop_end = match crop_start.checked_add(width) {
        Some(end) => end,
        None => return,
    };
    let y_top_crop = match y_top.get(crop_start..crop_end) {
        Some(row) => row,
        None => return,
    };
    let y_bottom = y_bottom.and_then(|row| row.get(crop_start..crop_end));

    let [prev_cb, curr_cb, next_cb, prev_cr, curr_cr, next_cr] =
        chroma_rows.map(|row| &row[..chroma_width]);
    let dst_top = &mut dst_top[..width * 3];
    let dst_bottom = dst_bottom.and_then(|row| row.get_mut(..width * 3));

    debug_assert_eq!(dst_top.len(), width * 3);
    debug_assert!(y_bottom.is_none_or(|row| row.len() == width));
    debug_assert!(dst_bottom.as_ref().is_none_or(|row| row.len() == width * 3));
    debug_assert!(prev_cb.len() == curr_cb.len() && prev_cb.len() == next_cb.len());
    debug_assert!(prev_cr.len() == curr_cr.len() && prev_cr.len() == next_cr.len());

    ROW_PAIR_SCRATCH.with(|cell| {
        let mut guard = cell.borrow_mut();
        guard.ensure_width(width);
        let RowPairScratch {
            cb_top,
            cb_bottom,
            cr_top,
            cr_bottom,
        } = &mut *guard;

        // Top row: chroma sampled from the current and previous chroma rows.
        let top_cb = &mut cb_top[..width];
        let top_cr = &mut cr_top[..width];
        fill_cropped_h2v2_row(prev_cb, curr_cb, crop_start, top_cb);
        fill_cropped_h2v2_row(prev_cr, curr_cr, crop_start, top_cr);
        // SAFETY: luma, both chroma buffers and the destination all cover
        // exactly `width` pixels.
        // NOTE(review): AVX2 availability is not checked here — confirm the
        // caller guarantees it.
        unsafe {
            fill_rgb_row_from_ycbcr_avx2(y_top_crop, top_cb, top_cr, dst_top);
        }

        // Bottom row: only when both its luma and its destination exist.
        let (Some(bottom_luma), Some(bottom_out)) = (y_bottom, dst_bottom) else {
            return;
        };
        let bottom_cb = &mut cb_bottom[..width];
        let bottom_cr = &mut cr_bottom[..width];
        fill_cropped_h2v2_row(next_cb, curr_cb, crop_start, bottom_cb);
        fill_cropped_h2v2_row(next_cr, curr_cr, crop_start, bottom_cr);
        // SAFETY: same length invariants as for the top row.
        unsafe {
            fill_rgb_row_from_ycbcr_avx2(bottom_luma, bottom_cb, bottom_cr, bottom_out);
        }
    });
}
/// Fills `out` with one upsampled chroma value per output pixel, where slot
/// `i` receives `h2v2_fancy_sample(near, curr, crop_start + i)`.
fn fill_cropped_h2v2_row(near: &[u8], curr: &[u8], crop_start: usize, out: &mut [u8]) {
    out.iter_mut().enumerate().for_each(|(local_x, slot)| {
        let source_x = crop_start + local_x;
        *slot = h2v2_fancy_sample(near, curr, source_x);
    });
}
/// Upsamples chroma into `scratch` and converts the top row (and the bottom
/// row, when both `y_bottom` and `dst_bottom` are present) to packed RGB.
///
/// With a bottom row, both chroma rows of each component are produced by one
/// `upsample_h2v2_fancy_rows` call; otherwise only the top row is produced
/// via `upsample_h2v2_fancy_row`. The row width is `y_top.len()`.
///
/// # Safety
///
/// The CPU must support AVX2. The luma and destination slices must describe
/// `y_top.len()` pixels each, because the conversion kernel reads the planes
/// with unchecked loads. Scratch buffers shorter than `y_top.len()` cause a
/// slicing panic rather than undefined behaviour.
#[target_feature(enable = "avx2")]
#[allow(clippy::too_many_arguments)]
unsafe fn fill_rgb_row_pair_from_420_avx2(
    y_top: &[u8],
    y_bottom: Option<&[u8]>,
    prev_cb: &[u8],
    curr_cb: &[u8],
    next_cb: &[u8],
    prev_cr: &[u8],
    curr_cr: &[u8],
    next_cr: &[u8],
    dst_top: &mut [u8],
    dst_bottom: Option<&mut [u8]>,
    scratch: &mut RowPairScratch,
) {
    let width = y_top.len();
    let RowPairScratch {
        cb_top,
        cb_bottom,
        cr_top,
        cr_bottom,
    } = scratch;
    let cb_top = &mut cb_top[..width];
    let cr_top = &mut cr_top[..width];

    // Upsample first; remember the bottom-row inputs if there is a bottom row.
    let bottom_job = match (y_bottom, dst_bottom) {
        (Some(bottom_luma), Some(bottom_out)) => {
            let cb_bottom = &mut cb_bottom[..width];
            let cr_bottom = &mut cr_bottom[..width];
            upsample_h2v2_fancy_rows(prev_cb, curr_cb, next_cb, width, cb_top, cb_bottom);
            upsample_h2v2_fancy_rows(prev_cr, curr_cr, next_cr, width, cr_top, cr_bottom);
            Some((bottom_luma, cb_bottom, cr_bottom, bottom_out))
        }
        _ => {
            // NOTE(review): the `false` flag presumably selects the top row
            // of the pair — confirm against `upsample_h2v2_fancy_row`.
            upsample_h2v2_fancy_row(prev_cb, curr_cb, next_cb, width, false, cb_top);
            upsample_h2v2_fancy_row(prev_cr, curr_cr, next_cr, width, false, cr_top);
            None
        }
    };

    // SAFETY: the chroma buffers were sliced to `width`, matching `y_top`;
    // the remaining requirements are this function's own contract.
    unsafe {
        fill_rgb_row_from_ycbcr_avx2(y_top, cb_top, cr_top, dst_top);
    }
    if let Some((bottom_luma, bottom_cb, bottom_cr, bottom_out)) = bottom_job {
        // SAFETY: as above, for the bottom row.
        unsafe {
            fill_rgb_row_from_ycbcr_avx2(bottom_luma, bottom_cb, bottom_cr, bottom_out);
        }
    }
}
/// Test-only wrapper that forwards its arguments unchanged to
/// `fill_rgb_row_from_ycbcr`, making it reachable from the parent module.
#[cfg(test)]
pub(super) fn fill_rgb_row_from_ycbcr_for_test(
y_row: &[u8],
cb_row: &[u8],
cr_row: &[u8],
dst: &mut [u8],
) {
fill_rgb_row_from_ycbcr(y_row, cb_row, cr_row, dst);
}
/// Test-only wrapper that forwards its arguments unchanged to
/// `fill_rgb_row_from_gray`, making it reachable from the parent module.
#[cfg(test)]
pub(super) fn fill_rgb_row_from_gray_for_test(gray_row: &[u8], dst: &mut [u8]) {
fill_rgb_row_from_gray(gray_row, dst);
}
/// Test-only wrapper that forwards its arguments unchanged to
/// `fill_rgb_row_from_rgb`, making it reachable from the parent module.
#[cfg(test)]
pub(super) fn fill_rgb_row_from_rgb_for_test(
r_row: &[u8],
g_row: &[u8],
b_row: &[u8],
dst: &mut [u8],
) {
fill_rgb_row_from_rgb(r_row, g_row, b_row, dst);
}
/// Converts `y_row.len()` pixels of full-resolution YCbCr to packed RGB.
///
/// Pixels are processed two `LANES`-wide chunks at a time, then at most one
/// single chunk, and whatever is left (fewer than `LANES` pixels) is handed
/// to `scalar::fill_rgb_row_from_ycbcr`.
///
/// # Safety
///
/// The CPU must support AVX2, and `cb_row` and `cr_row` must be at least as
/// long as `y_row`: the chunk kernel reads all three planes with unchecked
/// loads. `dst` is indexed with bounds checks and panics if it holds fewer
/// than three bytes per pixel.
#[target_feature(enable = "avx2")]
unsafe fn fill_rgb_row_from_ycbcr_avx2(y_row: &[u8], cb_row: &[u8], cr_row: &[u8], dst: &mut [u8]) {
    const PAIR: usize = LANES * 2;

    let width = y_row.len();
    let paired_end = width - width % PAIR;

    for first in (0..paired_end).step_by(PAIR) {
        let second = first + LANES;
        // SAFETY: `second + LANES <= paired_end <= width`, so both chunks lie
        // inside `y_row`; the chroma lengths are the caller's contract.
        unsafe {
            fill_chunk(
                y_row,
                cb_row,
                cr_row,
                &mut dst[first * 3..second * 3],
                first,
            );
            fill_chunk(
                y_row,
                cb_row,
                cr_row,
                &mut dst[second * 3..(second + LANES) * 3],
                second,
            );
        }
    }

    // Fewer than `PAIR` pixels remain, so at most one more whole chunk fits.
    let mut offset = paired_end;
    if width - offset >= LANES {
        // SAFETY: `offset + LANES <= width` was just checked.
        unsafe {
            fill_chunk(
                y_row,
                cb_row,
                cr_row,
                &mut dst[offset * 3..(offset + LANES) * 3],
                offset,
            );
        }
        offset += LANES;
    }

    if offset < width {
        scalar::fill_rgb_row_from_ycbcr(
            &y_row[offset..],
            &cb_row[offset..],
            &cr_row[offset..],
            &mut dst[offset * 3..],
        );
    }
}
/// Converts `LANES` pixels starting at `offset` from YCbCr to RGB and stores
/// them interleaved in `dst_chunk`.
///
/// Chroma is re-centred by subtracting 128, then each channel is formed in
/// 16.16 fixed point:
/// `R = Y + round(FIX_1_40200 * Cr)`,
/// `G = Y - round(FIX_0_34414 * Cb + FIX_0_71414 * Cr)`,
/// `B = Y + round(FIX_1_77200 * Cb)`,
/// where `round(v)` is `(v + ROUND) >> 16`. Results are saturated to bytes
/// when stored.
///
/// # Safety
///
/// The CPU must support AVX2, and `offset + LANES` must not exceed the length
/// of any of the three input rows.
#[target_feature(enable = "avx2")]
unsafe fn fill_chunk(
    y_row: &[u8],
    cb_row: &[u8],
    cr_row: &[u8],
    dst_chunk: &mut [u8],
    offset: usize,
) {
    debug_assert_eq!(dst_chunk.len(), LANES * 3);

    // SAFETY: the caller guarantees `offset + LANES` is within every row.
    let (y_bytes, cb_bytes, cr_bytes) = unsafe {
        (
            load_eight(y_row, offset),
            load_eight(cb_row, offset),
            load_eight(cr_row, offset),
        )
    };

    let chroma_bias = _mm256_set1_epi32(128);
    let luma = _mm256_cvtepu8_epi32(y_bytes);
    let cb_centered = _mm256_sub_epi32(_mm256_cvtepu8_epi32(cb_bytes), chroma_bias);
    let cr_centered = _mm256_sub_epi32(_mm256_cvtepu8_epi32(cr_bytes), chroma_bias);

    let red = _mm256_add_epi32(luma, fixed_mul_shift(cr_centered, FIX_1_40200));

    // Green sums both chroma contributions before the single rounding shift.
    let green_from_cb = _mm256_mullo_epi32(cb_centered, _mm256_set1_epi32(FIX_0_34414));
    let green_from_cr = _mm256_mullo_epi32(cr_centered, _mm256_set1_epi32(FIX_0_71414));
    let green_biased = _mm256_add_epi32(
        _mm256_add_epi32(green_from_cb, green_from_cr),
        _mm256_set1_epi32(ROUND),
    );
    let green = _mm256_sub_epi32(luma, _mm256_srai_epi32::<16>(green_biased));

    let blue = _mm256_add_epi32(luma, fixed_mul_shift(cb_centered, FIX_1_77200));

    // SAFETY: AVX2 is available per this function's own contract.
    unsafe {
        store_rgb_chunk(dst_chunk, red, green, blue);
    }
}
#[target_feature(enable = "avx2")]
unsafe fn load_eight(src: &[u8], offset: usize) -> __m128i {
debug_assert!(offset <= src.len().saturating_sub(LANES));
unsafe { _mm_loadl_epi64(src.as_ptr().add(offset).cast()) }
}
/// Multiplies every 32-bit lane by `coefficient`, adds `ROUND`, and shifts
/// right arithmetically by 16 bits: `(v * coefficient + ROUND) >> 16` per lane.
#[target_feature(enable = "avx2")]
fn fixed_mul_shift(values: __m256i, coefficient: i32) -> __m256i {
    let scale = _mm256_set1_epi32(coefficient);
    let rounding = _mm256_set1_epi32(ROUND);
    let product = _mm256_mullo_epi32(values, scale);
    let biased = _mm256_add_epi32(product, rounding);
    _mm256_srai_epi32::<16>(biased)
}
/// Saturates the eight lanes of `r`, `g` and `b` to bytes and writes them to
/// `dst_chunk` as interleaved `R, G, B` triples, one triple per lane.
///
/// At most `LANES` pixels are written; a shorter `dst_chunk` receives only
/// as many whole pixels as it can hold.
///
/// # Safety
///
/// The CPU must support AVX2.
#[target_feature(enable = "avx2")]
unsafe fn store_rgb_chunk(dst_chunk: &mut [u8], r: __m256i, g: __m256i, b: __m256i) {
    debug_assert_eq!(dst_chunk.len(), LANES * 3);

    // SAFETY: AVX2 is available per this function's own contract.
    let (reds, greens, blues) = unsafe { (pack_eight_u8(r), pack_eight_u8(g), pack_eight_u8(b)) };

    for (lane, pixel) in dst_chunk.chunks_exact_mut(3).take(LANES).enumerate() {
        pixel.copy_from_slice(&[reds[lane], greens[lane], blues[lane]]);
    }
}
/// Narrows eight signed 32-bit lanes to eight bytes, in lane order.
///
/// The narrowing saturates twice: first to signed 16-bit
/// (`_mm_packs_epi32`), then to unsigned 8-bit (`_mm_packus_epi16`), so
/// negative lanes become 0 and lanes above 255 become 255.
///
/// # Safety
///
/// The CPU must support AVX2.
#[target_feature(enable = "avx2")]
unsafe fn pack_eight_u8(values: __m256i) -> [u8; LANES] {
    let low_half = _mm256_extracti128_si256::<0>(values);
    let high_half = _mm256_extracti128_si256::<1>(values);
    let as_i16 = _mm_packs_epi32(low_half, high_half);
    // Both operands are the same vector, so the low 64 bits already hold all
    // eight result bytes.
    let as_u8 = _mm_packus_epi16(as_i16, as_i16);
    let low_qword = _mm_cvtsi128_si64(as_u8);
    low_qword.to_ne_bytes()
}