#![cfg_attr(not(feature = "std"), allow(dead_code))]
use core::arch::x86_64::*;
use crate::row::{
arch::x86_avx512::endian::{load_endian_u16x32, load_endian_u32x16},
scalar::{bits_mask, gray as scalar},
};
/// AVX-512 backend entry point for the gray8 → RGB row conversion.
///
/// This wrapper contains no vector code of its own: after the debug-only
/// length checks it forwards every argument unchanged to
/// `scalar::gray8_to_rgb_row`.
///
/// `y_plane` is expected to hold at least `width` samples and `out` at least
/// `width * 3` bytes; both are checked in debug builds only.
///
/// # Safety
///
/// The function is compiled with `avx512f,avx512bw` enabled, so the caller
/// must ensure the executing CPU supports both features.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn gray8_to_rgb_row(
y_plane: &[u8],
out: &mut [u8],
width: usize,
full_range: bool,
) {
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 3);
// No dedicated AVX-512 path: the scalar kernel does all the work.
scalar::gray8_to_rgb_row(y_plane, out, width, full_range);
}
/// AVX-512 backend entry point for the gray8 → RGBA row conversion.
///
/// Forwards every argument unchanged to `scalar::gray8_to_rgba_row`; no
/// vector instructions are issued here.
///
/// `y_plane` is expected to hold at least `width` samples and `out` at least
/// `width * 4` bytes; both are checked in debug builds only.
///
/// # Safety
///
/// The function is compiled with `avx512f,avx512bw` enabled, so the caller
/// must ensure the executing CPU supports both features.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn gray8_to_rgba_row(
y_plane: &[u8],
out: &mut [u8],
width: usize,
full_range: bool,
) {
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 4);
// No dedicated AVX-512 path: the scalar kernel does all the work.
scalar::gray8_to_rgba_row(y_plane, out, width, full_range);
}
/// Converts a row of 8-bit gray samples into planar H, S and V rows.
///
/// For `full_range` input the vector loop handles 64 pixels per iteration:
/// it writes zero to `h_out` and `s_out` and copies the gray bytes verbatim
/// into `v_out`. Pixels left over after the last full vector, and every row
/// with `full_range == false`, are processed by `scalar::gray8_to_hsv_row`.
///
/// `y_plane`, `h_out`, `s_out` and `v_out` must each hold at least `width`
/// elements; this is verified in debug builds only.
///
/// # Safety
///
/// * The executing CPU must support `avx512f` and `avx512bw`.
/// * All four slices must be at least `width` elements long, because the
///   vector loop accesses them through raw pointers without bounds checks.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn gray8_to_hsv_row(
    y_plane: &[u8],
    h_out: &mut [u8],
    s_out: &mut [u8],
    v_out: &mut [u8],
    width: usize,
    full_range: bool,
) {
    debug_assert!(y_plane.len() >= width);
    // The output planes are written with unchecked 64-byte stores below, so
    // they get the same debug-time validation as the input plane.
    debug_assert!(h_out.len() >= width);
    debug_assert!(s_out.len() >= width);
    debug_assert!(v_out.len() >= width);
    if !full_range {
        return scalar::gray8_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range);
    }
    let mut x = 0usize;
    // SAFETY: the loop condition keeps `x + 64 <= width`, and the caller
    // guarantees every slice holds at least `width` bytes, so each 64-byte
    // unaligned load/store stays inside its slice.
    unsafe {
        let zero = _mm512_setzero_si512();
        while x + 64 <= width {
            let v = _mm512_loadu_si512(y_plane.as_ptr().add(x).cast());
            _mm512_storeu_si512(h_out.as_mut_ptr().add(x).cast(), zero);
            _mm512_storeu_si512(s_out.as_mut_ptr().add(x).cast(), zero);
            _mm512_storeu_si512(v_out.as_mut_ptr().add(x).cast(), v);
            x += 64;
        }
    }
    // Fewer than 64 pixels remain: finish with the scalar kernel.
    if x < width {
        scalar::gray8_to_hsv_row(
            &y_plane[x..width],
            &mut h_out[x..width],
            &mut s_out[x..width],
            &mut v_out[x..width],
            width - x,
            true,
        );
    }
}
/// AVX-512 backend entry point for the `BITS`-bit gray → RGB (`u8`) row
/// conversion.
///
/// Forwards every argument, together with the `BITS` and `BE` const
/// parameters, to `scalar::gray_n_to_rgb_row`; no vector code runs here.
///
/// `y_plane` is expected to hold at least `width` samples and `out` at least
/// `width * 3` bytes; both are checked in debug builds only.
///
/// # Safety
///
/// The function is compiled with `avx512f,avx512bw` enabled, so the caller
/// must ensure the executing CPU supports both features.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn gray_n_to_rgb_row<const BITS: u32, const BE: bool>(
y_plane: &[u16],
out: &mut [u8],
width: usize,
full_range: bool,
) {
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 3);
// No dedicated AVX-512 path: the scalar kernel does all the work.
scalar::gray_n_to_rgb_row::<BITS, BE>(y_plane, out, width, full_range);
}
/// AVX-512 backend entry point for the `BITS`-bit gray → RGBA (`u8`) row
/// conversion.
///
/// Forwards every argument, together with the `BITS` and `BE` const
/// parameters, to `scalar::gray_n_to_rgba_row`; no vector code runs here.
///
/// `y_plane` is expected to hold at least `width` samples and `out` at least
/// `width * 4` bytes; both are checked in debug builds only.
///
/// # Safety
///
/// The function is compiled with `avx512f,avx512bw` enabled, so the caller
/// must ensure the executing CPU supports both features.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn gray_n_to_rgba_row<const BITS: u32, const BE: bool>(
y_plane: &[u16],
out: &mut [u8],
width: usize,
full_range: bool,
) {
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 4);
// No dedicated AVX-512 path: the scalar kernel does all the work.
scalar::gray_n_to_rgba_row::<BITS, BE>(y_plane, out, width, full_range);
}
/// AVX-512 backend entry point for the `BITS`-bit gray → RGB (`u16`) row
/// conversion.
///
/// Forwards every argument, together with the `BITS` and `BE` const
/// parameters, to `scalar::gray_n_to_rgb_u16_row`; no vector code runs here.
///
/// `y_plane` is expected to hold at least `width` samples and `out` at least
/// `width * 3` elements; both are checked in debug builds only.
///
/// # Safety
///
/// The function is compiled with `avx512f,avx512bw` enabled, so the caller
/// must ensure the executing CPU supports both features.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn gray_n_to_rgb_u16_row<const BITS: u32, const BE: bool>(
y_plane: &[u16],
out: &mut [u16],
width: usize,
full_range: bool,
) {
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 3);
// No dedicated AVX-512 path: the scalar kernel does all the work.
scalar::gray_n_to_rgb_u16_row::<BITS, BE>(y_plane, out, width, full_range);
}
/// AVX-512 backend entry point for the `BITS`-bit gray → RGBA (`u16`) row
/// conversion.
///
/// Forwards every argument, together with the `BITS` and `BE` const
/// parameters, to `scalar::gray_n_to_rgba_u16_row`; no vector code runs here.
///
/// `y_plane` is expected to hold at least `width` samples and `out` at least
/// `width * 4` elements; both are checked in debug builds only.
///
/// # Safety
///
/// The function is compiled with `avx512f,avx512bw` enabled, so the caller
/// must ensure the executing CPU supports both features.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn gray_n_to_rgba_u16_row<const BITS: u32, const BE: bool>(
y_plane: &[u16],
out: &mut [u16],
width: usize,
full_range: bool,
) {
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 4);
// No dedicated AVX-512 path: the scalar kernel does all the work.
scalar::gray_n_to_rgba_u16_row::<BITS, BE>(y_plane, out, width, full_range);
}
/// Reduces a row of `BITS`-bit gray samples (stored in `u16`) to 8-bit luma.
///
/// Thirty-two samples are handled per vector iteration: they are fetched
/// with `load_endian_u16x32::<BE>`, ANDed with `bits_mask::<BITS>()`,
/// shifted right by `BITS - 8` and narrowed to one byte each. Whatever does
/// not fill a full vector is passed to `scalar::gray_n_to_luma_row`.
///
/// NOTE(review): the shift amount is `BITS - 8`, so this presumably expects
/// `BITS >= 8` — confirm against the callers.
///
/// # Safety
///
/// * The executing CPU must support `avx512f` and `avx512bw`.
/// * `y_plane` and `out` must each hold at least `width` elements; the
///   vector loop uses raw pointers and the lengths are only debug-asserted.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn gray_n_to_luma_row<const BITS: u32, const BE: bool>(
    y_plane: &[u16],
    out: &mut [u8],
    width: usize,
) {
    const LANES: usize = 32;

    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width);

    // First pixel index that no longer fits a whole 32-lane vector.
    let vector_end = width - width % LANES;
    let mut col = 0usize;
    // SAFETY: `col + LANES <= vector_end <= width` inside the loop and the
    // caller guarantees both slices hold at least `width` elements, so the
    // 64-byte load and the 32-byte store stay in bounds.
    unsafe {
        let sample_mask = _mm512_set1_epi16(bits_mask::<BITS>() as i16);
        let shift = _mm_cvtsi32_si128((BITS - 8) as i32);
        let src = y_plane.as_ptr().cast::<u8>();
        let dst = out.as_mut_ptr();
        while col < vector_end {
            let samples = load_endian_u16x32::<BE>(src.add(col * 2));
            let significant = _mm512_and_si512(samples, sample_mask);
            let narrowed = _mm512_cvtepi16_epi8(_mm512_srl_epi16(significant, shift));
            _mm256_storeu_si256(dst.add(col).cast(), narrowed);
            col += LANES;
        }
    }
    if col < width {
        scalar::gray_n_to_luma_row::<BITS, BE>(
            &y_plane[col..width],
            &mut out[col..width],
            width - col,
        );
    }
}
/// Copies a row of `BITS`-bit gray samples into a `u16` luma row, keeping
/// only the bits selected by `bits_mask::<BITS>()`.
///
/// Thirty-two samples are handled per vector iteration: they are fetched
/// with `load_endian_u16x32::<BE>`, ANDed with the mask and stored as-is.
/// The remainder is passed to `scalar::gray_n_to_luma_u16_row`.
///
/// # Safety
///
/// * The executing CPU must support `avx512f` and `avx512bw`.
/// * `y_plane` and `out` must each hold at least `width` elements; the
///   vector loop uses raw pointers and the lengths are only debug-asserted.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn gray_n_to_luma_u16_row<const BITS: u32, const BE: bool>(
    y_plane: &[u16],
    out: &mut [u16],
    width: usize,
) {
    const LANES: usize = 32;

    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width);

    // First pixel index that no longer fits a whole 32-lane vector.
    let vector_end = width - width % LANES;
    let mut col = 0usize;
    // SAFETY: `col + LANES <= vector_end <= width` inside the loop and the
    // caller guarantees both slices hold at least `width` elements, so every
    // 64-byte load and store stays in bounds.
    unsafe {
        let sample_mask = _mm512_set1_epi16(bits_mask::<BITS>() as i16);
        let src = y_plane.as_ptr().cast::<u8>();
        let dst = out.as_mut_ptr();
        while col < vector_end {
            let samples = load_endian_u16x32::<BE>(src.add(col * 2));
            _mm512_storeu_si512(dst.add(col).cast(), _mm512_and_si512(samples, sample_mask));
            col += LANES;
        }
    }
    if col < width {
        scalar::gray_n_to_luma_u16_row::<BITS, BE>(
            &y_plane[col..width],
            &mut out[col..width],
            width - col,
        );
    }
}
/// Converts a row of `BITS`-bit gray samples (stored in `u16`) into planar
/// 8-bit H, S and V rows.
///
/// For `full_range` input the vector loop handles 32 pixels per iteration:
/// `h_out` and `s_out` receive zero, and `v_out` receives each sample ANDed
/// with `bits_mask::<BITS>()`, shifted right by `BITS - 8` and narrowed to a
/// byte. Leftover pixels, and every row with `full_range == false`, are
/// processed by `scalar::gray_n_to_hsv_row`.
///
/// NOTE(review): the shift amount is `BITS - 8`, so this presumably expects
/// `BITS >= 8` — confirm against the callers.
///
/// # Safety
///
/// * The executing CPU must support `avx512f` and `avx512bw`.
/// * `y_plane`, `h_out`, `s_out` and `v_out` must each hold at least `width`
///   elements, because the vector loop accesses them through raw pointers
///   without bounds checks.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn gray_n_to_hsv_row<const BITS: u32, const BE: bool>(
    y_plane: &[u16],
    h_out: &mut [u8],
    s_out: &mut [u8],
    v_out: &mut [u8],
    width: usize,
    full_range: bool,
) {
    debug_assert!(y_plane.len() >= width);
    // The output planes are written with unchecked 32-byte stores below, so
    // they get the same debug-time validation as the input plane.
    debug_assert!(h_out.len() >= width);
    debug_assert!(s_out.len() >= width);
    debug_assert!(v_out.len() >= width);
    if !full_range {
        return scalar::gray_n_to_hsv_row::<BITS, BE>(y_plane, h_out, s_out, v_out, width, full_range);
    }
    let mask = bits_mask::<BITS>();
    let mut x = 0usize;
    // SAFETY: the loop condition keeps `x + 32 <= width`, and the caller
    // guarantees every slice holds at least `width` elements, so the 64-byte
    // load and the 32-byte stores stay inside their slices.
    unsafe {
        let mask_v = _mm512_set1_epi16(mask as i16);
        let shr = _mm_cvtsi32_si128((BITS - 8) as i32);
        let zero256 = _mm256_setzero_si256();
        while x + 32 <= width {
            let raw = load_endian_u16x32::<BE>(y_plane.as_ptr().cast::<u8>().add(x * 2));
            let masked = _mm512_and_si512(raw, mask_v);
            let shifted = _mm512_srl_epi16(masked, shr);
            let packed = _mm512_cvtepi16_epi8(shifted);
            _mm256_storeu_si256(h_out.as_mut_ptr().add(x).cast(), zero256);
            _mm256_storeu_si256(s_out.as_mut_ptr().add(x).cast(), zero256);
            _mm256_storeu_si256(v_out.as_mut_ptr().add(x).cast(), packed);
            x += 32;
        }
    }
    // Fewer than 32 pixels remain: finish with the scalar kernel.
    if x < width {
        scalar::gray_n_to_hsv_row::<BITS, BE>(
            &y_plane[x..width],
            &mut h_out[x..width],
            &mut s_out[x..width],
            &mut v_out[x..width],
            width - x,
            true,
        );
    }
}
/// AVX-512 backend entry point for the gray16 → RGB (`u8`) row conversion.
///
/// Forwards every argument, together with the `BE` const parameter, to
/// `scalar::gray16_to_rgb_row`; no vector code runs here.
///
/// `y_plane` is expected to hold at least `width` samples and `out` at least
/// `width * 3` bytes; both are checked in debug builds only.
///
/// # Safety
///
/// The function is compiled with `avx512f,avx512bw` enabled, so the caller
/// must ensure the executing CPU supports both features.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn gray16_to_rgb_row<const BE: bool>(
y_plane: &[u16],
out: &mut [u8],
width: usize,
full_range: bool,
) {
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 3);
// No dedicated AVX-512 path: the scalar kernel does all the work.
scalar::gray16_to_rgb_row::<BE>(y_plane, out, width, full_range);
}
/// AVX-512 backend entry point for the gray16 → RGBA (`u8`) row conversion.
///
/// Forwards every argument, together with the `BE` const parameter, to
/// `scalar::gray16_to_rgba_row`; no vector code runs here.
///
/// `y_plane` is expected to hold at least `width` samples and `out` at least
/// `width * 4` bytes; both are checked in debug builds only.
///
/// # Safety
///
/// The function is compiled with `avx512f,avx512bw` enabled, so the caller
/// must ensure the executing CPU supports both features.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn gray16_to_rgba_row<const BE: bool>(
y_plane: &[u16],
out: &mut [u8],
width: usize,
full_range: bool,
) {
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 4);
// No dedicated AVX-512 path: the scalar kernel does all the work.
scalar::gray16_to_rgba_row::<BE>(y_plane, out, width, full_range);
}
/// AVX-512 backend entry point for the gray16 → RGB (`u16`) row conversion.
///
/// Forwards every argument, together with the `BE` const parameter, to
/// `scalar::gray16_to_rgb_u16_row`; no vector code runs here.
///
/// `y_plane` is expected to hold at least `width` samples and `out` at least
/// `width * 3` elements; both are checked in debug builds only.
///
/// # Safety
///
/// The function is compiled with `avx512f,avx512bw` enabled, so the caller
/// must ensure the executing CPU supports both features.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn gray16_to_rgb_u16_row<const BE: bool>(
y_plane: &[u16],
out: &mut [u16],
width: usize,
full_range: bool,
) {
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 3);
// No dedicated AVX-512 path: the scalar kernel does all the work.
scalar::gray16_to_rgb_u16_row::<BE>(y_plane, out, width, full_range);
}
/// AVX-512 backend entry point for the gray16 → RGBA (`u16`) row conversion.
///
/// Forwards every argument, together with the `BE` const parameter, to
/// `scalar::gray16_to_rgba_u16_row`; no vector code runs here.
///
/// `y_plane` is expected to hold at least `width` samples and `out` at least
/// `width * 4` elements; both are checked in debug builds only.
///
/// # Safety
///
/// The function is compiled with `avx512f,avx512bw` enabled, so the caller
/// must ensure the executing CPU supports both features.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn gray16_to_rgba_u16_row<const BE: bool>(
y_plane: &[u16],
out: &mut [u16],
width: usize,
full_range: bool,
) {
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 4);
// No dedicated AVX-512 path: the scalar kernel does all the work.
scalar::gray16_to_rgba_u16_row::<BE>(y_plane, out, width, full_range);
}
/// Reduces a row of 16-bit gray samples to 8-bit luma by keeping the upper
/// byte of every sample.
///
/// Thirty-two samples are handled per vector iteration: they are fetched
/// with `load_endian_u16x32::<BE>`, shifted right by 8 and narrowed to one
/// byte each. The remainder is passed to `scalar::gray16_to_luma_row`.
///
/// # Safety
///
/// * The executing CPU must support `avx512f` and `avx512bw`.
/// * `y_plane` and `out` must each hold at least `width` elements; the
///   vector loop uses raw pointers and the lengths are only debug-asserted.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn gray16_to_luma_row<const BE: bool>(
    y_plane: &[u16],
    out: &mut [u8],
    width: usize,
) {
    const LANES: usize = 32;

    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width);

    // First pixel index that no longer fits a whole 32-lane vector.
    let vector_end = width - width % LANES;
    let mut col = 0usize;
    // SAFETY: `col + LANES <= vector_end <= width` inside the loop and the
    // caller guarantees both slices hold at least `width` elements, so the
    // 64-byte load and the 32-byte store stay in bounds.
    unsafe {
        let src = y_plane.as_ptr().cast::<u8>();
        let dst = out.as_mut_ptr();
        while col < vector_end {
            let samples = load_endian_u16x32::<BE>(src.add(col * 2));
            let high_bytes = _mm512_cvtepi16_epi8(_mm512_srli_epi16(samples, 8));
            _mm256_storeu_si256(dst.add(col).cast(), high_bytes);
            col += LANES;
        }
    }
    if col < width {
        scalar::gray16_to_luma_row::<BE>(&y_plane[col..width], &mut out[col..width], width - col);
    }
}
/// Copies a row of 16-bit gray samples into a `u16` luma row.
///
/// Thirty-two samples are handled per vector iteration: they are fetched
/// with `load_endian_u16x32::<BE>` and stored unchanged. The remainder is
/// passed to `scalar::gray16_to_luma_u16_row`.
///
/// # Safety
///
/// * The executing CPU must support `avx512f` and `avx512bw`.
/// * `y_plane` and `out` must each hold at least `width` elements; the
///   vector loop uses raw pointers and the lengths are only debug-asserted.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn gray16_to_luma_u16_row<const BE: bool>(
    y_plane: &[u16],
    out: &mut [u16],
    width: usize,
) {
    const LANES: usize = 32;

    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width);

    // First pixel index that no longer fits a whole 32-lane vector.
    let vector_end = width - width % LANES;
    let mut col = 0usize;
    // SAFETY: `col + LANES <= vector_end <= width` inside the loop and the
    // caller guarantees both slices hold at least `width` elements, so every
    // 64-byte load and store stays in bounds.
    unsafe {
        let src = y_plane.as_ptr().cast::<u8>();
        let dst = out.as_mut_ptr();
        while col < vector_end {
            let samples = load_endian_u16x32::<BE>(src.add(col * 2));
            _mm512_storeu_si512(dst.add(col).cast(), samples);
            col += LANES;
        }
    }
    if col < width {
        scalar::gray16_to_luma_u16_row::<BE>(
            &y_plane[col..width],
            &mut out[col..width],
            width - col,
        );
    }
}
/// Converts a row of 16-bit gray samples into planar 8-bit H, S and V rows.
///
/// For `full_range` input the vector loop handles 32 pixels per iteration:
/// `h_out` and `s_out` receive zero, and `v_out` receives the upper byte of
/// each sample (shift right by 8, then narrow). Leftover pixels, and every
/// row with `full_range == false`, are processed by
/// `scalar::gray16_to_hsv_row`.
///
/// # Safety
///
/// * The executing CPU must support `avx512f` and `avx512bw`.
/// * `y_plane`, `h_out`, `s_out` and `v_out` must each hold at least `width`
///   elements, because the vector loop accesses them through raw pointers
///   without bounds checks.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn gray16_to_hsv_row<const BE: bool>(
    y_plane: &[u16],
    h_out: &mut [u8],
    s_out: &mut [u8],
    v_out: &mut [u8],
    width: usize,
    full_range: bool,
) {
    debug_assert!(y_plane.len() >= width);
    // The output planes are written with unchecked 32-byte stores below, so
    // they get the same debug-time validation as the input plane.
    debug_assert!(h_out.len() >= width);
    debug_assert!(s_out.len() >= width);
    debug_assert!(v_out.len() >= width);
    if !full_range {
        return scalar::gray16_to_hsv_row::<BE>(y_plane, h_out, s_out, v_out, width, full_range);
    }
    let mut x = 0usize;
    // SAFETY: the loop condition keeps `x + 32 <= width`, and the caller
    // guarantees every slice holds at least `width` elements, so the 64-byte
    // load and the 32-byte stores stay inside their slices.
    unsafe {
        let zero256 = _mm256_setzero_si256();
        while x + 32 <= width {
            let raw = load_endian_u16x32::<BE>(y_plane.as_ptr().cast::<u8>().add(x * 2));
            let shifted = _mm512_srli_epi16(raw, 8);
            let packed = _mm512_cvtepi16_epi8(shifted);
            _mm256_storeu_si256(h_out.as_mut_ptr().add(x).cast(), zero256);
            _mm256_storeu_si256(s_out.as_mut_ptr().add(x).cast(), zero256);
            _mm256_storeu_si256(v_out.as_mut_ptr().add(x).cast(), packed);
            x += 32;
        }
    }
    // Fewer than 32 pixels remain: finish with the scalar kernel.
    if x < width {
        scalar::gray16_to_hsv_row::<BE>(
            &y_plane[x..width],
            &mut h_out[x..width],
            &mut s_out[x..width],
            &mut v_out[x..width],
            width - x,
            true,
        );
    }
}
/// Converts a row of `f32` gray samples into packed 8-bit RGB.
///
/// Sixteen samples are handled per vector iteration: each is clamped to
/// `[0.0, 1.0]`, multiplied by 255, biased by 0.5, truncated to an integer
/// and narrowed to a byte with unsigned saturation. The byte is then written
/// to all three channels of the pixel. Leftover pixels are processed by
/// `scalar::grayf32_to_rgb_row`.
///
/// # Panics
///
/// Panics if `out` is shorter than the range being written.
///
/// # Safety
///
/// * The executing CPU must support `avx512f` and `avx512bw`.
/// * `y_plane` must hold at least `width` samples: it is read through a raw
///   pointer and its length is only debug-asserted.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn grayf32_to_rgb_row<const BE: bool>(
    y_plane: &[f32],
    out: &mut [u8],
    width: usize,
) {
    use crate::row::scalar::grayf32 as scalar;
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 3);
    let scale = _mm512_set1_ps(255.0);
    let mut x = 0usize;
    // SAFETY: the loop condition keeps `x + 16 <= width` and the caller
    // guarantees `y_plane.len() >= width`, so the 64-byte load is in bounds.
    // The 16-byte store targets the local `ybuf`, which is exactly 16 bytes.
    unsafe {
        // Loop-invariant constants, built once.
        let zero = _mm512_setzero_ps();
        let one = _mm512_set1_ps(1.0);
        let half = _mm512_set1_ps(0.5);
        while x + 16 <= width {
            let y = _mm512_castsi512_ps(load_endian_u32x16::<BE>(
                y_plane.as_ptr().cast::<u8>().add(x * 4),
            ));
            let clamped = _mm512_min_ps(_mm512_max_ps(y, zero), one);
            let int32 = _mm512_cvttps_epi32(_mm512_add_ps(_mm512_mul_ps(clamped, scale), half));
            let pack8: __m128i = _mm512_cvtusepi32_epi8(int32);
            let mut ybuf = [0u8; 16];
            _mm_storeu_si128(ybuf.as_mut_ptr().cast(), pack8);
            // One range check for the whole 16-pixel chunk instead of one
            // per written byte.
            let dst = &mut out[x * 3..(x + 16) * 3];
            for (px, &v) in dst.chunks_exact_mut(3).zip(ybuf.iter()) {
                px.fill(v);
            }
            x += 16;
        }
    }
    // Fewer than 16 pixels remain: finish with the scalar kernel.
    if x < width {
        scalar::grayf32_to_rgb_row::<BE>(&y_plane[x..width], &mut out[x * 3..width * 3], width - x);
    }
}
/// Converts a row of `f32` gray samples into packed 8-bit RGBA.
///
/// Sixteen samples are handled per vector iteration: each is clamped to
/// `[0.0, 1.0]`, multiplied by 255, biased by 0.5, truncated to an integer
/// and narrowed to a byte with unsigned saturation. The byte is written to
/// the first three channels of the pixel and the fourth channel is set to
/// `0xFF`. Leftover pixels are processed by `scalar::grayf32_to_rgba_row`.
///
/// # Panics
///
/// Panics if `out` is shorter than the range being written.
///
/// # Safety
///
/// * The executing CPU must support `avx512f` and `avx512bw`.
/// * `y_plane` must hold at least `width` samples: it is read through a raw
///   pointer and its length is only debug-asserted.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn grayf32_to_rgba_row<const BE: bool>(
    y_plane: &[f32],
    out: &mut [u8],
    width: usize,
) {
    use crate::row::scalar::grayf32 as scalar;
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 4);
    let scale = _mm512_set1_ps(255.0);
    let mut x = 0usize;
    // SAFETY: the loop condition keeps `x + 16 <= width` and the caller
    // guarantees `y_plane.len() >= width`, so the 64-byte load is in bounds.
    // The 16-byte store targets the local `ybuf`, which is exactly 16 bytes.
    unsafe {
        // Loop-invariant constants, built once.
        let zero = _mm512_setzero_ps();
        let one = _mm512_set1_ps(1.0);
        let half = _mm512_set1_ps(0.5);
        while x + 16 <= width {
            let y = _mm512_castsi512_ps(load_endian_u32x16::<BE>(
                y_plane.as_ptr().cast::<u8>().add(x * 4),
            ));
            let clamped = _mm512_min_ps(_mm512_max_ps(y, zero), one);
            let int32 = _mm512_cvttps_epi32(_mm512_add_ps(_mm512_mul_ps(clamped, scale), half));
            let pack8: __m128i = _mm512_cvtusepi32_epi8(int32);
            let mut ybuf = [0u8; 16];
            _mm_storeu_si128(ybuf.as_mut_ptr().cast(), pack8);
            // One range check for the whole 16-pixel chunk instead of one
            // per written byte.
            let dst = &mut out[x * 4..(x + 16) * 4];
            for (px, &v) in dst.chunks_exact_mut(4).zip(ybuf.iter()) {
                px.copy_from_slice(&[v, v, v, 0xFF]);
            }
            x += 16;
        }
    }
    // Fewer than 16 pixels remain: finish with the scalar kernel.
    if x < width {
        scalar::grayf32_to_rgba_row::<BE>(&y_plane[x..width], &mut out[x * 4..width * 4], width - x);
    }
}
/// Converts a row of `f32` gray samples into packed 16-bit RGB.
///
/// Sixteen samples are handled per vector iteration: each is clamped to
/// `[0.0, 1.0]`, multiplied by 65535, biased by 0.5, truncated to an integer
/// and narrowed to `u16` with unsigned saturation. The value is then written
/// to all three channels of the pixel. Leftover pixels are processed by
/// `scalar::grayf32_to_rgb_u16_row`.
///
/// # Panics
///
/// Panics if `out` is shorter than the range being written.
///
/// # Safety
///
/// * The executing CPU must support `avx512f` and `avx512bw`.
/// * `y_plane` must hold at least `width` samples: it is read through a raw
///   pointer and its length is only debug-asserted.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn grayf32_to_rgb_u16_row<const BE: bool>(
    y_plane: &[f32],
    out: &mut [u16],
    width: usize,
) {
    use crate::row::scalar::grayf32 as scalar;
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 3);
    let scale = _mm512_set1_ps(65535.0);
    let mut x = 0usize;
    // SAFETY: the loop condition keeps `x + 16 <= width` and the caller
    // guarantees `y_plane.len() >= width`, so the 64-byte load is in bounds.
    // The 32-byte store targets the local `vbuf`, which is exactly 32 bytes.
    unsafe {
        // Loop-invariant constants, built once.
        let zero = _mm512_setzero_ps();
        let one = _mm512_set1_ps(1.0);
        let half = _mm512_set1_ps(0.5);
        while x + 16 <= width {
            let y = _mm512_castsi512_ps(load_endian_u32x16::<BE>(
                y_plane.as_ptr().cast::<u8>().add(x * 4),
            ));
            let clamped = _mm512_min_ps(_mm512_max_ps(y, zero), one);
            let int32 = _mm512_cvttps_epi32(_mm512_add_ps(_mm512_mul_ps(clamped, scale), half));
            let pack16: __m256i = _mm512_cvtusepi32_epi16(int32);
            let mut vbuf = [0u16; 16];
            _mm256_storeu_si256(vbuf.as_mut_ptr().cast(), pack16);
            // One range check for the whole 16-pixel chunk instead of one
            // per written element.
            let dst = &mut out[x * 3..(x + 16) * 3];
            for (px, &v) in dst.chunks_exact_mut(3).zip(vbuf.iter()) {
                px.fill(v);
            }
            x += 16;
        }
    }
    // Fewer than 16 pixels remain: finish with the scalar kernel.
    if x < width {
        scalar::grayf32_to_rgb_u16_row::<BE>(
            &y_plane[x..width],
            &mut out[x * 3..width * 3],
            width - x,
        );
    }
}
/// Converts a row of `f32` gray samples into packed 16-bit RGBA.
///
/// Sixteen samples are handled per vector iteration: each is clamped to
/// `[0.0, 1.0]`, multiplied by 65535, biased by 0.5, truncated to an integer
/// and narrowed to `u16` with unsigned saturation. The value is written to
/// the first three channels of the pixel and the fourth channel is set to
/// `0xFFFF`. Leftover pixels are processed by
/// `scalar::grayf32_to_rgba_u16_row`.
///
/// # Panics
///
/// Panics if `out` is shorter than the range being written.
///
/// # Safety
///
/// * The executing CPU must support `avx512f` and `avx512bw`.
/// * `y_plane` must hold at least `width` samples: it is read through a raw
///   pointer and its length is only debug-asserted.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn grayf32_to_rgba_u16_row<const BE: bool>(
    y_plane: &[f32],
    out: &mut [u16],
    width: usize,
) {
    use crate::row::scalar::grayf32 as scalar;
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 4);
    let scale = _mm512_set1_ps(65535.0);
    let mut x = 0usize;
    // SAFETY: the loop condition keeps `x + 16 <= width` and the caller
    // guarantees `y_plane.len() >= width`, so the 64-byte load is in bounds.
    // The 32-byte store targets the local `vbuf`, which is exactly 32 bytes.
    unsafe {
        // Loop-invariant constants, built once.
        let zero = _mm512_setzero_ps();
        let one = _mm512_set1_ps(1.0);
        let half = _mm512_set1_ps(0.5);
        while x + 16 <= width {
            let y = _mm512_castsi512_ps(load_endian_u32x16::<BE>(
                y_plane.as_ptr().cast::<u8>().add(x * 4),
            ));
            let clamped = _mm512_min_ps(_mm512_max_ps(y, zero), one);
            let int32 = _mm512_cvttps_epi32(_mm512_add_ps(_mm512_mul_ps(clamped, scale), half));
            let pack16: __m256i = _mm512_cvtusepi32_epi16(int32);
            let mut vbuf = [0u16; 16];
            _mm256_storeu_si256(vbuf.as_mut_ptr().cast(), pack16);
            // One range check for the whole 16-pixel chunk instead of one
            // per written element.
            let dst = &mut out[x * 4..(x + 16) * 4];
            for (px, &v) in dst.chunks_exact_mut(4).zip(vbuf.iter()) {
                px.copy_from_slice(&[v, v, v, 0xFFFF]);
            }
            x += 16;
        }
    }
    // Fewer than 16 pixels remain: finish with the scalar kernel.
    if x < width {
        scalar::grayf32_to_rgba_u16_row::<BE>(
            &y_plane[x..width],
            &mut out[x * 4..width * 4],
            width - x,
        );
    }
}
/// AVX-512 backend entry point for the grayf32 → RGB (`f32`) row conversion.
///
/// Forwards every argument, together with the `BE` const parameter, to
/// `scalar::grayf32::grayf32_to_rgb_f32_row`; no vector code runs here.
///
/// `y_plane` is expected to hold at least `width` samples and `out` at least
/// `width * 3` elements; both are checked in debug builds only.
///
/// # Safety
///
/// The function is compiled with `avx512f,avx512bw` enabled, so the caller
/// must ensure the executing CPU supports both features.
#[allow(dead_code)] #[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn grayf32_to_rgb_f32_row<const BE: bool>(
y_plane: &[f32],
out: &mut [f32],
width: usize,
) {
use crate::row::scalar::grayf32 as scalar;
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 3);
// No dedicated AVX-512 path: the scalar kernel does all the work.
scalar::grayf32_to_rgb_f32_row::<BE>(y_plane, out, width);
}
/// Converts a row of `f32` gray samples into 8-bit luma.
///
/// Sixteen samples are handled per vector iteration: each is clamped to
/// `[0.0, 1.0]`, multiplied by 255, biased by 0.5, truncated to an integer
/// and narrowed to a byte with unsigned saturation. The remainder is passed
/// to `scalar::grayf32_to_luma_row`.
///
/// # Safety
///
/// * The executing CPU must support `avx512f` and `avx512bw`.
/// * `y_plane` and `out` must each hold at least `width` elements; the
///   vector loop uses raw pointers and the lengths are only debug-asserted.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn grayf32_to_luma_row<const BE: bool>(
    y_plane: &[f32],
    out: &mut [u8],
    width: usize,
) {
    use crate::row::scalar::grayf32 as scalar;
    const LANES: usize = 16;

    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width);

    let scale = _mm512_set1_ps(255.0);
    // First pixel index that no longer fits a whole 16-lane vector.
    let vector_end = width - width % LANES;
    let mut col = 0usize;
    // SAFETY: `col + LANES <= vector_end <= width` inside the loop and the
    // caller guarantees both slices hold at least `width` elements, so the
    // 64-byte load and the 16-byte store stay in bounds.
    unsafe {
        let floor = _mm512_setzero_ps();
        let ceiling = _mm512_set1_ps(1.0);
        let bias = _mm512_set1_ps(0.5);
        let src = y_plane.as_ptr().cast::<u8>();
        while col < vector_end {
            let lanes = _mm512_castsi512_ps(load_endian_u32x16::<BE>(src.add(col * 4)));
            let unit = _mm512_min_ps(_mm512_max_ps(lanes, floor), ceiling);
            let scaled = _mm512_add_ps(_mm512_mul_ps(unit, scale), bias);
            let bytes: __m128i = _mm512_cvtusepi32_epi8(_mm512_cvttps_epi32(scaled));
            _mm_storeu_si128(out.as_mut_ptr().add(col).cast(), bytes);
            col += LANES;
        }
    }
    if col < width {
        scalar::grayf32_to_luma_row::<BE>(&y_plane[col..width], &mut out[col..width], width - col);
    }
}
/// Converts a row of `f32` gray samples into 16-bit luma.
///
/// Sixteen samples are handled per vector iteration: each is clamped to
/// `[0.0, 1.0]`, multiplied by 65535, biased by 0.5, truncated to an integer
/// and narrowed to `u16` with unsigned saturation. The remainder is passed
/// to `scalar::grayf32_to_luma_u16_row`.
///
/// # Safety
///
/// * The executing CPU must support `avx512f` and `avx512bw`.
/// * `y_plane` and `out` must each hold at least `width` elements; the
///   vector loop uses raw pointers and the lengths are only debug-asserted.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn grayf32_to_luma_u16_row<const BE: bool>(
    y_plane: &[f32],
    out: &mut [u16],
    width: usize,
) {
    use crate::row::scalar::grayf32 as scalar;
    const LANES: usize = 16;

    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width);

    let scale = _mm512_set1_ps(65535.0);
    // First pixel index that no longer fits a whole 16-lane vector.
    let vector_end = width - width % LANES;
    let mut col = 0usize;
    // SAFETY: `col + LANES <= vector_end <= width` inside the loop and the
    // caller guarantees both slices hold at least `width` elements, so the
    // 64-byte load and the 32-byte store stay in bounds.
    unsafe {
        let floor = _mm512_setzero_ps();
        let ceiling = _mm512_set1_ps(1.0);
        let bias = _mm512_set1_ps(0.5);
        let src = y_plane.as_ptr().cast::<u8>();
        while col < vector_end {
            let lanes = _mm512_castsi512_ps(load_endian_u32x16::<BE>(src.add(col * 4)));
            let unit = _mm512_min_ps(_mm512_max_ps(lanes, floor), ceiling);
            let scaled = _mm512_add_ps(_mm512_mul_ps(unit, scale), bias);
            let words: __m256i = _mm512_cvtusepi32_epi16(_mm512_cvttps_epi32(scaled));
            _mm256_storeu_si256(out.as_mut_ptr().add(col).cast(), words);
            col += LANES;
        }
    }
    if col < width {
        scalar::grayf32_to_luma_u16_row::<BE>(
            &y_plane[col..width],
            &mut out[col..width],
            width - col,
        );
    }
}
/// AVX-512 backend entry point for the grayf32 → luma (`f32`) row conversion.
///
/// Forwards every argument, together with the `BE` const parameter, to
/// `scalar::grayf32::grayf32_to_luma_f32_row`; no vector code runs here.
///
/// `y_plane` and `out` are each expected to hold at least `width` elements;
/// both are checked in debug builds only.
///
/// # Safety
///
/// The function is compiled with `avx512f,avx512bw` enabled, so the caller
/// must ensure the executing CPU supports both features.
#[allow(dead_code)] #[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn grayf32_to_luma_f32_row<const BE: bool>(
y_plane: &[f32],
out: &mut [f32],
width: usize,
) {
use crate::row::scalar::grayf32 as scalar;
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width);
// No dedicated AVX-512 path: the scalar kernel does all the work.
scalar::grayf32_to_luma_f32_row::<BE>(y_plane, out, width);
}
/// Converts a row of `f32` gray samples into planar 8-bit H, S and V rows.
///
/// Sixteen samples are handled per vector iteration: `h_out` and `s_out`
/// receive zero, and `v_out` receives each sample clamped to `[0.0, 1.0]`,
/// multiplied by 255, biased by 0.5, truncated and narrowed to a byte with
/// unsigned saturation. Leftover pixels are processed by
/// `scalar::grayf32_to_hsv_row`.
///
/// # Safety
///
/// * The executing CPU must support `avx512f` and `avx512bw`.
/// * `y_plane`, `h_out`, `s_out` and `v_out` must each hold at least `width`
///   elements, because the vector loop accesses them through raw pointers
///   without bounds checks.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn grayf32_to_hsv_row<const BE: bool>(
    y_plane: &[f32],
    h_out: &mut [u8],
    s_out: &mut [u8],
    v_out: &mut [u8],
    width: usize,
) {
    use crate::row::scalar::grayf32 as scalar;
    debug_assert!(y_plane.len() >= width);
    // The output planes are written with unchecked 16-byte stores below, so
    // they get the same debug-time validation as the input plane.
    debug_assert!(h_out.len() >= width);
    debug_assert!(s_out.len() >= width);
    debug_assert!(v_out.len() >= width);
    let scale = _mm512_set1_ps(255.0);
    let mut x = 0usize;
    // SAFETY: the loop condition keeps `x + 16 <= width`, and the caller
    // guarantees every slice holds at least `width` elements, so the 64-byte
    // load and the 16-byte stores stay inside their slices.
    unsafe {
        // Loop-invariant constants, built once.
        let zero = _mm512_setzero_ps();
        let one = _mm512_set1_ps(1.0);
        let half = _mm512_set1_ps(0.5);
        let zero128 = _mm_setzero_si128();
        while x + 16 <= width {
            let y = _mm512_castsi512_ps(load_endian_u32x16::<BE>(
                y_plane.as_ptr().cast::<u8>().add(x * 4),
            ));
            let clamped = _mm512_min_ps(_mm512_max_ps(y, zero), one);
            let int32 = _mm512_cvttps_epi32(_mm512_add_ps(_mm512_mul_ps(clamped, scale), half));
            let pack8: __m128i = _mm512_cvtusepi32_epi8(int32);
            _mm_storeu_si128(h_out.as_mut_ptr().add(x).cast(), zero128);
            _mm_storeu_si128(s_out.as_mut_ptr().add(x).cast(), zero128);
            _mm_storeu_si128(v_out.as_mut_ptr().add(x).cast(), pack8);
            x += 16;
        }
    }
    // Fewer than 16 pixels remain: finish with the scalar kernel.
    if x < width {
        scalar::grayf32_to_hsv_row::<BE>(
            &y_plane[x..width],
            &mut h_out[x..width],
            &mut s_out[x..width],
            &mut v_out[x..width],
            width - x,
        );
    }
}
/// Converts a row of interleaved two-byte pixels into packed 8-bit RGB.
///
/// Eight pixels (16 input bytes) are handled per vector iteration: a byte
/// shuffle gathers the even-indexed bytes (the first byte of every pair),
/// and each gathered byte is written to all three channels of its pixel.
/// The second byte of every pair is ignored. Leftover pixels are processed
/// by `scalar::ya8_to_rgb_row`.
///
/// # Panics
///
/// Panics if `out` is shorter than the range being written.
///
/// # Safety
///
/// * The executing CPU must support `avx512f` and `avx512bw`.
/// * `packed` must hold at least `width * 2` bytes: it is read through a
///   raw pointer and its length is only debug-asserted.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn ya8_to_rgb_row(packed: &[u8], out: &mut [u8], width: usize) {
    use crate::row::scalar::ya8 as scalar;
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width * 3);
    let mut x = 0usize;
    // SAFETY: the loop condition keeps `x + 8 <= width` and the caller
    // guarantees `packed.len() >= width * 2`, so the 16-byte load starting at
    // byte `x * 2` is in bounds.
    unsafe {
        // Selects bytes 0, 2, 4, ... 14 into the low 8 lanes; -128 zeroes a lane.
        let y_mask = _mm_set_epi8(
            -128, -128, -128, -128, -128, -128, -128, -128, 14, 12, 10, 8, 6, 4, 2, 0,
        );
        while x + 8 <= width {
            let chunk = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast());
            let y_bytes = _mm_shuffle_epi8(chunk, y_mask);
            let ybuf = (_mm_cvtsi128_si64(y_bytes) as u64).to_le_bytes();
            // One range check for the whole 8-pixel chunk instead of one per
            // written byte.
            let dst = &mut out[x * 3..(x + 8) * 3];
            for (px, &v) in dst.chunks_exact_mut(3).zip(ybuf.iter()) {
                px.fill(v);
            }
            x += 8;
        }
    }
    // Fewer than 8 pixels remain: finish with the scalar kernel.
    if x < width {
        scalar::ya8_to_rgb_row(
            &packed[x * 2..width * 2],
            &mut out[x * 3..width * 3],
            width - x,
        );
    }
}
/// Converts a row of interleaved two-byte pixels into packed 8-bit RGBA.
///
/// Eight pixels (16 input bytes) are handled per vector iteration: one byte
/// shuffle gathers the even-indexed bytes and another the odd-indexed bytes.
/// Each even byte is written to the first three channels of its pixel and
/// the matching odd byte to the fourth. Leftover pixels are processed by
/// `scalar::ya8_to_rgba_row`.
///
/// # Panics
///
/// Panics if `out` is shorter than the range being written.
///
/// # Safety
///
/// * The executing CPU must support `avx512f` and `avx512bw`.
/// * `packed` must hold at least `width * 2` bytes: it is read through a
///   raw pointer and its length is only debug-asserted.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn ya8_to_rgba_row(packed: &[u8], out: &mut [u8], width: usize) {
    use crate::row::scalar::ya8 as scalar;
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width * 4);
    let mut x = 0usize;
    // SAFETY: the loop condition keeps `x + 8 <= width` and the caller
    // guarantees `packed.len() >= width * 2`, so the 16-byte load starting at
    // byte `x * 2` is in bounds.
    unsafe {
        // Even bytes (first of each pair) into the low 8 lanes; -128 zeroes a lane.
        let y_mask = _mm_set_epi8(
            -128, -128, -128, -128, -128, -128, -128, -128, 14, 12, 10, 8, 6, 4, 2, 0,
        );
        // Odd bytes (second of each pair) into the low 8 lanes.
        let a_mask = _mm_set_epi8(
            -128, -128, -128, -128, -128, -128, -128, -128, 15, 13, 11, 9, 7, 5, 3, 1,
        );
        while x + 8 <= width {
            let chunk = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast());
            let y_bytes = _mm_shuffle_epi8(chunk, y_mask);
            let a_bytes = _mm_shuffle_epi8(chunk, a_mask);
            let ybuf = (_mm_cvtsi128_si64(y_bytes) as u64).to_le_bytes();
            let abuf = (_mm_cvtsi128_si64(a_bytes) as u64).to_le_bytes();
            // One range check for the whole 8-pixel chunk instead of one per
            // written byte.
            let dst = &mut out[x * 4..(x + 8) * 4];
            for ((px, &y), &a) in dst.chunks_exact_mut(4).zip(ybuf.iter()).zip(abuf.iter()) {
                px.copy_from_slice(&[y, y, y, a]);
            }
            x += 8;
        }
    }
    // Fewer than 8 pixels remain: finish with the scalar kernel.
    if x < width {
        scalar::ya8_to_rgba_row(
            &packed[x * 2..width * 2],
            &mut out[x * 4..width * 4],
            width - x,
        );
    }
}
/// AVX-512 backend entry point for the ya8 → RGB (`u16`) row conversion.
///
/// Forwards every argument unchanged to `scalar::ya8::ya8_to_rgb_u16_row`;
/// no vector code runs here.
///
/// `packed` is expected to hold at least `width * 2` bytes and `out` at
/// least `width * 3` elements; both are checked in debug builds only.
///
/// # Safety
///
/// The function is compiled with `avx512f,avx512bw` enabled, so the caller
/// must ensure the executing CPU supports both features.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn ya8_to_rgb_u16_row(packed: &[u8], out: &mut [u16], width: usize) {
use crate::row::scalar::ya8 as scalar;
debug_assert!(packed.len() >= width * 2);
debug_assert!(out.len() >= width * 3);
// No dedicated AVX-512 path: the scalar kernel does all the work.
scalar::ya8_to_rgb_u16_row(packed, out, width);
}
/// AVX-512 backend entry point for the ya8 → RGBA (`u16`) row conversion.
///
/// Forwards every argument unchanged to `scalar::ya8::ya8_to_rgba_u16_row`;
/// no vector code runs here.
///
/// `packed` is expected to hold at least `width * 2` bytes and `out` at
/// least `width * 4` elements; both are checked in debug builds only.
///
/// # Safety
///
/// The function is compiled with `avx512f,avx512bw` enabled, so the caller
/// must ensure the executing CPU supports both features.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn ya8_to_rgba_u16_row(packed: &[u8], out: &mut [u16], width: usize) {
use crate::row::scalar::ya8 as scalar;
debug_assert!(packed.len() >= width * 2);
debug_assert!(out.len() >= width * 4);
// No dedicated AVX-512 path: the scalar kernel does all the work.
scalar::ya8_to_rgba_u16_row(packed, out, width);
}
/// Extracts the first byte of every interleaved two-byte pixel into an
/// 8-bit luma row.
///
/// Eight pixels (16 input bytes) are handled per vector iteration: a byte
/// shuffle gathers the even-indexed bytes into the low 64 bits, which are
/// then copied to `out`. The remainder is passed to
/// `scalar::ya8_to_luma_row`.
///
/// # Panics
///
/// Panics if `out` is shorter than the range being written.
///
/// # Safety
///
/// * The executing CPU must support `avx512f` and `avx512bw`.
/// * `packed` must hold at least `width * 2` bytes: it is read through a
///   raw pointer and its length is only debug-asserted.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn ya8_to_luma_row(packed: &[u8], out: &mut [u8], width: usize) {
    use crate::row::scalar::ya8 as scalar;
    const PIXELS: usize = 8;

    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width);

    // First pixel index that no longer fits a whole 8-pixel vector.
    let vector_end = width - width % PIXELS;
    let mut col = 0usize;
    // SAFETY: `col + PIXELS <= vector_end <= width` inside the loop and the
    // caller guarantees `packed.len() >= width * 2`, so the 16-byte load
    // starting at byte `col * 2` is in bounds.
    unsafe {
        // Selects bytes 0, 2, 4, ... 14 into the low 8 lanes; -128 zeroes a lane.
        let pick_first = _mm_set_epi8(
            -128, -128, -128, -128, -128, -128, -128, -128, 14, 12, 10, 8, 6, 4, 2, 0,
        );
        while col < vector_end {
            let pairs = _mm_loadu_si128(packed.as_ptr().add(col * 2).cast());
            let gathered = _mm_cvtsi128_si64(_mm_shuffle_epi8(pairs, pick_first)) as u64;
            out[col..col + PIXELS].copy_from_slice(&gathered.to_le_bytes());
            col += PIXELS;
        }
    }
    if col < width {
        scalar::ya8_to_luma_row(&packed[col * 2..width * 2], &mut out[col..width], width - col);
    }
}
/// AVX-512 backend entry point for the ya8 → luma (`u16`) row conversion.
///
/// Forwards every argument unchanged to `scalar::ya8::ya8_to_luma_u16_row`;
/// no vector code runs here.
///
/// `packed` is expected to hold at least `width * 2` bytes and `out` at
/// least `width` elements; both are checked in debug builds only.
///
/// # Safety
///
/// The function is compiled with `avx512f,avx512bw` enabled, so the caller
/// must ensure the executing CPU supports both features.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn ya8_to_luma_u16_row(packed: &[u8], out: &mut [u16], width: usize) {
use crate::row::scalar::ya8 as scalar;
debug_assert!(packed.len() >= width * 2);
debug_assert!(out.len() >= width);
// No dedicated AVX-512 path: the scalar kernel does all the work.
scalar::ya8_to_luma_u16_row(packed, out, width);
}
/// Converts a row of interleaved two-byte pixels into planar 8-bit H, S and
/// V rows.
///
/// Eight pixels (16 input bytes) are handled per vector iteration: `h_out`
/// and `s_out` receive zero, and `v_out` receives the first byte of every
/// pair. The second byte of every pair is ignored. Leftover pixels are
/// processed by `scalar::ya8_to_hsv_row`.
///
/// # Panics
///
/// Panics if any output plane is shorter than the range being written.
///
/// # Safety
///
/// * The executing CPU must support `avx512f` and `avx512bw`.
/// * `packed` must hold at least `width * 2` bytes: it is read through a
///   raw pointer and its length is only debug-asserted.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn ya8_to_hsv_row(
    packed: &[u8],
    h_out: &mut [u8],
    s_out: &mut [u8],
    v_out: &mut [u8],
    width: usize,
) {
    use crate::row::scalar::ya8 as scalar;
    debug_assert!(packed.len() >= width * 2);
    // Validate the output planes up front, like the other kernels in this
    // file do for `out`.
    debug_assert!(h_out.len() >= width);
    debug_assert!(s_out.len() >= width);
    debug_assert!(v_out.len() >= width);
    let mut x = 0usize;
    // SAFETY: the loop condition keeps `x + 8 <= width` and the caller
    // guarantees `packed.len() >= width * 2`, so the 16-byte load starting at
    // byte `x * 2` is in bounds. All writes go through checked slice indexing.
    unsafe {
        // Selects bytes 0, 2, 4, ... 14 into the low 8 lanes; -128 zeroes a lane.
        let y_mask = _mm_set_epi8(
            -128, -128, -128, -128, -128, -128, -128, -128, 14, 12, 10, 8, 6, 4, 2, 0,
        );
        while x + 8 <= width {
            let chunk = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast());
            let y_bytes = _mm_shuffle_epi8(chunk, y_mask);
            let val = _mm_cvtsi128_si64(y_bytes) as u64;
            let vbytes = val.to_le_bytes();
            h_out[x..x + 8].fill(0);
            s_out[x..x + 8].fill(0);
            v_out[x..x + 8].copy_from_slice(&vbytes);
            x += 8;
        }
    }
    // Fewer than 8 pixels remain: finish with the scalar kernel.
    if x < width {
        scalar::ya8_to_hsv_row(
            &packed[x * 2..width * 2],
            &mut h_out[x..width],
            &mut s_out[x..width],
            &mut v_out[x..width],
            width - x,
        );
    }
}
/// `true` when the compilation target is big-endian, `false` otherwise.
///
/// The ya16 kernels in this file compare their `BE` const parameter against
/// this value and fall back to the scalar implementation when the two differ.
const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");
/// Converts a row of interleaved two-`u16` pixels into packed 8-bit RGB.
///
/// When `BE` differs from `HOST_NATIVE_BE` the whole row is delegated to
/// `scalar::ya16_to_rgb_row`. Otherwise four pixels (16 input bytes) are
/// handled per vector iteration: a byte shuffle gathers the first `u16` of
/// every pair, each is shifted right by 8 and packed to a byte, and that
/// byte is written to all three channels of its pixel. The second `u16` of
/// every pair is ignored. Leftover pixels go to the scalar kernel.
///
/// # Panics
///
/// Panics if `out` is shorter than the range being written.
///
/// # Safety
///
/// * The executing CPU must support `avx512f` and `avx512bw`.
/// * `packed` must hold at least `width * 2` elements: it is read through a
///   raw pointer and its length is only debug-asserted.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn ya16_to_rgb_row<const BE: bool>(packed: &[u16], out: &mut [u8], width: usize) {
    use crate::row::scalar::ya16 as scalar;
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width * 3);
    // The vector path reads samples in host byte order only.
    if BE != HOST_NATIVE_BE {
        return scalar::ya16_to_rgb_row::<BE>(packed, out, width);
    }
    let mut x = 0usize;
    // SAFETY: the loop condition keeps `x + 4 <= width` and the caller
    // guarantees `packed.len() >= width * 2`, so the 16-byte load starting at
    // element `x * 2` is in bounds.
    unsafe {
        // Gathers the first u16 of each pair (bytes 0-1, 4-5, 8-9, 12-13)
        // into the low four 16-bit lanes; -128 zeroes a lane.
        let y_mask = _mm_set_epi8(
            -128, -128, -128, -128, -128, -128, -128, -128, 13, 12, 9, 8, 5, 4, 1, 0,
        );
        while x + 4 <= width {
            let chunk = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast::<__m128i>());
            let y_words = _mm_shuffle_epi8(chunk, y_mask);
            let y_shifted = _mm_srli_epi16(y_words, 8);
            let pack8 = _mm_packus_epi16(y_shifted, _mm_setzero_si128());
            let ybuf = (_mm_cvtsi128_si32(pack8) as u32).to_le_bytes();
            // One range check for the whole 4-pixel chunk instead of one per
            // written byte.
            let dst = &mut out[x * 3..(x + 4) * 3];
            for (px, &v) in dst.chunks_exact_mut(3).zip(ybuf.iter()) {
                px.fill(v);
            }
            x += 4;
        }
    }
    // Fewer than 4 pixels remain: finish with the scalar kernel.
    if x < width {
        scalar::ya16_to_rgb_row::<BE>(
            &packed[x * 2..width * 2],
            &mut out[x * 3..width * 3],
            width - x,
        );
    }
}
/// Converts a row of interleaved two-`u16` pixels into packed 8-bit RGBA.
///
/// When `BE` differs from `HOST_NATIVE_BE` the whole row is delegated to
/// `scalar::ya16_to_rgba_row`. Otherwise four pixels (16 input bytes) are
/// handled per vector iteration: byte shuffles separate the first and second
/// `u16` of every pair, both are shifted right by 8 and packed to bytes.
/// The first byte is written to the first three channels of the pixel and
/// the second byte to the fourth. Leftover pixels go to the scalar kernel.
///
/// # Panics
///
/// Panics if `out` is shorter than the range being written.
///
/// # Safety
///
/// * The executing CPU must support `avx512f` and `avx512bw`.
/// * `packed` must hold at least `width * 2` elements: it is read through a
///   raw pointer and its length is only debug-asserted.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn ya16_to_rgba_row<const BE: bool>(
    packed: &[u16],
    out: &mut [u8],
    width: usize,
) {
    use crate::row::scalar::ya16 as scalar;
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width * 4);
    // The vector path reads samples in host byte order only.
    if BE != HOST_NATIVE_BE {
        return scalar::ya16_to_rgba_row::<BE>(packed, out, width);
    }
    let mut x = 0usize;
    // SAFETY: the loop condition keeps `x + 4 <= width` and the caller
    // guarantees `packed.len() >= width * 2`, so the 16-byte load starting at
    // element `x * 2` is in bounds.
    unsafe {
        // First u16 of each pair (bytes 0-1, 4-5, 8-9, 12-13); -128 zeroes a lane.
        let y_mask = _mm_set_epi8(
            -128, -128, -128, -128, -128, -128, -128, -128, 13, 12, 9, 8, 5, 4, 1, 0,
        );
        // Second u16 of each pair (bytes 2-3, 6-7, 10-11, 14-15).
        let a_mask = _mm_set_epi8(
            -128, -128, -128, -128, -128, -128, -128, -128, 15, 14, 11, 10, 7, 6, 3, 2,
        );
        let zero = _mm_setzero_si128();
        while x + 4 <= width {
            let chunk = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast::<__m128i>());
            let y_words = _mm_shuffle_epi8(chunk, y_mask);
            let a_words = _mm_shuffle_epi8(chunk, a_mask);
            let y8 = _mm_packus_epi16(_mm_srli_epi16(y_words, 8), zero);
            let a8 = _mm_packus_epi16(_mm_srli_epi16(a_words, 8), zero);
            let ybuf = (_mm_cvtsi128_si32(y8) as u32).to_le_bytes();
            let abuf = (_mm_cvtsi128_si32(a8) as u32).to_le_bytes();
            // One range check for the whole 4-pixel chunk instead of one per
            // written byte.
            let dst = &mut out[x * 4..(x + 4) * 4];
            for ((px, &y), &a) in dst.chunks_exact_mut(4).zip(ybuf.iter()).zip(abuf.iter()) {
                px.copy_from_slice(&[y, y, y, a]);
            }
            x += 4;
        }
    }
    // Fewer than 4 pixels remain: finish with the scalar kernel.
    if x < width {
        scalar::ya16_to_rgba_row::<BE>(
            &packed[x * 2..width * 2],
            &mut out[x * 4..width * 4],
            width - x,
        );
    }
}
/// AVX-512 backend entry point for the ya16 → RGB (`u16`) row conversion.
///
/// Forwards every argument, together with the `BE` const parameter, to
/// `scalar::ya16::ya16_to_rgb_u16_row`; no vector code runs here.
///
/// `packed` is expected to hold at least `width * 2` elements and `out` at
/// least `width * 3` elements; both are checked in debug builds only.
///
/// # Safety
///
/// The function is compiled with `avx512f,avx512bw` enabled, so the caller
/// must ensure the executing CPU supports both features.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn ya16_to_rgb_u16_row<const BE: bool>(
packed: &[u16],
out: &mut [u16],
width: usize,
) {
use crate::row::scalar::ya16 as scalar;
debug_assert!(packed.len() >= width * 2);
debug_assert!(out.len() >= width * 3);
// No dedicated AVX-512 path: the scalar kernel does all the work.
scalar::ya16_to_rgb_u16_row::<BE>(packed, out, width);
}
/// AVX-512 backend entry point for the ya16 → RGBA (`u16`) row conversion.
///
/// Forwards every argument, together with the `BE` const parameter, to
/// `scalar::ya16::ya16_to_rgba_u16_row`; no vector code runs here.
///
/// `packed` is expected to hold at least `width * 2` elements and `out` at
/// least `width * 4` elements; both are checked in debug builds only.
///
/// # Safety
///
/// The function is compiled with `avx512f,avx512bw` enabled, so the caller
/// must ensure the executing CPU supports both features.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn ya16_to_rgba_u16_row<const BE: bool>(
packed: &[u16],
out: &mut [u16],
width: usize,
) {
use crate::row::scalar::ya16 as scalar;
debug_assert!(packed.len() >= width * 2);
debug_assert!(out.len() >= width * 4);
// No dedicated AVX-512 path: the scalar kernel does all the work.
scalar::ya16_to_rgba_u16_row::<BE>(packed, out, width);
}
/// Extracts 8-bit luma from a row of interleaved two-`u16` pixels.
///
/// When `BE` differs from `HOST_NATIVE_BE` the whole row is delegated to
/// `scalar::ya16_to_luma_row`. Otherwise four pixels (16 input bytes) are
/// handled per vector iteration: a byte shuffle gathers the first `u16` of
/// every pair, each is shifted right by 8 and packed to a byte, and the four
/// bytes are copied to `out`. The remainder goes to the scalar kernel.
///
/// # Panics
///
/// Panics if `out` is shorter than the range being written.
///
/// # Safety
///
/// * The executing CPU must support `avx512f` and `avx512bw`.
/// * `packed` must hold at least `width * 2` elements: it is read through a
///   raw pointer and its length is only debug-asserted.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn ya16_to_luma_row<const BE: bool>(
    packed: &[u16],
    out: &mut [u8],
    width: usize,
) {
    use crate::row::scalar::ya16 as scalar;
    const PIXELS: usize = 4;

    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width);

    // The vector path reads samples in host byte order only.
    if BE != HOST_NATIVE_BE {
        return scalar::ya16_to_luma_row::<BE>(packed, out, width);
    }

    // First pixel index that no longer fits a whole 4-pixel vector.
    let vector_end = width - width % PIXELS;
    let mut col = 0usize;
    // SAFETY: `col + PIXELS <= vector_end <= width` inside the loop and the
    // caller guarantees `packed.len() >= width * 2`, so the 16-byte load
    // starting at element `col * 2` is in bounds.
    unsafe {
        // Gathers the first u16 of each pair (bytes 0-1, 4-5, 8-9, 12-13)
        // into the low four 16-bit lanes; -128 zeroes a lane.
        let pick_first = _mm_set_epi8(
            -128, -128, -128, -128, -128, -128, -128, -128, 13, 12, 9, 8, 5, 4, 1, 0,
        );
        while col < vector_end {
            let pairs = _mm_loadu_si128(packed.as_ptr().add(col * 2).cast::<__m128i>());
            let high_bytes = _mm_srli_epi16(_mm_shuffle_epi8(pairs, pick_first), 8);
            let narrowed = _mm_packus_epi16(high_bytes, _mm_setzero_si128());
            let quad = _mm_cvtsi128_si32(narrowed) as u32;
            out[col..col + PIXELS].copy_from_slice(&quad.to_le_bytes());
            col += PIXELS;
        }
    }
    if col < width {
        scalar::ya16_to_luma_row::<BE>(
            &packed[col * 2..width * 2],
            &mut out[col..width],
            width - col,
        );
    }
}
/// AVX-512 backend entry point for the ya16 → luma (`u16`) row conversion.
///
/// Forwards every argument, together with the `BE` const parameter, to
/// `scalar::ya16::ya16_to_luma_u16_row`; no vector code runs here.
///
/// `packed` is expected to hold at least `width * 2` elements and `out` at
/// least `width` elements; both are checked in debug builds only.
///
/// # Safety
///
/// The function is compiled with `avx512f,avx512bw` enabled, so the caller
/// must ensure the executing CPU supports both features.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn ya16_to_luma_u16_row<const BE: bool>(
packed: &[u16],
out: &mut [u16],
width: usize,
) {
use crate::row::scalar::ya16 as scalar;
debug_assert!(packed.len() >= width * 2);
debug_assert!(out.len() >= width);
// No dedicated AVX-512 path: the scalar kernel does all the work.
scalar::ya16_to_luma_u16_row::<BE>(packed, out, width);
}
/// Converts a row of interleaved two-`u16` pixels into planar 8-bit H, S
/// and V rows.
///
/// When `BE` differs from `HOST_NATIVE_BE` the whole row is delegated to
/// `scalar::ya16_to_hsv_row`. Otherwise four pixels (16 input bytes) are
/// handled per vector iteration: `h_out` and `s_out` receive zero, and
/// `v_out` receives the first `u16` of every pair shifted right by 8 and
/// packed to a byte. The second `u16` of every pair is ignored. Leftover
/// pixels go to the scalar kernel.
///
/// # Panics
///
/// Panics if any output plane is shorter than the range being written.
///
/// # Safety
///
/// * The executing CPU must support `avx512f` and `avx512bw`.
/// * `packed` must hold at least `width * 2` elements: it is read through a
///   raw pointer and its length is only debug-asserted.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn ya16_to_hsv_row<const BE: bool>(
    packed: &[u16],
    h_out: &mut [u8],
    s_out: &mut [u8],
    v_out: &mut [u8],
    width: usize,
) {
    use crate::row::scalar::ya16 as scalar;
    debug_assert!(packed.len() >= width * 2);
    // Validate the output planes up front, like the other kernels in this
    // file do for `out`.
    debug_assert!(h_out.len() >= width);
    debug_assert!(s_out.len() >= width);
    debug_assert!(v_out.len() >= width);
    // The vector path reads samples in host byte order only.
    if BE != HOST_NATIVE_BE {
        return scalar::ya16_to_hsv_row::<BE>(packed, h_out, s_out, v_out, width);
    }
    let mut x = 0usize;
    // SAFETY: the loop condition keeps `x + 4 <= width` and the caller
    // guarantees `packed.len() >= width * 2`, so the 16-byte load starting at
    // element `x * 2` is in bounds. All writes go through checked slice
    // indexing.
    unsafe {
        // Gathers the first u16 of each pair (bytes 0-1, 4-5, 8-9, 12-13)
        // into the low four 16-bit lanes; -128 zeroes a lane.
        let y_mask = _mm_set_epi8(
            -128, -128, -128, -128, -128, -128, -128, -128, 13, 12, 9, 8, 5, 4, 1, 0,
        );
        while x + 4 <= width {
            let chunk = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast::<__m128i>());
            let y_words = _mm_shuffle_epi8(chunk, y_mask);
            let y_shifted = _mm_srli_epi16(y_words, 8);
            let pack8 = _mm_packus_epi16(y_shifted, _mm_setzero_si128());
            let val = _mm_cvtsi128_si32(pack8) as u32;
            let vbytes = val.to_le_bytes();
            h_out[x..x + 4].fill(0);
            s_out[x..x + 4].fill(0);
            v_out[x..x + 4].copy_from_slice(&vbytes);
            x += 4;
        }
    }
    // Fewer than 4 pixels remain: finish with the scalar kernel.
    if x < width {
        scalar::ya16_to_hsv_row::<BE>(
            &packed[x * 2..width * 2],
            &mut h_out[x..width],
            &mut s_out[x..width],
            &mut v_out[x..width],
            width - x,
        );
    }
}