#![cfg_attr(not(feature = "std"), allow(dead_code))]
use core::arch::x86_64::*;
use crate::row::{
arch::x86_avx2::endian::{load_endian_u16x16, load_endian_u32x8},
scalar::{bits_mask, gray as scalar},
};
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn gray8_to_rgb_row(
y_plane: &[u8],
out: &mut [u8],
width: usize,
full_range: bool,
) {
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 3);
scalar::gray8_to_rgb_row(y_plane, out, width, full_range);
}
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn gray8_to_rgba_row(
y_plane: &[u8],
out: &mut [u8],
width: usize,
full_range: bool,
) {
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 4);
scalar::gray8_to_rgba_row(y_plane, out, width, full_range);
}
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn gray8_to_hsv_row(
y_plane: &[u8],
h_out: &mut [u8],
s_out: &mut [u8],
v_out: &mut [u8],
width: usize,
full_range: bool,
) {
debug_assert!(y_plane.len() >= width);
if !full_range {
return scalar::gray8_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range);
}
let mut x = 0usize;
unsafe {
let zero = _mm256_setzero_si256();
while x + 32 <= width {
let v = _mm256_loadu_si256(y_plane.as_ptr().add(x).cast());
_mm256_storeu_si256(h_out.as_mut_ptr().add(x).cast(), zero);
_mm256_storeu_si256(s_out.as_mut_ptr().add(x).cast(), zero);
_mm256_storeu_si256(v_out.as_mut_ptr().add(x).cast(), v);
x += 32;
}
}
if x < width {
scalar::gray8_to_hsv_row(
&y_plane[x..width],
&mut h_out[x..width],
&mut s_out[x..width],
&mut v_out[x..width],
width - x,
true,
);
}
}
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn gray_n_to_rgb_row<const BITS: u32, const BE: bool>(
y_plane: &[u16],
out: &mut [u8],
width: usize,
full_range: bool,
) {
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 3);
scalar::gray_n_to_rgb_row::<BITS, BE>(y_plane, out, width, full_range);
}
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn gray_n_to_rgba_row<const BITS: u32, const BE: bool>(
y_plane: &[u16],
out: &mut [u8],
width: usize,
full_range: bool,
) {
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 4);
scalar::gray_n_to_rgba_row::<BITS, BE>(y_plane, out, width, full_range);
}
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn gray_n_to_rgb_u16_row<const BITS: u32, const BE: bool>(
y_plane: &[u16],
out: &mut [u16],
width: usize,
full_range: bool,
) {
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 3);
scalar::gray_n_to_rgb_u16_row::<BITS, BE>(y_plane, out, width, full_range);
}
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn gray_n_to_rgba_u16_row<const BITS: u32, const BE: bool>(
y_plane: &[u16],
out: &mut [u16],
width: usize,
full_range: bool,
) {
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 4);
scalar::gray_n_to_rgba_u16_row::<BITS, BE>(y_plane, out, width, full_range);
}
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn gray_n_to_luma_row<const BITS: u32, const BE: bool>(
y_plane: &[u16],
out: &mut [u8],
width: usize,
) {
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width);
let mask = bits_mask::<BITS>();
let mut x = 0usize;
unsafe {
let mask_v = _mm256_set1_epi16(mask as i16);
let shr = _mm_cvtsi32_si128((BITS - 8) as i32);
while x + 16 <= width {
let raw = load_endian_u16x16::<BE>(y_plane.as_ptr().cast::<u8>().add(x * 2));
let masked = _mm256_and_si256(raw, mask_v);
let shifted = _mm256_srl_epi16(masked, shr);
let zero = _mm256_setzero_si256();
let packed = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi16(shifted, zero));
let lo = _mm256_castsi256_si128(packed);
_mm_storeu_si128(out.as_mut_ptr().add(x).cast(), lo);
x += 16;
}
}
if x < width {
scalar::gray_n_to_luma_row::<BITS, BE>(&y_plane[x..width], &mut out[x..width], width - x);
}
}
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn gray_n_to_luma_u16_row<const BITS: u32, const BE: bool>(
y_plane: &[u16],
out: &mut [u16],
width: usize,
) {
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width);
let mask = bits_mask::<BITS>();
let mut x = 0usize;
unsafe {
let mask_v = _mm256_set1_epi16(mask as i16);
while x + 16 <= width {
let raw = load_endian_u16x16::<BE>(y_plane.as_ptr().cast::<u8>().add(x * 2));
let masked = _mm256_and_si256(raw, mask_v);
_mm256_storeu_si256(out.as_mut_ptr().add(x).cast(), masked);
x += 16;
}
}
if x < width {
scalar::gray_n_to_luma_u16_row::<BITS, BE>(&y_plane[x..width], &mut out[x..width], width - x);
}
}
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn gray_n_to_hsv_row<const BITS: u32, const BE: bool>(
y_plane: &[u16],
h_out: &mut [u8],
s_out: &mut [u8],
v_out: &mut [u8],
width: usize,
full_range: bool,
) {
debug_assert!(y_plane.len() >= width);
if !full_range {
return scalar::gray_n_to_hsv_row::<BITS, BE>(y_plane, h_out, s_out, v_out, width, full_range);
}
let mask = bits_mask::<BITS>();
let mut x = 0usize;
unsafe {
let mask_v = _mm256_set1_epi16(mask as i16);
let shr = _mm_cvtsi32_si128((BITS - 8) as i32);
let zero256 = _mm256_setzero_si256();
while x + 16 <= width {
let raw = load_endian_u16x16::<BE>(y_plane.as_ptr().cast::<u8>().add(x * 2));
let masked = _mm256_and_si256(raw, mask_v);
let shifted = _mm256_srl_epi16(masked, shr);
let packed = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi16(shifted, zero256));
let lo = _mm256_castsi256_si128(packed);
_mm_storeu_si128(
h_out.as_mut_ptr().add(x).cast(),
_mm256_castsi256_si128(zero256),
);
_mm_storeu_si128(
s_out.as_mut_ptr().add(x).cast(),
_mm256_castsi256_si128(zero256),
);
_mm_storeu_si128(v_out.as_mut_ptr().add(x).cast(), lo);
x += 16;
}
}
if x < width {
scalar::gray_n_to_hsv_row::<BITS, BE>(
&y_plane[x..width],
&mut h_out[x..width],
&mut s_out[x..width],
&mut v_out[x..width],
width - x,
true,
);
}
}
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn gray16_to_rgb_row<const BE: bool>(
y_plane: &[u16],
out: &mut [u8],
width: usize,
full_range: bool,
) {
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 3);
scalar::gray16_to_rgb_row::<BE>(y_plane, out, width, full_range);
}
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn gray16_to_rgba_row<const BE: bool>(
y_plane: &[u16],
out: &mut [u8],
width: usize,
full_range: bool,
) {
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 4);
scalar::gray16_to_rgba_row::<BE>(y_plane, out, width, full_range);
}
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn gray16_to_rgb_u16_row<const BE: bool>(
y_plane: &[u16],
out: &mut [u16],
width: usize,
full_range: bool,
) {
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 3);
scalar::gray16_to_rgb_u16_row::<BE>(y_plane, out, width, full_range);
}
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn gray16_to_rgba_u16_row<const BE: bool>(
y_plane: &[u16],
out: &mut [u16],
width: usize,
full_range: bool,
) {
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 4);
scalar::gray16_to_rgba_u16_row::<BE>(y_plane, out, width, full_range);
}
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn gray16_to_luma_row<const BE: bool>(
y_plane: &[u16],
out: &mut [u8],
width: usize,
) {
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width);
let mut x = 0usize;
unsafe {
let zero = _mm256_setzero_si256();
while x + 16 <= width {
let raw = load_endian_u16x16::<BE>(y_plane.as_ptr().cast::<u8>().add(x * 2));
let shifted = _mm256_srli_epi16(raw, 8);
let packed = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi16(shifted, zero));
let lo = _mm256_castsi256_si128(packed);
_mm_storeu_si128(out.as_mut_ptr().add(x).cast(), lo);
x += 16;
}
}
if x < width {
scalar::gray16_to_luma_row::<BE>(&y_plane[x..width], &mut out[x..width], width - x);
}
}
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn gray16_to_luma_u16_row<const BE: bool>(
y_plane: &[u16],
out: &mut [u16],
width: usize,
) {
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width);
let mut x = 0usize;
unsafe {
while x + 16 <= width {
let y = load_endian_u16x16::<BE>(y_plane.as_ptr().cast::<u8>().add(x * 2));
_mm256_storeu_si256(out.as_mut_ptr().add(x).cast(), y);
x += 16;
}
}
if x < width {
scalar::gray16_to_luma_u16_row::<BE>(&y_plane[x..width], &mut out[x..width], width - x);
}
}
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn gray16_to_hsv_row<const BE: bool>(
y_plane: &[u16],
h_out: &mut [u8],
s_out: &mut [u8],
v_out: &mut [u8],
width: usize,
full_range: bool,
) {
debug_assert!(y_plane.len() >= width);
if !full_range {
return scalar::gray16_to_hsv_row::<BE>(y_plane, h_out, s_out, v_out, width, full_range);
}
let mut x = 0usize;
unsafe {
let zero = _mm256_setzero_si256();
while x + 16 <= width {
let raw = load_endian_u16x16::<BE>(y_plane.as_ptr().cast::<u8>().add(x * 2));
let shifted = _mm256_srli_epi16(raw, 8);
let packed = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi16(shifted, zero));
let lo = _mm256_castsi256_si128(packed);
_mm_storeu_si128(
h_out.as_mut_ptr().add(x).cast(),
_mm256_castsi256_si128(zero),
);
_mm_storeu_si128(
s_out.as_mut_ptr().add(x).cast(),
_mm256_castsi256_si128(zero),
);
_mm_storeu_si128(v_out.as_mut_ptr().add(x).cast(), lo);
x += 16;
}
}
if x < width {
scalar::gray16_to_hsv_row::<BE>(
&y_plane[x..width],
&mut h_out[x..width],
&mut s_out[x..width],
&mut v_out[x..width],
width - x,
true,
);
}
}
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn grayf32_to_rgb_row<const BE: bool>(
y_plane: &[f32],
out: &mut [u8],
width: usize,
) {
use crate::row::scalar::grayf32 as scalar;
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 3);
let scale = _mm256_set1_ps(255.0);
let zero = _mm256_setzero_ps();
let one = _mm256_set1_ps(1.0);
let mut x = 0usize;
unsafe {
while x + 8 <= width {
let y = _mm256_castsi256_ps(load_endian_u32x8::<BE>(
y_plane.as_ptr().cast::<u8>().add(x * 4),
));
let clamped = _mm256_min_ps(_mm256_max_ps(y, zero), one);
let scaled = _mm256_mul_ps(clamped, scale);
let int32 = _mm256_cvttps_epi32(_mm256_add_ps(scaled, _mm256_set1_ps(0.5)));
let lo = _mm256_castsi256_si128(int32);
let hi = _mm256_extracti128_si256::<1>(int32);
let pack16 = _mm_packs_epi32(lo, hi); let pack8 = _mm_packus_epi16(pack16, pack16); let val = _mm_cvtsi128_si64(pack8) as u64;
let ybuf = val.to_le_bytes();
let base = x * 3;
for i in 0..8usize {
out[base + i * 3] = ybuf[i];
out[base + i * 3 + 1] = ybuf[i];
out[base + i * 3 + 2] = ybuf[i];
}
x += 8;
}
}
if x < width {
scalar::grayf32_to_rgb_row::<BE>(&y_plane[x..width], &mut out[x * 3..width * 3], width - x);
}
}
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn grayf32_to_rgba_row<const BE: bool>(
y_plane: &[f32],
out: &mut [u8],
width: usize,
) {
use crate::row::scalar::grayf32 as scalar;
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 4);
let scale = _mm256_set1_ps(255.0);
let zero = _mm256_setzero_ps();
let one = _mm256_set1_ps(1.0);
let mut x = 0usize;
unsafe {
while x + 8 <= width {
let y = _mm256_castsi256_ps(load_endian_u32x8::<BE>(
y_plane.as_ptr().cast::<u8>().add(x * 4),
));
let clamped = _mm256_min_ps(_mm256_max_ps(y, zero), one);
let scaled = _mm256_mul_ps(clamped, scale);
let int32 = _mm256_cvttps_epi32(_mm256_add_ps(scaled, _mm256_set1_ps(0.5)));
let lo = _mm256_castsi256_si128(int32);
let hi = _mm256_extracti128_si256::<1>(int32);
let pack16 = _mm_packs_epi32(lo, hi);
let pack8 = _mm_packus_epi16(pack16, pack16);
let val = _mm_cvtsi128_si64(pack8) as u64;
let ybuf = val.to_le_bytes();
let base = x * 4;
for i in 0..8usize {
out[base + i * 4] = ybuf[i];
out[base + i * 4 + 1] = ybuf[i];
out[base + i * 4 + 2] = ybuf[i];
out[base + i * 4 + 3] = 0xFF;
}
x += 8;
}
}
if x < width {
scalar::grayf32_to_rgba_row::<BE>(&y_plane[x..width], &mut out[x * 4..width * 4], width - x);
}
}
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn grayf32_to_rgb_u16_row<const BE: bool>(
y_plane: &[f32],
out: &mut [u16],
width: usize,
) {
use crate::row::scalar::grayf32 as scalar;
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 3);
let scale = _mm256_set1_ps(65535.0);
let zero = _mm256_setzero_ps();
let one = _mm256_set1_ps(1.0);
let mut x = 0usize;
unsafe {
while x + 8 <= width {
let y = _mm256_castsi256_ps(load_endian_u32x8::<BE>(
y_plane.as_ptr().cast::<u8>().add(x * 4),
));
let clamped = _mm256_min_ps(_mm256_max_ps(y, zero), one);
let scaled = _mm256_mul_ps(clamped, scale);
let int32 = _mm256_cvttps_epi32(_mm256_add_ps(scaled, _mm256_set1_ps(0.5)));
let lo = _mm256_castsi256_si128(int32);
let hi = _mm256_extracti128_si256::<1>(int32);
let pack16 = _mm_packus_epi32(lo, hi); let base = x * 3;
let v0 = _mm_extract_epi16::<0>(pack16) as u16;
let v1 = _mm_extract_epi16::<1>(pack16) as u16;
let v2 = _mm_extract_epi16::<2>(pack16) as u16;
let v3 = _mm_extract_epi16::<3>(pack16) as u16;
let v4 = _mm_extract_epi16::<4>(pack16) as u16;
let v5 = _mm_extract_epi16::<5>(pack16) as u16;
let v6 = _mm_extract_epi16::<6>(pack16) as u16;
let v7 = _mm_extract_epi16::<7>(pack16) as u16;
out[base] = v0;
out[base + 1] = v0;
out[base + 2] = v0;
out[base + 3] = v1;
out[base + 4] = v1;
out[base + 5] = v1;
out[base + 6] = v2;
out[base + 7] = v2;
out[base + 8] = v2;
out[base + 9] = v3;
out[base + 10] = v3;
out[base + 11] = v3;
out[base + 12] = v4;
out[base + 13] = v4;
out[base + 14] = v4;
out[base + 15] = v5;
out[base + 16] = v5;
out[base + 17] = v5;
out[base + 18] = v6;
out[base + 19] = v6;
out[base + 20] = v6;
out[base + 21] = v7;
out[base + 22] = v7;
out[base + 23] = v7;
x += 8;
}
}
if x < width {
scalar::grayf32_to_rgb_u16_row::<BE>(&y_plane[x..width], &mut out[x * 3..width * 3], width - x);
}
}
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn grayf32_to_rgba_u16_row<const BE: bool>(
y_plane: &[f32],
out: &mut [u16],
width: usize,
) {
use crate::row::scalar::grayf32 as scalar;
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 4);
let scale = _mm256_set1_ps(65535.0);
let zero = _mm256_setzero_ps();
let one = _mm256_set1_ps(1.0);
let mut x = 0usize;
unsafe {
while x + 8 <= width {
let y = _mm256_castsi256_ps(load_endian_u32x8::<BE>(
y_plane.as_ptr().cast::<u8>().add(x * 4),
));
let clamped = _mm256_min_ps(_mm256_max_ps(y, zero), one);
let scaled = _mm256_mul_ps(clamped, scale);
let int32 = _mm256_cvttps_epi32(_mm256_add_ps(scaled, _mm256_set1_ps(0.5)));
let lo = _mm256_castsi256_si128(int32);
let hi = _mm256_extracti128_si256::<1>(int32);
let pack16 = _mm_packus_epi32(lo, hi);
let base = x * 4;
let v0 = _mm_extract_epi16::<0>(pack16) as u16;
let v1 = _mm_extract_epi16::<1>(pack16) as u16;
let v2 = _mm_extract_epi16::<2>(pack16) as u16;
let v3 = _mm_extract_epi16::<3>(pack16) as u16;
let v4 = _mm_extract_epi16::<4>(pack16) as u16;
let v5 = _mm_extract_epi16::<5>(pack16) as u16;
let v6 = _mm_extract_epi16::<6>(pack16) as u16;
let v7 = _mm_extract_epi16::<7>(pack16) as u16;
out[base] = v0;
out[base + 1] = v0;
out[base + 2] = v0;
out[base + 3] = 0xFFFF;
out[base + 4] = v1;
out[base + 5] = v1;
out[base + 6] = v1;
out[base + 7] = 0xFFFF;
out[base + 8] = v2;
out[base + 9] = v2;
out[base + 10] = v2;
out[base + 11] = 0xFFFF;
out[base + 12] = v3;
out[base + 13] = v3;
out[base + 14] = v3;
out[base + 15] = 0xFFFF;
out[base + 16] = v4;
out[base + 17] = v4;
out[base + 18] = v4;
out[base + 19] = 0xFFFF;
out[base + 20] = v5;
out[base + 21] = v5;
out[base + 22] = v5;
out[base + 23] = 0xFFFF;
out[base + 24] = v6;
out[base + 25] = v6;
out[base + 26] = v6;
out[base + 27] = 0xFFFF;
out[base + 28] = v7;
out[base + 29] = v7;
out[base + 30] = v7;
out[base + 31] = 0xFFFF;
x += 8;
}
}
if x < width {
scalar::grayf32_to_rgba_u16_row::<BE>(
&y_plane[x..width],
&mut out[x * 4..width * 4],
width - x,
);
}
}
#[allow(dead_code)] #[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn grayf32_to_rgb_f32_row<const BE: bool>(
y_plane: &[f32],
out: &mut [f32],
width: usize,
) {
use crate::row::scalar::grayf32 as scalar;
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width * 3);
scalar::grayf32_to_rgb_f32_row::<BE>(y_plane, out, width);
}
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn grayf32_to_luma_row<const BE: bool>(
y_plane: &[f32],
out: &mut [u8],
width: usize,
) {
use crate::row::scalar::grayf32 as scalar;
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width);
let scale = _mm256_set1_ps(255.0);
let zero = _mm256_setzero_ps();
let one = _mm256_set1_ps(1.0);
let mut x = 0usize;
unsafe {
while x + 8 <= width {
let y = _mm256_castsi256_ps(load_endian_u32x8::<BE>(
y_plane.as_ptr().cast::<u8>().add(x * 4),
));
let clamped = _mm256_min_ps(_mm256_max_ps(y, zero), one);
let scaled = _mm256_mul_ps(clamped, scale);
let int32 = _mm256_cvttps_epi32(_mm256_add_ps(scaled, _mm256_set1_ps(0.5)));
let lo = _mm256_castsi256_si128(int32);
let hi = _mm256_extracti128_si256::<1>(int32);
let pack16 = _mm_packs_epi32(lo, hi);
let pack8 = _mm_packus_epi16(pack16, pack16);
let val = _mm_cvtsi128_si64(pack8) as u64;
out[x..x + 8].copy_from_slice(&val.to_le_bytes());
x += 8;
}
}
if x < width {
scalar::grayf32_to_luma_row::<BE>(&y_plane[x..width], &mut out[x..width], width - x);
}
}
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn grayf32_to_luma_u16_row<const BE: bool>(
y_plane: &[f32],
out: &mut [u16],
width: usize,
) {
use crate::row::scalar::grayf32 as scalar;
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width);
let scale = _mm256_set1_ps(65535.0);
let zero = _mm256_setzero_ps();
let one = _mm256_set1_ps(1.0);
let mut x = 0usize;
unsafe {
while x + 8 <= width {
let y = _mm256_castsi256_ps(load_endian_u32x8::<BE>(
y_plane.as_ptr().cast::<u8>().add(x * 4),
));
let clamped = _mm256_min_ps(_mm256_max_ps(y, zero), one);
let scaled = _mm256_mul_ps(clamped, scale);
let int32 = _mm256_cvttps_epi32(_mm256_add_ps(scaled, _mm256_set1_ps(0.5)));
let lo = _mm256_castsi256_si128(int32);
let hi = _mm256_extracti128_si256::<1>(int32);
let pack16 = _mm_packus_epi32(lo, hi); _mm_storeu_si128(out.as_mut_ptr().add(x).cast::<__m128i>(), pack16);
x += 8;
}
}
if x < width {
scalar::grayf32_to_luma_u16_row::<BE>(&y_plane[x..width], &mut out[x..width], width - x);
}
}
#[allow(dead_code)] #[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn grayf32_to_luma_f32_row<const BE: bool>(
y_plane: &[f32],
out: &mut [f32],
width: usize,
) {
use crate::row::scalar::grayf32 as scalar;
debug_assert!(y_plane.len() >= width);
debug_assert!(out.len() >= width);
scalar::grayf32_to_luma_f32_row::<BE>(y_plane, out, width);
}
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn grayf32_to_hsv_row<const BE: bool>(
y_plane: &[f32],
h_out: &mut [u8],
s_out: &mut [u8],
v_out: &mut [u8],
width: usize,
) {
use crate::row::scalar::grayf32 as scalar;
debug_assert!(y_plane.len() >= width);
let scale = _mm256_set1_ps(255.0);
let zero = _mm256_setzero_ps();
let one = _mm256_set1_ps(1.0);
let mut x = 0usize;
unsafe {
while x + 8 <= width {
let y = _mm256_castsi256_ps(load_endian_u32x8::<BE>(
y_plane.as_ptr().cast::<u8>().add(x * 4),
));
let clamped = _mm256_min_ps(_mm256_max_ps(y, zero), one);
let scaled = _mm256_mul_ps(clamped, scale);
let int32 = _mm256_cvttps_epi32(_mm256_add_ps(scaled, _mm256_set1_ps(0.5)));
let lo = _mm256_castsi256_si128(int32);
let hi = _mm256_extracti128_si256::<1>(int32);
let pack16 = _mm_packs_epi32(lo, hi);
let pack8 = _mm_packus_epi16(pack16, pack16);
let val = _mm_cvtsi128_si64(pack8) as u64;
let vbytes = val.to_le_bytes();
h_out[x..x + 8].fill(0);
s_out[x..x + 8].fill(0);
v_out[x..x + 8].copy_from_slice(&vbytes);
x += 8;
}
}
if x < width {
scalar::grayf32_to_hsv_row::<BE>(
&y_plane[x..width],
&mut h_out[x..width],
&mut s_out[x..width],
&mut v_out[x..width],
width - x,
);
}
}
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn ya8_to_rgb_row(packed: &[u8], out: &mut [u8], width: usize) {
use crate::row::scalar::ya8 as scalar;
debug_assert!(packed.len() >= width * 2);
debug_assert!(out.len() >= width * 3);
let mut x = 0usize;
unsafe {
let y_mask = _mm_set_epi8(
-128, -128, -128, -128, -128, -128, -128, -128, 14, 12, 10, 8, 6, 4, 2, 0,
);
while x + 8 <= width {
let chunk = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast());
let y_bytes = _mm_shuffle_epi8(chunk, y_mask);
let val = _mm_cvtsi128_si64(y_bytes) as u64;
let ybuf = val.to_le_bytes();
let base = x * 3;
for i in 0..8usize {
out[base + i * 3] = ybuf[i];
out[base + i * 3 + 1] = ybuf[i];
out[base + i * 3 + 2] = ybuf[i];
}
x += 8;
}
}
if x < width {
scalar::ya8_to_rgb_row(
&packed[x * 2..width * 2],
&mut out[x * 3..width * 3],
width - x,
);
}
}
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn ya8_to_rgba_row(packed: &[u8], out: &mut [u8], width: usize) {
use crate::row::scalar::ya8 as scalar;
debug_assert!(packed.len() >= width * 2);
debug_assert!(out.len() >= width * 4);
let mut x = 0usize;
unsafe {
let y_mask = _mm_set_epi8(
-128, -128, -128, -128, -128, -128, -128, -128, 14, 12, 10, 8, 6, 4, 2, 0,
);
let a_mask = _mm_set_epi8(
-128, -128, -128, -128, -128, -128, -128, -128, 15, 13, 11, 9, 7, 5, 3, 1,
);
while x + 8 <= width {
let chunk = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast());
let y_bytes = _mm_shuffle_epi8(chunk, y_mask);
let a_bytes = _mm_shuffle_epi8(chunk, a_mask);
let y_lo = _mm_cvtsi128_si64(y_bytes) as u64;
let a_lo = _mm_cvtsi128_si64(a_bytes) as u64;
let ybuf = y_lo.to_le_bytes();
let abuf = a_lo.to_le_bytes();
let base = x * 4;
for i in 0..8usize {
out[base + i * 4] = ybuf[i];
out[base + i * 4 + 1] = ybuf[i];
out[base + i * 4 + 2] = ybuf[i];
out[base + i * 4 + 3] = abuf[i];
}
x += 8;
}
}
if x < width {
scalar::ya8_to_rgba_row(
&packed[x * 2..width * 2],
&mut out[x * 4..width * 4],
width - x,
);
}
}
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn ya8_to_rgb_u16_row(packed: &[u8], out: &mut [u16], width: usize) {
use crate::row::scalar::ya8 as scalar;
debug_assert!(packed.len() >= width * 2);
debug_assert!(out.len() >= width * 3);
scalar::ya8_to_rgb_u16_row(packed, out, width);
}
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn ya8_to_rgba_u16_row(packed: &[u8], out: &mut [u16], width: usize) {
use crate::row::scalar::ya8 as scalar;
debug_assert!(packed.len() >= width * 2);
debug_assert!(out.len() >= width * 4);
scalar::ya8_to_rgba_u16_row(packed, out, width);
}
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn ya8_to_luma_row(packed: &[u8], out: &mut [u8], width: usize) {
use crate::row::scalar::ya8 as scalar;
debug_assert!(packed.len() >= width * 2);
debug_assert!(out.len() >= width);
let mut x = 0usize;
unsafe {
let y_mask = _mm_set_epi8(
-128, -128, -128, -128, -128, -128, -128, -128, 14, 12, 10, 8, 6, 4, 2, 0,
);
while x + 8 <= width {
let chunk = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast());
let y_bytes = _mm_shuffle_epi8(chunk, y_mask);
let val = _mm_cvtsi128_si64(y_bytes) as u64;
out[x..x + 8].copy_from_slice(&val.to_le_bytes());
x += 8;
}
}
if x < width {
scalar::ya8_to_luma_row(&packed[x * 2..width * 2], &mut out[x..width], width - x);
}
}
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn ya8_to_luma_u16_row(packed: &[u8], out: &mut [u16], width: usize) {
use crate::row::scalar::ya8 as scalar;
debug_assert!(packed.len() >= width * 2);
debug_assert!(out.len() >= width);
scalar::ya8_to_luma_u16_row(packed, out, width);
}
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn ya8_to_hsv_row(
packed: &[u8],
h_out: &mut [u8],
s_out: &mut [u8],
v_out: &mut [u8],
width: usize,
) {
use crate::row::scalar::ya8 as scalar;
debug_assert!(packed.len() >= width * 2);
let mut x = 0usize;
unsafe {
let y_mask = _mm_set_epi8(
-128, -128, -128, -128, -128, -128, -128, -128, 14, 12, 10, 8, 6, 4, 2, 0,
);
while x + 8 <= width {
let chunk = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast());
let y_bytes = _mm_shuffle_epi8(chunk, y_mask);
let val = _mm_cvtsi128_si64(y_bytes) as u64;
let vbytes = val.to_le_bytes();
h_out[x..x + 8].fill(0);
s_out[x..x + 8].fill(0);
v_out[x..x + 8].copy_from_slice(&vbytes);
x += 8;
}
}
if x < width {
scalar::ya8_to_hsv_row(
&packed[x * 2..width * 2],
&mut h_out[x..width],
&mut s_out[x..width],
&mut v_out[x..width],
width - x,
);
}
}
const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn ya16_to_rgb_row<const BE: bool>(packed: &[u16], out: &mut [u8], width: usize) {
use crate::row::scalar::ya16 as scalar;
debug_assert!(packed.len() >= width * 2);
debug_assert!(out.len() >= width * 3);
if BE != HOST_NATIVE_BE {
return scalar::ya16_to_rgb_row::<BE>(packed, out, width);
}
let mut x = 0usize;
unsafe {
let y_mask = _mm_set_epi8(
-128, -128, -128, -128, -128, -128, -128, -128, 13, 12, 9, 8, 5, 4, 1, 0,
);
while x + 4 <= width {
let chunk = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast::<__m128i>());
let y_words = _mm_shuffle_epi8(chunk, y_mask);
let y_shifted = _mm_srli_epi16(y_words, 8);
let pack8 = _mm_packus_epi16(y_shifted, _mm_setzero_si128());
let val = _mm_cvtsi128_si32(pack8) as u32;
let ybuf = val.to_le_bytes();
let base = x * 3;
for i in 0..4usize {
out[base + i * 3] = ybuf[i];
out[base + i * 3 + 1] = ybuf[i];
out[base + i * 3 + 2] = ybuf[i];
}
x += 4;
}
}
if x < width {
scalar::ya16_to_rgb_row::<BE>(
&packed[x * 2..width * 2],
&mut out[x * 3..width * 3],
width - x,
);
}
}
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn ya16_to_rgba_row<const BE: bool>(
packed: &[u16],
out: &mut [u8],
width: usize,
) {
use crate::row::scalar::ya16 as scalar;
debug_assert!(packed.len() >= width * 2);
debug_assert!(out.len() >= width * 4);
if BE != HOST_NATIVE_BE {
return scalar::ya16_to_rgba_row::<BE>(packed, out, width);
}
let mut x = 0usize;
unsafe {
let y_mask = _mm_set_epi8(
-128, -128, -128, -128, -128, -128, -128, -128, 13, 12, 9, 8, 5, 4, 1, 0,
);
let a_mask = _mm_set_epi8(
-128, -128, -128, -128, -128, -128, -128, -128, 15, 14, 11, 10, 7, 6, 3, 2,
);
while x + 4 <= width {
let chunk = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast::<__m128i>());
let y_words = _mm_shuffle_epi8(chunk, y_mask);
let a_words = _mm_shuffle_epi8(chunk, a_mask);
let y_shifted = _mm_srli_epi16(y_words, 8);
let a_shifted = _mm_srli_epi16(a_words, 8);
let zero = _mm_setzero_si128();
let y8 = _mm_packus_epi16(y_shifted, zero);
let a8 = _mm_packus_epi16(a_shifted, zero);
let yval = _mm_cvtsi128_si32(y8) as u32;
let aval = _mm_cvtsi128_si32(a8) as u32;
let ybuf = yval.to_le_bytes();
let abuf = aval.to_le_bytes();
let base = x * 4;
for i in 0..4usize {
out[base + i * 4] = ybuf[i];
out[base + i * 4 + 1] = ybuf[i];
out[base + i * 4 + 2] = ybuf[i];
out[base + i * 4 + 3] = abuf[i];
}
x += 4;
}
}
if x < width {
scalar::ya16_to_rgba_row::<BE>(
&packed[x * 2..width * 2],
&mut out[x * 4..width * 4],
width - x,
);
}
}
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn ya16_to_rgb_u16_row<const BE: bool>(
packed: &[u16],
out: &mut [u16],
width: usize,
) {
use crate::row::scalar::ya16 as scalar;
debug_assert!(packed.len() >= width * 2);
debug_assert!(out.len() >= width * 3);
scalar::ya16_to_rgb_u16_row::<BE>(packed, out, width);
}
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn ya16_to_rgba_u16_row<const BE: bool>(
packed: &[u16],
out: &mut [u16],
width: usize,
) {
use crate::row::scalar::ya16 as scalar;
debug_assert!(packed.len() >= width * 2);
debug_assert!(out.len() >= width * 4);
scalar::ya16_to_rgba_u16_row::<BE>(packed, out, width);
}
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn ya16_to_luma_row<const BE: bool>(
packed: &[u16],
out: &mut [u8],
width: usize,
) {
use crate::row::scalar::ya16 as scalar;
debug_assert!(packed.len() >= width * 2);
debug_assert!(out.len() >= width);
if BE != HOST_NATIVE_BE {
return scalar::ya16_to_luma_row::<BE>(packed, out, width);
}
let mut x = 0usize;
unsafe {
let y_mask = _mm_set_epi8(
-128, -128, -128, -128, -128, -128, -128, -128, 13, 12, 9, 8, 5, 4, 1, 0,
);
while x + 4 <= width {
let chunk = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast::<__m128i>());
let y_words = _mm_shuffle_epi8(chunk, y_mask);
let y_shifted = _mm_srli_epi16(y_words, 8);
let pack8 = _mm_packus_epi16(y_shifted, _mm_setzero_si128());
let val = _mm_cvtsi128_si32(pack8) as u32;
out[x..x + 4].copy_from_slice(&val.to_le_bytes());
x += 4;
}
}
if x < width {
scalar::ya16_to_luma_row::<BE>(&packed[x * 2..width * 2], &mut out[x..width], width - x);
}
}
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn ya16_to_luma_u16_row<const BE: bool>(
packed: &[u16],
out: &mut [u16],
width: usize,
) {
use crate::row::scalar::ya16 as scalar;
debug_assert!(packed.len() >= width * 2);
debug_assert!(out.len() >= width);
scalar::ya16_to_luma_u16_row::<BE>(packed, out, width);
}
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn ya16_to_hsv_row<const BE: bool>(
packed: &[u16],
h_out: &mut [u8],
s_out: &mut [u8],
v_out: &mut [u8],
width: usize,
) {
use crate::row::scalar::ya16 as scalar;
debug_assert!(packed.len() >= width * 2);
if BE != HOST_NATIVE_BE {
return scalar::ya16_to_hsv_row::<BE>(packed, h_out, s_out, v_out, width);
}
let mut x = 0usize;
unsafe {
let y_mask = _mm_set_epi8(
-128, -128, -128, -128, -128, -128, -128, -128, 13, 12, 9, 8, 5, 4, 1, 0,
);
while x + 4 <= width {
let chunk = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast::<__m128i>());
let y_words = _mm_shuffle_epi8(chunk, y_mask);
let y_shifted = _mm_srli_epi16(y_words, 8);
let pack8 = _mm_packus_epi16(y_shifted, _mm_setzero_si128());
let val = _mm_cvtsi128_si32(pack8) as u32;
let vbytes = val.to_le_bytes();
h_out[x..x + 4].fill(0);
s_out[x..x + 4].fill(0);
v_out[x..x + 4].copy_from_slice(&vbytes);
x += 4;
}
}
if x < width {
scalar::ya16_to_hsv_row::<BE>(
&packed[x * 2..width * 2],
&mut h_out[x..width],
&mut s_out[x..width],
&mut v_out[x..width],
width - x,
);
}
}