#![cfg_attr(not(feature = "std"), allow(dead_code))]
use core::arch::x86_64::*;
use crate::row::{
arch::x86_common::{write_rgb_16, write_rgb_u16_8, write_rgba_16, write_rgba_u16_8},
scalar::mono1bit as scalar,
};
/// Expands two packed 1-bit bytes into sixteen 8-bit lanes holding `0x00` or `0xFF`.
///
/// Lanes 0..=7 are taken from `b0` and lanes 8..=15 from `b1`; inside each
/// byte the most significant bit lands in the lowest lane.
///
/// * `INVERT == false`: a set bit yields `0xFF`, a clear bit yields `0x00`.
/// * `INVERT == true`: a clear bit yields `0xFF`, a set bit yields `0x00`.
///
/// # Safety
///
/// The executing CPU must support SSE4.1.
#[inline]
#[target_feature(enable = "sse4.1")]
unsafe fn unpack_2bytes_sse41<const INVERT: bool>(b0: u8, b1: u8) -> __m128i {
    // One selector bit per lane. In little-endian byte order this constant puts
    // 0x80 in lane 0 and 0x01 in lane 7, and the pattern repeats for lanes 8..=15.
    let bit_select = _mm_set1_epi64x(0x0102_0408_1020_4080);
    // Low 64 bits: eight copies of `b0`; high 64 bits: eight copies of `b1`.
    let source = _mm_unpacklo_epi64(_mm_set1_epi8(b0 as i8), _mm_set1_epi8(b1 as i8));
    let isolated = _mm_and_si128(source, bit_select);
    if INVERT {
        _mm_cmpeq_epi8(isolated, _mm_setzero_si128())
    } else {
        // Each selector carries exactly one bit, so "equals the selector" is the
        // exact complement of the "equals zero" comparison used above.
        _mm_cmpeq_epi8(isolated, bit_select)
    }
}
/// Zero-extends the eight low bytes of `y_low8` into eight 16-bit lanes.
///
/// The upper eight bytes of the input are ignored, so a `0xFF` byte becomes
/// `0x00FF` and a `0x00` byte stays `0x0000`.
///
/// # Safety
///
/// The executing CPU must support SSE4.1.
#[inline]
#[target_feature(enable = "sse4.1")]
unsafe fn expand_y_to_u16x8_sse41(y_low8: __m128i) -> __m128i {
    _mm_cvtepu8_epi16(y_low8)
}
/// Expands a packed 1-bit row into interleaved 8-bit RGB using SSE4.1.
///
/// Two source bytes (sixteen pixels) are unpacked per SIMD step and the same
/// luma vector is passed as R, G and B. Pixels left over after the last full
/// step are delegated to the scalar `monowhite_*` routine when `INVERT` is
/// `true`, otherwise to the scalar `monoblack_*` routine.
///
/// # Safety
///
/// * The executing CPU must support SSE4.1.
/// * `out` must hold at least `width * 3` bytes. The SIMD stores go through a
///   raw pointer and the length is only checked by a `debug_assert!`.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn mono1bit_to_rgb_row<const INVERT: bool>(
    data: &[u8],
    out: &mut [u8],
    width: usize,
) {
    debug_assert!(data.len() >= width.div_ceil(8));
    debug_assert!(out.len() >= width * 3);

    let simd_blocks = width / 16;
    // SAFETY: SSE4.1 is guaranteed by the caller. Each step starts at pixel
    // `16 * block` with `16 * block + 16 <= width`, so the destination stays
    // inside the `width * 3` bytes the caller promised (assuming `write_rgb_16`
    // stores exactly 16 RGB pixels).
    unsafe {
        for block in 0..simd_blocks {
            let src = 2 * block;
            let px = 16 * block;
            let luma = unpack_2bytes_sse41::<INVERT>(data[src], data[src + 1]);
            write_rgb_16(luma, luma, luma, out.as_mut_ptr().add(px * 3));
        }
    }

    let done = 16 * simd_blocks;
    if done >= width {
        return;
    }
    let tail_src = &data[2 * simd_blocks..];
    let tail_dst = &mut out[done * 3..width * 3];
    let remaining = width - done;
    if INVERT {
        scalar::monowhite_to_rgb_row(tail_src, tail_dst, remaining);
    } else {
        scalar::monoblack_to_rgb_row(tail_src, tail_dst, remaining);
    }
}
/// Expands a packed 1-bit row into interleaved 8-bit RGBA using SSE4.1.
///
/// Two source bytes (sixteen pixels) are unpacked per SIMD step; the same luma
/// vector is passed as R, G and B, and the alpha vector is all `0xFF`. Pixels
/// left over after the last full step are delegated to the scalar
/// `monowhite_*` routine when `INVERT` is `true`, otherwise to `monoblack_*`.
///
/// # Safety
///
/// * The executing CPU must support SSE4.1.
/// * `out` must hold at least `width * 4` bytes. The SIMD stores go through a
///   raw pointer and the length is only checked by a `debug_assert!`.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn mono1bit_to_rgba_row<const INVERT: bool>(
    data: &[u8],
    out: &mut [u8],
    width: usize,
) {
    debug_assert!(data.len() >= width.div_ceil(8));
    debug_assert!(out.len() >= width * 4);

    let simd_blocks = width / 16;
    // SAFETY: SSE4.1 is guaranteed by the caller. Each step starts at pixel
    // `16 * block` with `16 * block + 16 <= width`, so the destination stays
    // inside the `width * 4` bytes the caller promised (assuming
    // `write_rgba_16` stores exactly 16 RGBA pixels).
    unsafe {
        let opaque = _mm_set1_epi8(-1i8);
        for block in 0..simd_blocks {
            let src = 2 * block;
            let px = 16 * block;
            let luma = unpack_2bytes_sse41::<INVERT>(data[src], data[src + 1]);
            write_rgba_16(luma, luma, luma, opaque, out.as_mut_ptr().add(px * 4));
        }
    }

    let done = 16 * simd_blocks;
    if done >= width {
        return;
    }
    let tail_src = &data[2 * simd_blocks..];
    let tail_dst = &mut out[done * 4..width * 4];
    let remaining = width - done;
    if INVERT {
        scalar::monowhite_to_rgba_row(tail_src, tail_dst, remaining);
    } else {
        scalar::monoblack_to_rgba_row(tail_src, tail_dst, remaining);
    }
}
/// Expands a packed 1-bit row into one 8-bit luma sample per pixel using SSE4.1.
///
/// Two source bytes (sixteen pixels) are unpacked per SIMD step and stored
/// directly as sixteen `0x00` / `0xFF` bytes. Pixels left over after the last
/// full step are delegated to the scalar `monowhite_*` routine when `INVERT`
/// is `true`, otherwise to `monoblack_*`.
///
/// # Safety
///
/// * The executing CPU must support SSE4.1.
/// * `out` must hold at least `width` bytes. The SIMD stores go through a raw
///   pointer and the length is only checked by a `debug_assert!`.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn mono1bit_to_luma_row<const INVERT: bool>(
    data: &[u8],
    out: &mut [u8],
    width: usize,
) {
    debug_assert!(data.len() >= width.div_ceil(8));
    debug_assert!(out.len() >= width);

    let simd_blocks = width / 16;
    // SAFETY: SSE4.1 is guaranteed by the caller. Each unaligned 16-byte store
    // begins at `16 * block` with `16 * block + 16 <= width`, which is within
    // the `width` bytes the caller promised.
    unsafe {
        for block in 0..simd_blocks {
            let src = 2 * block;
            let px = 16 * block;
            let luma = unpack_2bytes_sse41::<INVERT>(data[src], data[src + 1]);
            _mm_storeu_si128(out.as_mut_ptr().add(px).cast(), luma);
        }
    }

    let done = 16 * simd_blocks;
    if done >= width {
        return;
    }
    let tail_src = &data[2 * simd_blocks..];
    let tail_dst = &mut out[done..width];
    let remaining = width - done;
    if INVERT {
        scalar::monowhite_to_luma_row(tail_src, tail_dst, remaining);
    } else {
        scalar::monoblack_to_luma_row(tail_src, tail_dst, remaining);
    }
}
/// Expands a packed 1-bit row into interleaved RGB stored as `u16` samples using SSE4.1.
///
/// One source byte (eight pixels) is unpacked per SIMD step and zero-extended
/// to 16 bits, so SIMD-produced samples are `0x0000` or `0x00FF`. The same
/// vector is passed as R, G and B. Pixels left over after the last full step
/// are delegated to the scalar `monowhite_*` routine when `INVERT` is `true`,
/// otherwise to `monoblack_*`.
///
/// # Safety
///
/// * The executing CPU must support SSE4.1.
/// * `out` must hold at least `width * 3` elements. The SIMD stores go through
///   a raw pointer and the length is only checked by a `debug_assert!`.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn mono1bit_to_rgb_u16_row<const INVERT: bool>(
    data: &[u8],
    out: &mut [u16],
    width: usize,
) {
    debug_assert!(data.len() >= width.div_ceil(8));
    debug_assert!(out.len() >= width * 3);

    let simd_blocks = width / 8;
    // SAFETY: SSE4.1 is guaranteed by the caller. Each step starts at pixel
    // `8 * block` with `8 * block + 8 <= width`, so the destination stays
    // inside the `width * 3` elements the caller promised (assuming
    // `write_rgb_u16_8` stores exactly 8 RGB pixels).
    unsafe {
        for block in 0..simd_blocks {
            let px = 8 * block;
            // Only the low eight lanes are consumed, so the second byte is a dummy.
            let packed = unpack_2bytes_sse41::<INVERT>(data[block], 0);
            let wide = expand_y_to_u16x8_sse41(packed);
            write_rgb_u16_8(wide, wide, wide, out.as_mut_ptr().add(px * 3));
        }
    }

    let done = 8 * simd_blocks;
    if done >= width {
        return;
    }
    let tail_src = &data[simd_blocks..];
    let tail_dst = &mut out[done * 3..width * 3];
    let remaining = width - done;
    if INVERT {
        scalar::monowhite_to_rgb_u16_row(tail_src, tail_dst, remaining);
    } else {
        scalar::monoblack_to_rgb_u16_row(tail_src, tail_dst, remaining);
    }
}
/// Expands a packed 1-bit row into interleaved RGBA stored as `u16` samples using SSE4.1.
///
/// One source byte (eight pixels) is unpacked per SIMD step and zero-extended
/// to 16 bits, so SIMD-produced colour samples are `0x0000` or `0x00FF`; the
/// alpha vector is `0x00FF` in every lane. Pixels left over after the last
/// full step are delegated to the scalar `monowhite_*` routine when `INVERT`
/// is `true`, otherwise to `monoblack_*`.
///
/// # Safety
///
/// * The executing CPU must support SSE4.1.
/// * `out` must hold at least `width * 4` elements. The SIMD stores go through
///   a raw pointer and the length is only checked by a `debug_assert!`.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn mono1bit_to_rgba_u16_row<const INVERT: bool>(
    data: &[u8],
    out: &mut [u16],
    width: usize,
) {
    debug_assert!(data.len() >= width.div_ceil(8));
    debug_assert!(out.len() >= width * 4);

    let simd_blocks = width / 8;
    // SAFETY: SSE4.1 is guaranteed by the caller. Each step starts at pixel
    // `8 * block` with `8 * block + 8 <= width`, so the destination stays
    // inside the `width * 4` elements the caller promised (assuming
    // `write_rgba_u16_8` stores exactly 8 RGBA pixels).
    unsafe {
        // The alpha vector does not depend on the pixel data; build it once.
        let opaque = _mm_set1_epi16(0x00FFu16 as i16);
        for block in 0..simd_blocks {
            let px = 8 * block;
            // Only the low eight lanes are consumed, so the second byte is a dummy.
            let packed = unpack_2bytes_sse41::<INVERT>(data[block], 0);
            let wide = expand_y_to_u16x8_sse41(packed);
            write_rgba_u16_8(wide, wide, wide, opaque, out.as_mut_ptr().add(px * 4));
        }
    }

    let done = 8 * simd_blocks;
    if done >= width {
        return;
    }
    let tail_src = &data[simd_blocks..];
    let tail_dst = &mut out[done * 4..width * 4];
    let remaining = width - done;
    if INVERT {
        scalar::monowhite_to_rgba_u16_row(tail_src, tail_dst, remaining);
    } else {
        scalar::monoblack_to_rgba_u16_row(tail_src, tail_dst, remaining);
    }
}
/// Expands a packed 1-bit row into one `u16` luma sample per pixel using SSE4.1.
///
/// One source byte (eight pixels) is unpacked per SIMD step, zero-extended to
/// 16 bits and stored, so SIMD-produced samples are `0x0000` or `0x00FF`.
/// Pixels left over after the last full step are delegated to the scalar
/// `monowhite_*` routine when `INVERT` is `true`, otherwise to `monoblack_*`.
///
/// # Safety
///
/// * The executing CPU must support SSE4.1.
/// * `out` must hold at least `width` elements. The SIMD stores go through a
///   raw pointer and the length is only checked by a `debug_assert!`.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn mono1bit_to_luma_u16_row<const INVERT: bool>(
    data: &[u8],
    out: &mut [u16],
    width: usize,
) {
    debug_assert!(data.len() >= width.div_ceil(8));
    debug_assert!(out.len() >= width);

    let simd_blocks = width / 8;
    // SAFETY: SSE4.1 is guaranteed by the caller. Each unaligned 16-byte store
    // covers eight `u16` elements starting at `8 * block`, and
    // `8 * block + 8 <= width`, which is within the promised `width` elements.
    unsafe {
        for block in 0..simd_blocks {
            let px = 8 * block;
            // Only the low eight lanes are consumed, so the second byte is a dummy.
            let packed = unpack_2bytes_sse41::<INVERT>(data[block], 0);
            let wide = expand_y_to_u16x8_sse41(packed);
            _mm_storeu_si128(out.as_mut_ptr().add(px).cast(), wide);
        }
    }

    let done = 8 * simd_blocks;
    if done >= width {
        return;
    }
    let tail_src = &data[simd_blocks..];
    let tail_dst = &mut out[done..width];
    let remaining = width - done;
    if INVERT {
        scalar::monowhite_to_luma_u16_row(tail_src, tail_dst, remaining);
    } else {
        scalar::monoblack_to_luma_u16_row(tail_src, tail_dst, remaining);
    }
}
/// Expands a packed 1-bit row into three planar 8-bit H, S and V rows using SSE4.1.
///
/// Two source bytes (sixteen pixels) are unpacked per SIMD step. The SIMD path
/// writes zero to `h` and `s` and the unpacked `0x00` / `0xFF` luma to `v`.
/// Pixels left over after the last full step are delegated to the scalar
/// `monowhite_*` routine when `INVERT` is `true`, otherwise to `monoblack_*`.
///
/// # Safety
///
/// * The executing CPU must support SSE4.1.
/// * `h`, `s` and `v` must each hold at least `width` bytes. The SIMD stores go
///   through raw pointers and the lengths are only checked by `debug_assert!`.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn mono1bit_to_hsv_row<const INVERT: bool>(
    data: &[u8],
    h: &mut [u8],
    s: &mut [u8],
    v: &mut [u8],
    width: usize,
) {
    debug_assert!(data.len() >= width.div_ceil(8));
    debug_assert!(h.len() >= width);
    debug_assert!(s.len() >= width);
    debug_assert!(v.len() >= width);

    let simd_blocks = width / 16;
    // SAFETY: SSE4.1 is guaranteed by the caller. Each unaligned 16-byte store
    // begins at `16 * block` with `16 * block + 16 <= width`, which is within
    // the `width` bytes the caller promised for every plane.
    unsafe {
        let no_colour = _mm_setzero_si128();
        for block in 0..simd_blocks {
            let src = 2 * block;
            let px = 16 * block;
            let value = unpack_2bytes_sse41::<INVERT>(data[src], data[src + 1]);
            _mm_storeu_si128(h.as_mut_ptr().add(px).cast(), no_colour);
            _mm_storeu_si128(s.as_mut_ptr().add(px).cast(), no_colour);
            _mm_storeu_si128(v.as_mut_ptr().add(px).cast(), value);
        }
    }

    let done = 16 * simd_blocks;
    if done >= width {
        return;
    }
    let tail_src = &data[2 * simd_blocks..];
    let tail_h = &mut h[done..width];
    let tail_s = &mut s[done..width];
    let tail_v = &mut v[done..width];
    let remaining = width - done;
    if INVERT {
        scalar::monowhite_to_hsv_row(tail_src, tail_h, tail_s, tail_v, remaining);
    } else {
        scalar::monoblack_to_hsv_row(tail_src, tail_h, tail_s, tail_v, remaining);
    }
}
/// Non-inverted entry point for 1-bit to 8-bit RGB conversion.
///
/// Forwards to `mono1bit_to_rgb_row` with `INVERT = false`, so in the SIMD
/// path a set source bit produces `0xFF` and a clear bit produces `0x00`.
///
/// # Safety
///
/// Same contract as `mono1bit_to_rgb_row`: SSE4.1 must be available and `out`
/// must hold at least `width * 3` bytes.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn monoblack_to_rgb_row(data: &[u8], out: &mut [u8], width: usize) {
// SAFETY: the caller upholds the contract of the generic implementation.
unsafe { mono1bit_to_rgb_row::<false>(data, out, width) }
}
/// Non-inverted entry point for 1-bit to 8-bit RGBA conversion.
///
/// Forwards to `mono1bit_to_rgba_row` with `INVERT = false`, so in the SIMD
/// path a set source bit produces `0xFF` and a clear bit produces `0x00`.
///
/// # Safety
///
/// Same contract as `mono1bit_to_rgba_row`: SSE4.1 must be available and `out`
/// must hold at least `width * 4` bytes.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn monoblack_to_rgba_row(data: &[u8], out: &mut [u8], width: usize) {
// SAFETY: the caller upholds the contract of the generic implementation.
unsafe { mono1bit_to_rgba_row::<false>(data, out, width) }
}
/// Non-inverted entry point for 1-bit to `u16` RGB conversion.
///
/// Forwards to `mono1bit_to_rgb_u16_row` with `INVERT = false`, so in the SIMD
/// path a set source bit produces `0x00FF` and a clear bit produces `0x0000`.
///
/// # Safety
///
/// Same contract as `mono1bit_to_rgb_u16_row`: SSE4.1 must be available and
/// `out` must hold at least `width * 3` elements.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn monoblack_to_rgb_u16_row(data: &[u8], out: &mut [u16], width: usize) {
// SAFETY: the caller upholds the contract of the generic implementation.
unsafe { mono1bit_to_rgb_u16_row::<false>(data, out, width) }
}
/// Non-inverted entry point for 1-bit to `u16` RGBA conversion.
///
/// Forwards to `mono1bit_to_rgba_u16_row` with `INVERT = false`, so in the
/// SIMD path a set source bit produces `0x00FF` and a clear bit `0x0000`.
///
/// # Safety
///
/// Same contract as `mono1bit_to_rgba_u16_row`: SSE4.1 must be available and
/// `out` must hold at least `width * 4` elements.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn monoblack_to_rgba_u16_row(data: &[u8], out: &mut [u16], width: usize) {
// SAFETY: the caller upholds the contract of the generic implementation.
unsafe { mono1bit_to_rgba_u16_row::<false>(data, out, width) }
}
/// Non-inverted entry point for 1-bit to 8-bit luma conversion.
///
/// Forwards to `mono1bit_to_luma_row` with `INVERT = false`, so in the SIMD
/// path a set source bit produces `0xFF` and a clear bit produces `0x00`.
///
/// # Safety
///
/// Same contract as `mono1bit_to_luma_row`: SSE4.1 must be available and `out`
/// must hold at least `width` bytes.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn monoblack_to_luma_row(data: &[u8], out: &mut [u8], width: usize) {
// SAFETY: the caller upholds the contract of the generic implementation.
unsafe { mono1bit_to_luma_row::<false>(data, out, width) }
}
/// Non-inverted entry point for 1-bit to `u16` luma conversion.
///
/// Forwards to `mono1bit_to_luma_u16_row` with `INVERT = false`, so in the
/// SIMD path a set source bit produces `0x00FF` and a clear bit `0x0000`.
///
/// # Safety
///
/// Same contract as `mono1bit_to_luma_u16_row`: SSE4.1 must be available and
/// `out` must hold at least `width` elements.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn monoblack_to_luma_u16_row(data: &[u8], out: &mut [u16], width: usize) {
// SAFETY: the caller upholds the contract of the generic implementation.
unsafe { mono1bit_to_luma_u16_row::<false>(data, out, width) }
}
/// Non-inverted entry point for 1-bit to planar 8-bit HSV conversion.
///
/// Forwards to `mono1bit_to_hsv_row` with `INVERT = false`; in the SIMD path
/// `h` and `s` receive zero and `v` receives `0xFF` for a set source bit.
///
/// # Safety
///
/// Same contract as `mono1bit_to_hsv_row`: SSE4.1 must be available and `h`,
/// `s` and `v` must each hold at least `width` bytes.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn monoblack_to_hsv_row(
data: &[u8],
h: &mut [u8],
s: &mut [u8],
v: &mut [u8],
width: usize,
) {
// SAFETY: the caller upholds the contract of the generic implementation.
unsafe { mono1bit_to_hsv_row::<false>(data, h, s, v, width) }
}
/// Inverted entry point for 1-bit to 8-bit RGB conversion.
///
/// Forwards to `mono1bit_to_rgb_row` with `INVERT = true`, so in the SIMD path
/// a clear source bit produces `0xFF` and a set bit produces `0x00`.
///
/// # Safety
///
/// Same contract as `mono1bit_to_rgb_row`: SSE4.1 must be available and `out`
/// must hold at least `width * 3` bytes.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn monowhite_to_rgb_row(data: &[u8], out: &mut [u8], width: usize) {
// SAFETY: the caller upholds the contract of the generic implementation.
unsafe { mono1bit_to_rgb_row::<true>(data, out, width) }
}
/// Inverted entry point for 1-bit to 8-bit RGBA conversion.
///
/// Forwards to `mono1bit_to_rgba_row` with `INVERT = true`, so in the SIMD
/// path a clear source bit produces `0xFF` and a set bit produces `0x00`.
///
/// # Safety
///
/// Same contract as `mono1bit_to_rgba_row`: SSE4.1 must be available and `out`
/// must hold at least `width * 4` bytes.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn monowhite_to_rgba_row(data: &[u8], out: &mut [u8], width: usize) {
// SAFETY: the caller upholds the contract of the generic implementation.
unsafe { mono1bit_to_rgba_row::<true>(data, out, width) }
}
/// Inverted entry point for 1-bit to `u16` RGB conversion.
///
/// Forwards to `mono1bit_to_rgb_u16_row` with `INVERT = true`, so in the SIMD
/// path a clear source bit produces `0x00FF` and a set bit produces `0x0000`.
///
/// # Safety
///
/// Same contract as `mono1bit_to_rgb_u16_row`: SSE4.1 must be available and
/// `out` must hold at least `width * 3` elements.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn monowhite_to_rgb_u16_row(data: &[u8], out: &mut [u16], width: usize) {
// SAFETY: the caller upholds the contract of the generic implementation.
unsafe { mono1bit_to_rgb_u16_row::<true>(data, out, width) }
}
/// Inverted entry point for 1-bit to `u16` RGBA conversion.
///
/// Forwards to `mono1bit_to_rgba_u16_row` with `INVERT = true`, so in the SIMD
/// path a clear source bit produces `0x00FF` and a set bit produces `0x0000`.
///
/// # Safety
///
/// Same contract as `mono1bit_to_rgba_u16_row`: SSE4.1 must be available and
/// `out` must hold at least `width * 4` elements.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn monowhite_to_rgba_u16_row(data: &[u8], out: &mut [u16], width: usize) {
// SAFETY: the caller upholds the contract of the generic implementation.
unsafe { mono1bit_to_rgba_u16_row::<true>(data, out, width) }
}
/// Inverted entry point for 1-bit to 8-bit luma conversion.
///
/// Forwards to `mono1bit_to_luma_row` with `INVERT = true`, so in the SIMD
/// path a clear source bit produces `0xFF` and a set bit produces `0x00`.
///
/// # Safety
///
/// Same contract as `mono1bit_to_luma_row`: SSE4.1 must be available and `out`
/// must hold at least `width` bytes.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn monowhite_to_luma_row(data: &[u8], out: &mut [u8], width: usize) {
// SAFETY: the caller upholds the contract of the generic implementation.
unsafe { mono1bit_to_luma_row::<true>(data, out, width) }
}
/// Inverted entry point for 1-bit to `u16` luma conversion.
///
/// Forwards to `mono1bit_to_luma_u16_row` with `INVERT = true`, so in the SIMD
/// path a clear source bit produces `0x00FF` and a set bit produces `0x0000`.
///
/// # Safety
///
/// Same contract as `mono1bit_to_luma_u16_row`: SSE4.1 must be available and
/// `out` must hold at least `width` elements.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn monowhite_to_luma_u16_row(data: &[u8], out: &mut [u16], width: usize) {
// SAFETY: the caller upholds the contract of the generic implementation.
unsafe { mono1bit_to_luma_u16_row::<true>(data, out, width) }
}
/// Inverted entry point for 1-bit to planar 8-bit HSV conversion.
///
/// Forwards to `mono1bit_to_hsv_row` with `INVERT = true`; in the SIMD path
/// `h` and `s` receive zero and `v` receives `0xFF` for a clear source bit.
///
/// # Safety
///
/// Same contract as `mono1bit_to_hsv_row`: SSE4.1 must be available and `h`,
/// `s` and `v` must each hold at least `width` bytes.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn monowhite_to_hsv_row(
data: &[u8],
h: &mut [u8],
s: &mut [u8],
v: &mut [u8],
width: usize,
) {
// SAFETY: the caller upholds the contract of the generic implementation.
unsafe { mono1bit_to_hsv_row::<true>(data, h, s, v, width) }
}