/// Expands a 1-bit-per-pixel row into one byte per pixel without SIMD.
///
/// Pixels are stored most-significant-bit first: pixel `px` is bit
/// `7 - (px % 8)` of `packed[px / 8]`. A set bit is written as `0xFF`, a clear
/// bit as `0x00`. Only the first `width` bytes of `out` are touched.
///
/// # Panics
///
/// Panics through slice indexing if `packed` has fewer than
/// `width.div_ceil(8)` bytes or `out` has fewer than `width` bytes. Debug
/// builds report both conditions up front via the assertions below.
#[inline]
pub(super) fn unpack_mono_row_scalar(packed: &[u8], width: usize, out: &mut [u8]) {
    debug_assert!(out.len() >= width);
    debug_assert!(packed.len() >= width.div_ceil(8));
    (0..width).for_each(|px| {
        // Read the source byte first, then write, so an undersized buffer
        // fails at the same pixel it always did.
        let source = packed[px >> 3];
        // Single-bit selector walking from the MSB (pixel 0) to the LSB (pixel 7).
        let selector = 0x80u8 >> (px & 7);
        out[px] = match source & selector {
            0 => 0x00,
            _ => 0xFF,
        };
    });
}
/// Expands two packed bytes (16 MSB-first pixels) into 16 output bytes with SSE2.
///
/// Output bytes 0..8 come from `b0` and bytes 8..16 from `b1`; within each
/// byte the most significant bit is the first pixel. A set bit produces `0xFF`
/// and a clear bit produces `0x00`. Exactly the first 16 bytes of `out` are
/// written.
///
/// # Panics
///
/// Panics if `out` holds fewer than 16 bytes. This is a hard check (not a
/// debug-only one) because the result is written with a raw 16-byte store.
///
/// # Safety
///
/// The executing CPU must support SSE2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn expand_two_bytes_sse2(b0: u8, b1: u8, out: &mut [u8]) {
    use std::arch::x86_64::{
        __m128i, _mm_and_si128, _mm_cmpeq_epi8, _mm_set_epi8, _mm_storeu_si128,
    };
    assert!(
        out.len() >= 16,
        "expand_two_bytes_sse2 requires at least 16 output bytes"
    );
    // SAFETY: SSE2 availability is the caller's obligation (see `# Safety`),
    // and the assertion above keeps the unaligned 16-byte store inside `out`.
    unsafe {
        // `_mm_set_epi8` takes lanes from 15 down to 0, so the LAST argument is
        // lane 0. Each group of eight lanes selects bits 0x80..0x01 in pixel order.
        #[expect(
            clippy::cast_possible_wrap,
            reason = "reinterpreting byte patterns as i8 for SIMD; bit patterns preserved"
        )]
        let mask: __m128i = _mm_set_epi8(
            0x01u8 as i8,
            0x02u8 as i8,
            0x04u8 as i8,
            0x08u8 as i8,
            0x10u8 as i8,
            0x20u8 as i8,
            0x40u8 as i8,
            0x80u8 as i8,
            0x01u8 as i8,
            0x02u8 as i8,
            0x04u8 as i8,
            0x08u8 as i8,
            0x10u8 as i8,
            0x20u8 as i8,
            0x40u8 as i8,
            0x80u8 as i8,
        );
        // Lanes 0..8 hold `b0`, lanes 8..16 hold `b1`.
        #[expect(
            clippy::cast_possible_wrap,
            reason = "reinterpreting byte as i8 for SIMD"
        )]
        let src: __m128i = _mm_set_epi8(
            b1 as i8, b1 as i8, b1 as i8, b1 as i8, b1 as i8, b1 as i8, b1 as i8, b1 as i8,
            b0 as i8, b0 as i8, b0 as i8, b0 as i8, b0 as i8, b0 as i8, b0 as i8, b0 as i8,
        );
        // Every mask lane has exactly one bit set, so `(src & mask) == mask`
        // holds precisely when that pixel's bit is set; the compare yields 0xFF
        // for those lanes and 0x00 otherwise.
        let isolated = _mm_and_si128(src, mask);
        let expanded = _mm_cmpeq_epi8(isolated, mask);
        _mm_storeu_si128(out.as_mut_ptr().cast(), expanded);
    }
}
/// Unpacks a 1-bit-per-pixel row into one byte per pixel, 16 pixels at a time.
///
/// Full groups of 16 pixels go through `expand_two_bytes_sse2`; the remaining
/// `width % 16` pixels, if any, are handled by `unpack_mono_row_scalar`.
///
/// # Panics
///
/// Panics if `packed` or `out` is too short for `width` pixels. Each SIMD step
/// is handed an exact 16-byte window of `out`, so an undersized output buffer
/// is caught by slice bounds checking instead of reaching the raw store.
///
/// # Safety
///
/// The executing CPU must support SSE2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn unpack_mono_row_sse2(packed: &[u8], width: usize, out: &mut [u8]) {
    let mut px = 0usize;
    while px + 16 <= width {
        // `px` is always a multiple of 16 here, so `px / 8` is a whole-byte offset.
        let byte_idx = px / 8;
        let b0 = packed[byte_idx];
        let b1 = packed[byte_idx + 1];
        // Bounds-checked, exactly 16 bytes long: the kernel can never write
        // outside `out`, even when the caller's buffer is shorter than `width`.
        let window = &mut out[px..px + 16];
        // SAFETY: SSE2 is guaranteed by this function's own safety contract,
        // and `window` is exactly 16 bytes.
        unsafe { expand_two_bytes_sse2(b0, b1, window) };
        px += 16;
    }
    if px < width {
        unpack_mono_row_scalar(&packed[px / 8..], width - px, &mut out[px..]);
    }
}
/// Expands two packed bytes (16 MSB-first pixels) into 16 output bytes with NEON.
///
/// Output bytes 0..8 come from `b0` and bytes 8..16 from `b1`; within each
/// byte the most significant bit is the first pixel. A set bit produces `0xFF`
/// and a clear bit produces `0x00`. Exactly the first 16 bytes of `out` are
/// written.
///
/// # Panics
///
/// Panics if `out` holds fewer than 16 bytes. This is a hard check (not a
/// debug-only one) because the result is written with a raw 16-byte store.
///
/// # Safety
///
/// The executing CPU must support NEON.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn expand_two_bytes_neon(b0: u8, b1: u8, out: &mut [u8]) {
    use std::arch::aarch64::{uint8x16_t, vcombine_u8, vdup_n_u8, vld1q_u8, vst1q_u8, vtstq_u8};
    assert!(
        out.len() >= 16,
        "expand_two_bytes_neon requires at least 16 output bytes"
    );
    // One single-bit selector per output lane, MSB first, repeated for both bytes.
    let bit_mask: [u8; 16] = [
        0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01, 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02,
        0x01,
    ];
    // SAFETY: `bit_mask` is a live 16-byte array, so the 16-byte load is in bounds.
    let mask: uint8x16_t = unsafe { vld1q_u8(bit_mask.as_ptr()) };
    // Low half is `b0` broadcast, high half is `b1` broadcast.
    let src: uint8x16_t = vcombine_u8(vdup_n_u8(b0), vdup_n_u8(b1));
    // `vtstq_u8` sets a lane to all ones when `src & mask` is non-zero.
    let result: uint8x16_t = vtstq_u8(src, mask);
    // SAFETY: the assertion above guarantees `out` has room for 16 bytes.
    unsafe { vst1q_u8(out.as_mut_ptr(), result) };
}
/// Unpacks a 1-bit-per-pixel row into one byte per pixel, 16 pixels at a time.
///
/// Full groups of 16 pixels go through `expand_two_bytes_neon`; the remaining
/// `width % 16` pixels, if any, are handled by `unpack_mono_row_scalar`.
///
/// # Panics
///
/// Panics if `packed` or `out` is too short for `width` pixels. Each SIMD step
/// is handed an exact 16-byte window of `out`, so an undersized output buffer
/// is caught by slice bounds checking instead of reaching the raw store.
///
/// # Safety
///
/// The executing CPU must support NEON.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn unpack_mono_row_neon(packed: &[u8], width: usize, out: &mut [u8]) {
    let mut px = 0usize;
    while px + 16 <= width {
        // `px` is always a multiple of 16 here, so `px / 8` is a whole-byte offset.
        let byte_idx = px / 8;
        let b0 = packed[byte_idx];
        let b1 = packed[byte_idx + 1];
        // Bounds-checked, exactly 16 bytes long: the kernel can never write
        // outside `out`, even when the caller's buffer is shorter than `width`.
        let window = &mut out[px..px + 16];
        // SAFETY: NEON is guaranteed by this function's own safety contract,
        // and `window` is exactly 16 bytes.
        unsafe { expand_two_bytes_neon(b0, b1, window) };
        px += 16;
    }
    if px < width {
        unpack_mono_row_scalar(&packed[px / 8..], width - px, &mut out[px..]);
    }
}
/// x86_64 row dispatcher: picks the SSE2 kernel or the scalar fallback.
///
/// The SSE2 kernel is used only when the row has at least 16 pixels (one full
/// SIMD group) and runtime detection reports SSE2; every other case goes
/// through `unpack_mono_row_scalar`.
#[cfg(target_arch = "x86_64")]
#[inline]
fn dispatch_unpack(packed: &[u8], width: usize, out: &mut [u8]) {
    // The width test comes first so narrow rows skip feature detection entirely.
    let use_simd = width >= 16 && is_x86_feature_detected!("sse2");
    if !use_simd {
        unpack_mono_row_scalar(packed, width, out);
        return;
    }
    // SAFETY: this point is reached only after `is_x86_feature_detected!("sse2")`
    // returned true, which is the target feature the callee is compiled with.
    unsafe { unpack_mono_row_sse2(packed, width, out) };
}
/// aarch64 row dispatcher: picks the NEON kernel or the scalar fallback.
///
/// The NEON kernel is used only when the row has at least 16 pixels (one full
/// SIMD group) and NEON is reported as available; every other case goes
/// through `unpack_mono_row_scalar`.
#[cfg(target_arch = "aarch64")]
#[inline]
fn dispatch_unpack(packed: &[u8], width: usize, out: &mut [u8]) {
    // Check the feature before entering a `#[target_feature(enable = "neon")]`
    // function, mirroring the x86_64 dispatcher's SSE2 check. The width test
    // comes first so narrow rows skip detection entirely.
    if width >= 16 && std::arch::is_aarch64_feature_detected!("neon") {
        // SAFETY: NEON availability was confirmed by the detection macro above.
        unsafe { unpack_mono_row_neon(packed, width, out) };
    } else {
        unpack_mono_row_scalar(packed, width, out);
    }
}
/// Row dispatcher for targets other than x86_64 and aarch64.
///
/// No SIMD kernel is defined for these targets in this file, so every row is
/// forwarded unchanged to `unpack_mono_row_scalar`.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
#[inline]
fn dispatch_unpack(packed: &[u8], width: usize, out: &mut [u8]) {
    unpack_mono_row_scalar(packed, width, out)
}
/// Unpacks one row of 1-bit-per-pixel data into one byte per pixel.
///
/// Buffer sizes are validated here, then the work is handed to the
/// platform-selected `dispatch_unpack`. `packed` is read for `width` pixels
/// (`width.div_ceil(8)` bytes) and the first `width` bytes of `out` are written.
///
/// # Panics
///
/// Panics if `out` holds fewer than `width` bytes or `packed` holds fewer than
/// `width.div_ceil(8)` bytes. These checks run in release builds too: this is
/// a safe function, and the implementations behind `dispatch_unpack` may write
/// through raw pointers, so undersized buffers must be rejected before dispatch.
pub fn unpack_mono_row(packed: &[u8], width: usize, out: &mut [u8]) {
    assert!(
        out.len() >= width,
        "output buffer too small for the requested width"
    );
    assert!(
        packed.len() >= width.div_ceil(8),
        "packed buffer too small for the requested width"
    );
    dispatch_unpack(packed, width, out);
}
/// Unit tests: the scalar path is checked against hand-written expectations,
/// and the dispatching and SIMD paths are checked against the scalar path.
#[cfg(test)]
mod tests {
    use super::*;

    /// Packs a slice of flags (zero / non-zero) into MSB-first bytes, one bit
    /// per flag; unused trailing bits of the last byte stay zero.
    fn make_packed(bits: &[u8]) -> Vec<u8> {
        let nbytes = bits.len().div_ceil(8);
        let mut packed = vec![0u8; nbytes];
        for (i, &b) in bits.iter().enumerate() {
            if b != 0 {
                packed[i / 8] |= 0x80 >> (i % 8);
            }
        }
        packed
    }

    #[test]
    fn scalar_all_zeros() {
        let bits = [0u8; 8];
        let packed = make_packed(&bits);
        // Pre-fill with a non-zero value so untouched bytes would be noticed.
        let mut out = [1u8; 8];
        unpack_mono_row_scalar(&packed, 8, &mut out);
        assert!(out.iter().all(|&b| b == 0), "expected all zeros");
    }

    #[test]
    fn scalar_all_ones() {
        let bits = [1u8; 8];
        let packed = make_packed(&bits);
        let mut out = [0u8; 8];
        unpack_mono_row_scalar(&packed, 8, &mut out);
        assert!(out.iter().all(|&b| b == 0xFF), "expected all 0xFF");
    }

    #[test]
    fn scalar_alternating() {
        // 0xAA = 1010_1010: the MSB is the first pixel.
        let packed = [0xAAu8];
        let mut out = [0u8; 8];
        unpack_mono_row_scalar(&packed, 8, &mut out);
        let expected = [0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00];
        assert_eq!(out, expected);
    }

    #[test]
    fn scalar_partial_byte() {
        // 0xD0 = 1101_0000; only the first five bits are consumed.
        let packed = [0xD0u8];
        let mut out = [0u8; 5];
        unpack_mono_row_scalar(&packed, 5, &mut out);
        assert_eq!(out, [0xFF, 0xFF, 0x00, 0xFF, 0x00]);
    }

    #[test]
    fn dispatch_matches_scalar_16_pixels() {
        let bits: Vec<u8> = (0..16).map(|i| u8::from(i % 3 == 0)).collect();
        let packed = make_packed(&bits);
        let mut expected = vec![0u8; 16];
        unpack_mono_row_scalar(&packed, 16, &mut expected);
        let mut got = vec![0u8; 16];
        unpack_mono_row(&packed, 16, &mut got);
        assert_eq!(got, expected, "dispatch 16-pixel mismatch");
    }

    #[test]
    fn dispatch_matches_scalar_large() {
        let bits: Vec<u8> = (0u8..128).map(|i| u8::from(i % 5 == 0)).collect();
        let packed = make_packed(&bits);
        let mut expected = vec![0u8; 128];
        unpack_mono_row_scalar(&packed, 128, &mut expected);
        let mut got = vec![0u8; 128];
        unpack_mono_row(&packed, 128, &mut got);
        assert_eq!(got, expected, "dispatch large mismatch");
    }

    #[test]
    fn dispatch_matches_scalar_non_multiple() {
        // Widths on both sides of the 8- and 16-pixel boundaries.
        for width in [1usize, 7, 9, 15, 17, 23, 33, 63, 65] {
            let bits: Vec<u8> = (0..width)
                .map(|i| u8::from(i.wrapping_mul(7) % 3 == 0))
                .collect();
            let packed = make_packed(&bits);
            let mut expected = vec![0u8; width];
            unpack_mono_row_scalar(&packed, width, &mut expected);
            let mut got = vec![0u8; width];
            unpack_mono_row(&packed, width, &mut got);
            assert_eq!(got, expected, "dispatch mismatch at width={width}");
        }
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn sse2_expand_two_bytes_known() {
        if !is_x86_feature_detected!("sse2") {
            return;
        }
        let mut out = [0u8; 16];
        // SAFETY: SSE2 was detected above and `out` is exactly 16 bytes.
        unsafe { expand_two_bytes_sse2(0xAA, 0x55, &mut out) };
        let expected = [
            0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF,
            0x00, 0xFF,
        ];
        assert_eq!(out, expected, "SSE2 two-byte expand mismatch");
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn sse2_matches_scalar_random() {
        if !is_x86_feature_detected!("sse2") {
            return;
        }
        let packed = [0b1001_1010u8, 0b0110_0101u8, 0b1111_0000u8, 0b0000_1111u8];
        let width = 32usize;
        let mut expected = vec![0u8; width];
        unpack_mono_row_scalar(&packed, width, &mut expected);
        let mut got = vec![0u8; width];
        // SAFETY: SSE2 was detected above; `packed` covers 32 pixels and `got`
        // holds `width` bytes.
        unsafe { unpack_mono_row_sse2(&packed, width, &mut got) };
        assert_eq!(got, expected, "SSE2 row unpack mismatch");
    }

    /// SSE2 counterpart of `neon_boundary_bytes`: all-clear and all-set inputs.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn sse2_boundary_bytes() {
        if !is_x86_feature_detected!("sse2") {
            return;
        }
        for (b0, b1, desc) in [(0x00u8, 0x00u8, "all-zero"), (0xFFu8, 0xFFu8, "all-one")] {
            let mut sse2_out = [0u8; 16];
            let mut scalar_out = [0u8; 16];
            let packed = [b0, b1];
            // SAFETY: SSE2 was detected above and `sse2_out` is exactly 16 bytes.
            unsafe { expand_two_bytes_sse2(b0, b1, &mut sse2_out) };
            unpack_mono_row_scalar(&packed, 16, &mut scalar_out);
            assert_eq!(sse2_out, scalar_out, "SSE2 boundary mismatch ({desc})");
        }
    }

    #[cfg(target_arch = "aarch64")]
    #[test]
    fn neon_expand_two_bytes_known() {
        let mut out = [0u8; 16];
        // SAFETY: `out` is exactly 16 bytes.
        // NOTE(review): assumes NEON is available wherever this test runs; there
        // is no runtime detection here - confirm for the supported targets.
        unsafe { expand_two_bytes_neon(0xAA, 0x55, &mut out) };
        let expected = [
            0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF,
            0x00, 0xFF,
        ];
        assert_eq!(out, expected, "NEON two-byte expand mismatch");
    }

    #[cfg(target_arch = "aarch64")]
    #[test]
    fn neon_matches_scalar_random() {
        let packed = [0b1001_1010u8, 0b0110_0101u8, 0b1111_0000u8, 0b0000_1111u8];
        let width = 32usize;
        let mut expected = vec![0u8; width];
        unpack_mono_row_scalar(&packed, width, &mut expected);
        let mut got = vec![0u8; width];
        // SAFETY: `packed` covers 32 pixels and `got` holds `width` bytes.
        // NOTE(review): assumes NEON is available wherever this test runs.
        unsafe { unpack_mono_row_neon(&packed, width, &mut got) };
        assert_eq!(got, expected, "NEON row unpack mismatch");
    }

    #[cfg(target_arch = "aarch64")]
    #[test]
    fn neon_boundary_bytes() {
        for (b0, b1, desc) in [(0x00u8, 0x00u8, "all-zero"), (0xFFu8, 0xFFu8, "all-one")] {
            let mut neon_out = [0u8; 16];
            let mut scalar_out = [0u8; 16];
            let packed = [b0, b1];
            // SAFETY: `neon_out` is exactly 16 bytes.
            // NOTE(review): assumes NEON is available wherever this test runs.
            unsafe { expand_two_bytes_neon(b0, b1, &mut neon_out) };
            unpack_mono_row_scalar(&packed, 16, &mut scalar_out);
            assert_eq!(neon_out, scalar_out, "NEON boundary mismatch ({desc})");
        }
    }
}