/// Writes `color` into each of the first `count` three-byte pixels of `dst`.
///
/// Bytes beyond `count * 3` are left untouched.
///
/// # Panics
///
/// Panics if `dst` holds fewer than `count * 3` bytes: debug builds trip the
/// `debug_assert!`, release builds fail the slice index.
#[inline]
pub(super) fn blend_solid_rgb8_scalar(dst: &mut [u8], color: [u8; 3], count: usize) {
    let byte_len = count * 3;
    debug_assert!(
        dst.len() >= byte_len,
        "dst too short: {} < {}",
        dst.len(),
        byte_len
    );
    dst[..byte_len]
        .chunks_exact_mut(3)
        .for_each(|pixel| pixel.copy_from_slice(&color));
}
/// Sets the first `count` bytes of `dst` to `color`; later bytes are untouched.
///
/// # Panics
///
/// Panics if `dst` holds fewer than `count` bytes: debug builds trip the
/// `debug_assert!`, release builds fail the slice index.
#[inline]
pub(super) fn blend_solid_gray8_scalar(dst: &mut [u8], color: u8, count: usize) {
    let available = dst.len();
    debug_assert!(
        available >= count,
        "dst too short: {} < {}",
        available,
        count
    );
    for px in &mut dst[..count] {
        *px = color;
    }
}
/// Fills the first `count` RGB pixels of `dst` with `color` using 256-bit
/// unaligned stores.
///
/// Three 32-byte vectors together hold 32 whole pixels (96 bytes) of the
/// repeating `r, g, b` pattern. Each loop iteration stores one such group, and
/// the scalar routine finishes the remaining `count % 32` pixels.
///
/// # Safety
///
/// - The executing CPU must support AVX2.
/// - `dst.len()` must be at least `count * 3`: the vector stores go through a
///   raw pointer and the length is only verified by a `debug_assert!`.
#[cfg(all(target_arch = "x86_64", feature = "simd-avx2"))]
#[target_feature(enable = "avx2")]
unsafe fn blend_solid_rgb8_avx2(dst: &mut [u8], color: [u8; 3], count: usize) {
    use std::arch::x86_64::{__m256i, _mm256_loadu_si256, _mm256_storeu_si256};

    const PIXELS_PER_GROUP: usize = 32;
    const BYTES_PER_GROUP: usize = PIXELS_PER_GROUP * 3;

    debug_assert!(
        dst.len() >= count * 3,
        "dst too short for AVX2 RGB fill: {} < {}",
        dst.len(),
        count * 3
    );

    // 96 bytes of r, g, b, r, g, b, ... — a whole number of pixels, so the
    // pattern repeats seamlessly from one group to the next.
    let pattern: [u8; BYTES_PER_GROUP] = std::array::from_fn(|idx| color[idx % 3]);
    let src = pattern.as_ptr();
    // SAFETY: `pattern` is 96 bytes long, so the three unaligned 32-byte loads
    // at offsets 0, 32 and 64 stay in bounds.
    let (lo, mid, hi): (__m256i, __m256i, __m256i) = unsafe {
        (
            _mm256_loadu_si256(src.cast()),
            _mm256_loadu_si256(src.add(32).cast()),
            _mm256_loadu_si256(src.add(64).cast()),
        )
    };

    let groups = count / PIXELS_PER_GROUP;
    let out = dst.as_mut_ptr();
    for group in 0..groups {
        // SAFETY: the caller guarantees `dst.len() >= count * 3`, and
        // `(group + 1) * 96 <= count * 3` for every `group < count / 32`.
        unsafe {
            let base = out.add(group * BYTES_PER_GROUP);
            _mm256_storeu_si256(base.cast(), lo);
            _mm256_storeu_si256(base.add(32).cast(), mid);
            _mm256_storeu_si256(base.add(64).cast(), hi);
        }
    }

    // Pixels not covered by a full 32-pixel group.
    let vector_pixels = groups * PIXELS_PER_GROUP;
    blend_solid_rgb8_scalar(&mut dst[vector_pixels * 3..], color, count - vector_pixels);
}
/// Fills the first `count` bytes of `dst` with `color` using 256-bit unaligned
/// stores, then finishes the last `count % 32` bytes with a plain slice fill.
///
/// # Safety
///
/// - The executing CPU must support AVX2.
/// - `dst.len()` must be at least `count`: the vector stores go through a raw
///   pointer and the length is only verified by a `debug_assert!`.
#[cfg(all(target_arch = "x86_64", feature = "simd-avx2"))]
#[target_feature(enable = "avx2")]
unsafe fn blend_solid_gray8_avx2(dst: &mut [u8], color: u8, count: usize) {
    use std::arch::x86_64::{_mm256_set1_epi8, _mm256_storeu_si256};

    debug_assert!(
        dst.len() >= count,
        "dst too short for AVX2 gray fill: {} < {}",
        dst.len(),
        count
    );

    #[expect(
        clippy::cast_possible_wrap,
        reason = "reinterpreting byte as i8 for SIMD; bit pattern preserved"
    )]
    let splat = _mm256_set1_epi8(color as i8);

    // Number of bytes covered by whole 32-byte vectors.
    let vector_bytes = count - count % 32;
    let out = dst.as_mut_ptr();
    let mut offset = 0usize;
    while offset < vector_bytes {
        // SAFETY: `offset + 32 <= vector_bytes <= count`, and the caller
        // guarantees `dst.len() >= count`.
        unsafe { _mm256_storeu_si256(out.add(offset).cast(), splat) };
        offset += 32;
    }

    dst[vector_bytes..count].fill(color);
}
/// Pixel-count threshold for the MOVDIR64B fill paths: the x86_64 dispatchers
/// only consider those paths when `count` is strictly greater than this value.
#[cfg(target_arch = "x86_64")]
const MOVDIR64B_THRESHOLD_PX: usize = 256;
#[cfg(target_arch = "x86_64")]
fn has_movdir64b() -> bool {
use std::sync::OnceLock;
static CACHE: OnceLock<bool> = OnceLock::new();
*CACHE.get_or_init(|| {
let result = std::arch::x86_64::__cpuid_count(7, 0);
(result.ecx >> 28) & 1 != 0
})
}
/// Returns how many bytes lie between `ptr` and the next `align`-byte boundary,
/// capped at `limit`.
///
/// `align_offset` reports `usize::MAX` when it cannot compute an offset.
/// Clamping with `min` turns that into `limit`, so the caller handles the whole
/// range as preamble.
#[cfg(target_arch = "x86_64")]
#[inline]
fn preamble_len(ptr: *const u8, limit: usize, align: usize) -> usize {
    ptr.align_offset(align).min(limit)
}
/// Fills the first `count` RGB pixels of `dst` with `color` using MOVDIR64B
/// 64-byte direct stores.
///
/// The fill runs in three phases:
/// 1. a scalar preamble up to the first 64-byte-aligned destination address,
/// 2. whole 192-byte blocks (64 pixels, three cache lines) written with
///    `movdir64b` from a pre-rotated pattern tile,
/// 3. a scalar tail for whatever is left.
///
/// Every byte at offset `o` ends up holding `color[o % 3]`.
///
/// # Safety
///
/// - The executing CPU must support MOVDIR64B; the instruction is emitted
///   unconditionally through inline assembly.
/// - `dst.len()` must be at least `count * 3`: the direct stores go through a
///   raw pointer and the length is only verified by a `debug_assert!`.
#[cfg(target_arch = "x86_64")]
unsafe fn blend_solid_rgb8_movdir64b(dst: &mut [u8], color: [u8; 3], count: usize) {
    // Three cache lines of the repeating pattern (192 = 64 * 3 bytes).
    #[repr(align(64))]
    struct Tile([u8; 192]);

    let byte_count = count * 3;
    debug_assert!(
        dst.len() >= byte_count,
        "dst too short for movdir64b RGB fill: {} < {}",
        dst.len(),
        byte_count,
    );

    // Phase 1: MOVDIR64B needs a 64-byte-aligned destination, so write the
    // bytes before the first aligned address one at a time.
    let preamble = preamble_len(dst.as_ptr(), byte_count, 64);
    debug_assert!(preamble <= byte_count, "preamble_len exceeded byte_count");
    for (offset, byte) in dst[..preamble].iter_mut().enumerate() {
        *byte = color[offset % 3];
    }

    // The first block byte continues the pattern where the preamble stopped.
    // Because 192 is a multiple of 3, one rotated tile serves every block.
    let phase = preamble % 3;
    let tile = Tile(std::array::from_fn(|k| color[(phase + k) % 3]));

    // Phase 2: whole 192-byte blocks.
    let blocks = (byte_count - preamble) / 192;
    if blocks > 0 {
        let src0 = tile.0.as_ptr();
        let src1 = tile.0[64..].as_ptr();
        let src2 = tile.0[128..].as_ptr();
        // Take the raw pointer only after the safe preamble writes, so no
        // reborrow of `dst` sits between its creation and its last use.
        let dst_ptr = dst.as_mut_ptr();
        for blk in 0..blocks {
            // SAFETY: the caller guarantees MOVDIR64B support and
            // `dst.len() >= byte_count`. Each destination line starts at
            // `preamble + blk * 192 (+ 64 | + 128)`, which is 64-byte aligned,
            // and ends at or before `byte_count`. Each source line lies inside
            // the 192-byte tile.
            unsafe {
                let dst_base = dst_ptr.add(preamble + blk * 192);
                std::arch::asm!(
                    "movdir64b {d0}, [{s0}]",
                    "movdir64b {d1}, [{s1}]",
                    "movdir64b {d2}, [{s2}]",
                    d0 = in(reg) dst_base,
                    d1 = in(reg) dst_base.add(64),
                    d2 = in(reg) dst_base.add(128),
                    s0 = in(reg) src0,
                    s1 = in(reg) src1,
                    s2 = in(reg) src2,
                    options(nostack, preserves_flags),
                );
            }
        }
        // Direct stores are weakly ordered with respect to other stores. Fence
        // them before handing the buffer back to code that assumes ordinary
        // store ordering.
        // SAFETY: `sfence` has no operands and no preconditions on x86_64.
        unsafe {
            std::arch::asm!("sfence", options(nostack, preserves_flags));
        }
    }

    // Phase 3: scalar tail after the last whole block.
    let tail_start = preamble + blocks * 192;
    for (rel, byte) in dst[tail_start..byte_count].iter_mut().enumerate() {
        *byte = color[(tail_start + rel) % 3];
    }
}
/// Fills the first `count` bytes of `dst` with `color` using MOVDIR64B 64-byte
/// direct stores.
///
/// Bytes before the first 64-byte-aligned destination address and bytes after
/// the last whole 64-byte block are written with ordinary slice fills.
///
/// # Safety
///
/// - The executing CPU must support MOVDIR64B; the instruction is emitted
///   unconditionally through inline assembly.
/// - `dst.len()` must be at least `count`: the direct stores go through a raw
///   pointer and the length is only verified by a `debug_assert!`.
#[cfg(target_arch = "x86_64")]
unsafe fn blend_solid_gray8_movdir64b(dst: &mut [u8], color: u8, count: usize) {
    // One cache line of the fill value, used as the MOVDIR64B source.
    #[repr(align(64))]
    struct Tile([u8; 64]);

    debug_assert!(
        dst.len() >= count,
        "dst too short for movdir64b gray fill: {} < {}",
        dst.len(),
        count,
    );

    // MOVDIR64B needs a 64-byte-aligned destination, so fill up to the first
    // aligned address the ordinary way.
    let preamble = preamble_len(dst.as_ptr(), count, 64);
    debug_assert!(preamble <= count, "preamble_len exceeded count");
    dst[..preamble].fill(color);

    let blocks = (count - preamble) / 64;
    if blocks > 0 {
        let tile = Tile([color; 64]);
        let src = tile.0.as_ptr();
        // Take the raw pointer only after the safe preamble fill, so no
        // reborrow of `dst` sits between its creation and its last use.
        let dst_ptr = dst.as_mut_ptr();
        for blk in 0..blocks {
            // SAFETY: the caller guarantees MOVDIR64B support and
            // `dst.len() >= count`. The destination line starts at
            // `preamble + blk * 64`, which is 64-byte aligned, and ends at or
            // before `count`. The source is the 64-byte tile.
            unsafe {
                let dst_blk = dst_ptr.add(preamble + blk * 64);
                std::arch::asm!(
                    "movdir64b {dst}, [{src}]",
                    dst = in(reg) dst_blk,
                    src = in(reg) src,
                    options(nostack, preserves_flags),
                );
            }
        }
        // Direct stores are weakly ordered with respect to other stores. Fence
        // them before handing the buffer back to code that assumes ordinary
        // store ordering.
        // SAFETY: `sfence` has no operands and no preconditions on x86_64.
        unsafe {
            std::arch::asm!("sfence", options(nostack, preserves_flags));
        }
    }

    // Whatever is left after the last whole 64-byte block.
    let tail_start = preamble + blocks * 64;
    dst[tail_start..count].fill(color);
}
/// Fills the first `count` RGB pixels of `dst` with `color` using NEON
/// interleaving stores.
///
/// Each `vst3q_u8` writes 16 pixels (48 bytes) by interleaving three vectors
/// that hold the red, green and blue byte respectively. The scalar routine
/// finishes the remaining `count % 16` pixels.
///
/// # Safety
///
/// - The executing CPU must support NEON.
/// - `dst.len()` must be at least `count * 3`: the vector stores go through a
///   raw pointer and the length is only verified by a `debug_assert!`.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn blend_solid_rgb8_neon(dst: &mut [u8], color: [u8; 3], count: usize) {
    use std::arch::aarch64::{uint8x16x3_t, vdupq_n_u8, vst3q_u8};

    debug_assert!(
        dst.len() >= count * 3,
        "dst too short for NEON RGB fill: {} < {}",
        dst.len(),
        count * 3
    );

    let planes = uint8x16x3_t(
        vdupq_n_u8(color[0]),
        vdupq_n_u8(color[1]),
        vdupq_n_u8(color[2]),
    );

    // Number of pixels covered by whole 16-pixel stores.
    let vector_pixels = count - count % 16;
    let out = dst.as_mut_ptr();
    for first_px in (0..vector_pixels).step_by(16) {
        // SAFETY: `(first_px + 16) * 3 <= count * 3`, and the caller
        // guarantees `dst.len() >= count * 3`.
        unsafe { vst3q_u8(out.add(first_px * 3), planes) };
    }

    blend_solid_rgb8_scalar(&mut dst[vector_pixels * 3..], color, count - vector_pixels);
}
/// Fills the first `count` bytes of `dst` with `color` using 16-byte NEON
/// stores, then lets the scalar routine finish the last `count % 16` bytes.
///
/// # Safety
///
/// - The executing CPU must support NEON.
/// - `dst.len()` must be at least `count`: the vector stores go through a raw
///   pointer and the length is only verified by a `debug_assert!`.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn blend_solid_gray8_neon(dst: &mut [u8], color: u8, count: usize) {
    use std::arch::aarch64::{vdupq_n_u8, vst1q_u8};

    debug_assert!(
        dst.len() >= count,
        "dst too short for NEON gray fill: {} < {}",
        dst.len(),
        count
    );

    let splat = vdupq_n_u8(color);

    // Number of bytes covered by whole 16-byte stores.
    let vector_bytes = count - count % 16;
    let out = dst.as_mut_ptr();
    for offset in (0..vector_bytes).step_by(16) {
        // SAFETY: `offset + 16 <= count`, and the caller guarantees
        // `dst.len() >= count`.
        unsafe { vst1q_u8(out.add(offset), splat) };
    }

    blend_solid_gray8_scalar(&mut dst[vector_bytes..], color, count - vector_bytes);
}
/// x86_64 dispatcher for the RGB fill.
///
/// Preference order: MOVDIR64B for fills above [`MOVDIR64B_THRESHOLD_PX`]
/// pixels on CPUs that advertise it, then AVX2 (when the `simd-avx2` feature is
/// compiled in and at least 32 pixels are requested), then the scalar routine.
#[cfg(target_arch = "x86_64")]
#[inline]
fn dispatch_blend_rgb8(dst: &mut [u8], color: [u8; 3], count: usize) {
    let use_direct_store = count > MOVDIR64B_THRESHOLD_PX && has_movdir64b();
    if use_direct_store {
        // SAFETY: `has_movdir64b()` just confirmed instruction support. The
        // length requirement is the caller's; `blend_solid_rgb8` asserts it.
        return unsafe { blend_solid_rgb8_movdir64b(dst, color, count) };
    }

    #[cfg(feature = "simd-avx2")]
    {
        let wide_enough = count >= 32;
        if wide_enough && is_x86_feature_detected!("avx2") {
            // SAFETY: AVX2 support was just detected at runtime. The length
            // requirement is the caller's; `blend_solid_rgb8` asserts it.
            return unsafe { blend_solid_rgb8_avx2(dst, color, count) };
        }
    }

    blend_solid_rgb8_scalar(dst, color, count);
}
/// aarch64 dispatcher for the RGB fill: NEON for 16 or more pixels, scalar
/// below that.
#[cfg(target_arch = "aarch64")]
#[inline]
fn dispatch_blend_rgb8(dst: &mut [u8], color: [u8; 3], count: usize) {
    if count < 16 {
        return blend_solid_rgb8_scalar(dst, color, count);
    }
    // SAFETY: NOTE(review): no runtime NEON check is made here; this presumably
    // relies on NEON being part of the build's aarch64 baseline — confirm. The
    // length requirement is the caller's; `blend_solid_rgb8` asserts it.
    unsafe { blend_solid_rgb8_neon(dst, color, count) }
}
/// Fallback dispatcher for targets other than x86_64 and aarch64: always runs
/// the scalar RGB fill.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
#[inline]
fn dispatch_blend_rgb8(dst: &mut [u8], color: [u8; 3], count: usize) {
    blend_solid_rgb8_scalar(dst, color, count);
}
/// x86_64 dispatcher for the grayscale fill.
///
/// Preference order: MOVDIR64B for fills above [`MOVDIR64B_THRESHOLD_PX`]
/// pixels on CPUs that advertise it, then AVX2 (when the `simd-avx2` feature is
/// compiled in and at least 32 pixels are requested), then the scalar routine.
#[cfg(target_arch = "x86_64")]
#[inline]
fn dispatch_blend_gray8(dst: &mut [u8], color: u8, count: usize) {
    let use_direct_store = count > MOVDIR64B_THRESHOLD_PX && has_movdir64b();
    if use_direct_store {
        // SAFETY: `has_movdir64b()` just confirmed instruction support. The
        // length requirement is the caller's; `blend_solid_gray8` asserts it.
        return unsafe { blend_solid_gray8_movdir64b(dst, color, count) };
    }

    #[cfg(feature = "simd-avx2")]
    {
        let wide_enough = count >= 32;
        if wide_enough && is_x86_feature_detected!("avx2") {
            // SAFETY: AVX2 support was just detected at runtime. The length
            // requirement is the caller's; `blend_solid_gray8` asserts it.
            return unsafe { blend_solid_gray8_avx2(dst, color, count) };
        }
    }

    blend_solid_gray8_scalar(dst, color, count);
}
/// aarch64 dispatcher for the grayscale fill: NEON for 16 or more pixels,
/// scalar below that.
#[cfg(target_arch = "aarch64")]
#[inline]
fn dispatch_blend_gray8(dst: &mut [u8], color: u8, count: usize) {
    if count < 16 {
        return blend_solid_gray8_scalar(dst, color, count);
    }
    // SAFETY: NOTE(review): no runtime NEON check is made here; this presumably
    // relies on NEON being part of the build's aarch64 baseline — confirm. The
    // length requirement is the caller's; `blend_solid_gray8` asserts it.
    unsafe { blend_solid_gray8_neon(dst, color, count) }
}
/// Fallback dispatcher for targets other than x86_64 and aarch64: always runs
/// the scalar grayscale fill.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
#[inline]
fn dispatch_blend_gray8(dst: &mut [u8], color: u8, count: usize) {
    blend_solid_gray8_scalar(dst, color, count);
}
/// Overwrites the first `count` RGB pixels (three bytes each) of `dst` with
/// `color`.
///
/// After validating the length, the call is forwarded to
/// `dispatch_blend_rgb8`, which selects an implementation for the target.
/// Bytes past `count * 3` are left untouched.
///
/// # Panics
///
/// Panics if `dst` is shorter than `count * 3` bytes, including the case where
/// `count * 3` does not fit in a `usize`.
pub fn blend_solid_rgb8(dst: &mut [u8], color: [u8; 3], count: usize) {
    // In builds without overflow checks a wrapped `count * 3` would pass the
    // length check, while the SIMD paths still derive their trip counts from
    // the unwrapped `count` and write through raw pointers. Reject it here.
    let Some(needed) = count.checked_mul(3) else {
        panic!(
            "blend_solid_rgb8: dst too short ({} < 3 * {}, which overflows usize)",
            dst.len(),
            count,
        );
    };
    assert!(
        dst.len() >= needed,
        "blend_solid_rgb8: dst too short ({} < {})",
        dst.len(),
        needed,
    );
    dispatch_blend_rgb8(dst, color, count);
}
/// Overwrites the first `count` bytes of `dst` with the gray level `color`.
///
/// After validating the length, the call is forwarded to
/// `dispatch_blend_gray8`, which selects an implementation for the target.
/// Bytes past `count` are left untouched.
///
/// # Panics
///
/// Panics if `dst` is shorter than `count` bytes.
pub fn blend_solid_gray8(dst: &mut [u8], color: u8, count: usize) {
    let available = dst.len();
    if available < count {
        panic!(
            "blend_solid_gray8: dst too short ({} < {})",
            available, count,
        );
    }
    dispatch_blend_gray8(dst, color, count);
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Reference RGB output: the scalar routine applied to a zeroed buffer of
    /// exactly `count` pixels.
    fn scalar_rgb(color: [u8; 3], count: usize) -> Vec<u8> {
        let mut reference = vec![0u8; count * 3];
        blend_solid_rgb8_scalar(&mut reference, color, count);
        reference
    }

    /// Reference grayscale output: the scalar routine applied to a zeroed
    /// buffer of exactly `count` bytes.
    fn scalar_gray(color: u8, count: usize) -> Vec<u8> {
        let mut reference = vec![0u8; count];
        blend_solid_gray8_scalar(&mut reference, color, count);
        reference
    }

    // ---- scalar routines ----

    #[test]
    fn scalar_rgb8_small() {
        let mut dst = vec![0u8; 9];
        blend_solid_rgb8_scalar(&mut dst, [10, 20, 30], 3);
        assert_eq!(dst, [10, 20, 30, 10, 20, 30, 10, 20, 30]);
    }

    #[test]
    fn scalar_rgb8_zero_count() {
        let mut dst = vec![0u8; 3];
        blend_solid_rgb8_scalar(&mut dst, [1, 2, 3], 0);
        assert_eq!(dst, [0, 0, 0]);
    }

    #[test]
    fn scalar_gray8() {
        let mut dst = vec![0u8; 5];
        blend_solid_gray8_scalar(&mut dst, 42, 5);
        assert!(dst.iter().all(|&byte| byte == 42));
    }

    // ---- public entry points vs. scalar reference ----

    #[test]
    fn dispatch_rgb8_matches_scalar() {
        let (color, count) = ([100u8, 150, 200], 64usize);
        let mut got = vec![0u8; count * 3];
        blend_solid_rgb8(&mut got, color, count);
        assert_eq!(got, scalar_rgb(color, count), "dispatch_rgb8 mismatch");
    }

    #[test]
    fn dispatch_gray8_matches_scalar() {
        let count = 128usize;
        let mut got = vec![0u8; count];
        blend_solid_gray8(&mut got, 77, count);
        assert_eq!(got, scalar_gray(77, count), "dispatch_gray8 mismatch");
    }

    #[test]
    fn dispatch_rgb8_tail_handled() {
        let (color, count) = ([7u8, 8, 9], 35usize);
        let mut got = vec![0u8; count * 3];
        blend_solid_rgb8(&mut got, color, count);
        assert_eq!(got, scalar_rgb(color, count), "tail mismatch");
    }

    #[test]
    fn dispatch_rgb8_exact_32_pixels() {
        let (color, count) = ([255u8, 0, 128], 32usize);
        let mut got = vec![0u8; count * 3];
        blend_solid_rgb8(&mut got, color, count);
        assert_eq!(got, scalar_rgb(color, count), "exact 32-pixel mismatch");
    }

    // ---- AVX2 paths, called directly ----

    #[cfg(all(target_arch = "x86_64", feature = "simd-avx2"))]
    #[test]
    fn avx2_rgb8_matches_scalar_direct() {
        if !is_x86_feature_detected!("avx2") {
            return;
        }
        let (color, count) = ([11u8, 22, 33], 96usize);
        let mut got = vec![0u8; count * 3];
        unsafe { blend_solid_rgb8_avx2(&mut got, color, count) };
        assert_eq!(got, scalar_rgb(color, count), "AVX2 RGB path mismatch");
    }

    #[cfg(all(target_arch = "x86_64", feature = "simd-avx2"))]
    #[test]
    fn avx2_gray8_matches_scalar_direct() {
        if !is_x86_feature_detected!("avx2") {
            return;
        }
        let count = 128usize;
        let mut got = vec![0u8; count];
        unsafe { blend_solid_gray8_avx2(&mut got, 200, count) };
        assert_eq!(got, scalar_gray(200, count), "AVX2 gray path mismatch");
    }

    // ---- large fills through the x86_64 dispatchers ----

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn dispatch_rgb8_large_matches_scalar() {
        let (color, count) = ([77u8, 133, 211], 384usize);
        let mut got = vec![0u8; count * 3];
        blend_solid_rgb8(&mut got, color, count);
        assert_eq!(got, scalar_rgb(color, count), "large RGB dispatch mismatch");
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn dispatch_gray8_large_matches_scalar() {
        let count = 512usize;
        let mut got = vec![0u8; count];
        blend_solid_gray8(&mut got, 99, count);
        assert_eq!(got, scalar_gray(99, count), "large gray dispatch mismatch");
    }

    // ---- MOVDIR64B paths, called directly (skipped without CPU support) ----

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn movdir64b_rgb8_matches_scalar() {
        if !has_movdir64b() {
            return;
        }
        let (color, count) = ([11u8, 22, 33], 512usize);
        let mut got = vec![0u8; count * 3];
        unsafe { blend_solid_rgb8_movdir64b(&mut got, color, count) };
        assert_eq!(got, scalar_rgb(color, count), "movdir64b RGB mismatch");
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn movdir64b_gray8_matches_scalar() {
        if !has_movdir64b() {
            return;
        }
        let count = 512usize;
        let mut got = vec![0u8; count];
        unsafe { blend_solid_gray8_movdir64b(&mut got, 200, count) };
        assert_eq!(got, scalar_gray(200, count), "movdir64b gray mismatch");
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn movdir64b_rgb8_odd_count() {
        if !has_movdir64b() {
            return;
        }
        let (color, count) = ([3u8, 7, 11], 257usize);
        let mut got = vec![0u8; count * 3];
        unsafe { blend_solid_rgb8_movdir64b(&mut got, color, count) };
        assert_eq!(
            got,
            scalar_rgb(color, count),
            "movdir64b RGB odd-count mismatch"
        );
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn movdir64b_gray8_odd_count() {
        if !has_movdir64b() {
            return;
        }
        let count = 259usize;
        let mut got = vec![0u8; count];
        unsafe { blend_solid_gray8_movdir64b(&mut got, 17, count) };
        assert_eq!(
            got,
            scalar_gray(17, count),
            "movdir64b gray odd-count mismatch"
        );
    }

    // ---- NEON paths, called directly ----

    #[cfg(target_arch = "aarch64")]
    #[test]
    fn neon_rgb8_exact_16_pixels() {
        let (color, count) = ([11u8, 22, 33], 16usize);
        let mut got = vec![0u8; count * 3];
        unsafe { blend_solid_rgb8_neon(&mut got, color, count) };
        assert_eq!(got, scalar_rgb(color, count), "NEON RGB 16-pixel mismatch");
    }

    #[cfg(target_arch = "aarch64")]
    #[test]
    fn neon_rgb8_with_tail() {
        let (color, count) = ([100u8, 150, 200], 35usize);
        let mut got = vec![0u8; count * 3];
        unsafe { blend_solid_rgb8_neon(&mut got, color, count) };
        assert_eq!(got, scalar_rgb(color, count), "NEON RGB tail mismatch");
    }

    #[cfg(target_arch = "aarch64")]
    #[test]
    fn neon_rgb8_small_count() {
        let (color, count) = ([7u8, 8, 9], 5usize);
        let mut got = vec![0u8; count * 3];
        unsafe { blend_solid_rgb8_neon(&mut got, color, count) };
        assert_eq!(
            got,
            scalar_rgb(color, count),
            "NEON RGB small count mismatch"
        );
    }

    #[cfg(target_arch = "aarch64")]
    #[test]
    fn neon_gray8_exact_32_pixels() {
        let count = 32usize;
        let mut got = vec![0u8; count];
        unsafe { blend_solid_gray8_neon(&mut got, 42, count) };
        assert_eq!(got, scalar_gray(42, count), "NEON gray 32-pixel mismatch");
    }

    #[cfg(target_arch = "aarch64")]
    #[test]
    fn neon_gray8_with_tail() {
        let count = 19usize;
        let mut got = vec![0u8; count];
        unsafe { blend_solid_gray8_neon(&mut got, 77, count) };
        assert_eq!(got, scalar_gray(77, count), "NEON gray tail mismatch");
    }

    // ---- public API edge cases ----

    #[test]
    fn public_rgb8_zero_count() {
        let mut dst = vec![0xFFu8; 6];
        blend_solid_rgb8(&mut dst, [1, 2, 3], 0);
        assert!(
            dst.iter().all(|&byte| byte == 0xFF),
            "zero-count must not write"
        );
    }

    #[test]
    fn public_gray8_zero_count() {
        let mut dst = vec![0xFFu8; 4];
        blend_solid_gray8(&mut dst, 42, 0);
        assert!(
            dst.iter().all(|&byte| byte == 0xFF),
            "zero-count must not write"
        );
    }

    #[test]
    #[should_panic(expected = "blend_solid_rgb8: dst too short")]
    fn rgb8_panics_on_short_dst() {
        let mut dst = vec![0u8; 5];
        blend_solid_rgb8(&mut dst, [1, 2, 3], 10);
    }

    #[test]
    #[should_panic(expected = "blend_solid_gray8: dst too short")]
    fn gray8_panics_on_short_dst() {
        let mut dst = vec![0u8; 5];
        blend_solid_gray8(&mut dst, 42, 10);
    }
}