1#[inline]
32pub(super) fn blend_solid_rgb8_scalar(dst: &mut [u8], color: [u8; 3], count: usize) {
33 debug_assert!(
34 dst.len() >= count * 3,
35 "dst too short: {} < {}",
36 dst.len(),
37 count * 3
38 );
39 for chunk in dst[..count * 3].chunks_exact_mut(3) {
40 chunk.copy_from_slice(&color);
41 }
42}
43
44#[inline]
46pub(super) fn blend_solid_gray8_scalar(dst: &mut [u8], color: u8, count: usize) {
47 debug_assert!(
48 dst.len() >= count,
49 "dst too short: {} < {}",
50 dst.len(),
51 count
52 );
53 dst[..count].fill(color);
54}
55
/// AVX2 solid fill for RGB8: writes `count` pixels of `color` into `dst`.
///
/// A 96-byte pattern (32 whole pixels) is loaded into three 256-bit registers once and
/// then stored repeatedly. Pixels that do not fill a complete 32-pixel block are handed
/// to [`blend_solid_rgb8_scalar`].
///
/// # Safety
///
/// * The executing CPU must support AVX2.
/// * `dst` must hold at least `count * 3` bytes; the vector stores go through a raw
///   pointer and are only length-checked by a debug assertion.
#[cfg(all(target_arch = "x86_64", feature = "simd-avx2"))]
#[target_feature(enable = "avx2")]
unsafe fn blend_solid_rgb8_avx2(dst: &mut [u8], color: [u8; 3], count: usize) {
    use std::arch::x86_64::{__m256i, _mm256_loadu_si256, _mm256_storeu_si256};

    let byte_len = count * 3;
    debug_assert!(
        dst.len() >= byte_len,
        "dst too short for AVX2 RGB fill: {} < {}",
        dst.len(),
        byte_len
    );

    // 96 bytes = 32 pixels = three 32-byte vectors, so the pattern repeats seamlessly.
    let mut pattern = [0u8; 96];
    for pixel in pattern.chunks_exact_mut(3) {
        pixel.copy_from_slice(&color);
    }
    let pattern_ptr = pattern.as_ptr();

    // SAFETY: `pattern` is 96 bytes long, so the unaligned 32-byte loads at offsets
    // 0, 32 and 64 all stay inside it.
    let (lo, mid, hi): (__m256i, __m256i, __m256i) = unsafe {
        (
            _mm256_loadu_si256(pattern_ptr.cast()),
            _mm256_loadu_si256(pattern_ptr.add(32).cast()),
            _mm256_loadu_si256(pattern_ptr.add(64).cast()),
        )
    };

    // Number of pixels covered by whole 32-pixel blocks.
    let vector_px = count - count % 32;
    let vector_bytes = vector_px * 3;

    let out = dst.as_mut_ptr();
    let mut byte_off = 0usize;
    while byte_off < vector_bytes {
        // SAFETY: `byte_off + 96 <= vector_bytes <= count * 3`, which the caller
        // guarantees is within `dst`; the stores are unaligned-tolerant.
        unsafe {
            let block = out.add(byte_off);
            _mm256_storeu_si256(block.cast(), lo);
            _mm256_storeu_si256(block.add(32).cast(), mid);
            _mm256_storeu_si256(block.add(64).cast(), hi);
        }
        byte_off += 96;
    }

    // Fewer than 32 pixels remain; finish them with the scalar kernel.
    blend_solid_rgb8_scalar(&mut dst[vector_bytes..], color, count - vector_px);
}
119
/// AVX2 solid fill for 8-bit grayscale: writes `count` bytes of `color` into `dst`.
///
/// The colour is broadcast into one 256-bit register and stored 32 bytes at a time;
/// the remaining (fewer than 32) bytes are filled with a slice `fill`.
///
/// # Safety
///
/// * The executing CPU must support AVX2.
/// * `dst` must hold at least `count` bytes; the vector stores go through a raw pointer
///   and are only length-checked by a debug assertion.
#[cfg(all(target_arch = "x86_64", feature = "simd-avx2"))]
#[target_feature(enable = "avx2")]
unsafe fn blend_solid_gray8_avx2(dst: &mut [u8], color: u8, count: usize) {
    use std::arch::x86_64::{_mm256_set1_epi8, _mm256_storeu_si256};

    debug_assert!(
        count <= dst.len(),
        "dst too short for AVX2 gray fill: {} < {}",
        dst.len(),
        count
    );

    #[expect(
        clippy::cast_possible_wrap,
        reason = "reinterpreting byte as i8 for SIMD; bit pattern preserved"
    )]
    let splat = _mm256_set1_epi8(color as i8);

    // Number of bytes covered by whole 32-byte vectors.
    let vector_bytes = count - count % 32;

    let out = dst.as_mut_ptr();
    let mut offset = 0usize;
    while offset < vector_bytes {
        // SAFETY: `offset + 32 <= vector_bytes <= count`, which the caller guarantees
        // is within `dst`; the store is unaligned-tolerant.
        unsafe { _mm256_storeu_si256(out.add(offset).cast(), splat) };
        offset += 32;
    }

    dst[vector_bytes..count].fill(color);
}
154
/// Pixel-count cutoff for the MOVDIR64B fill paths.
///
/// The `x86_64` dispatchers only consider the MOVDIR64B kernels when the requested
/// pixel count is strictly greater than this value (and the CPU reports the
/// instruction); smaller fills use the other kernels.
#[cfg(target_arch = "x86_64")]
const MOVDIR64B_THRESHOLD_PX: usize = 256;
168
169#[cfg(target_arch = "x86_64")]
170fn has_movdir64b() -> bool {
176 use std::sync::OnceLock;
177 static CACHE: OnceLock<bool> = OnceLock::new();
178 *CACHE.get_or_init(|| {
179 let result = std::arch::x86_64::__cpuid_count(7, 0);
182 (result.ecx >> 28) & 1 != 0
184 })
185}
186
/// Returns how many leading bytes must be handled before `ptr` reaches an
/// `align`-byte boundary, capped at `limit`.
///
/// `align_offset` reports "alignment cannot be reached" as `usize::MAX`; clamping with
/// `min` maps that sentinel to `limit` as well, so the whole range is then treated as
/// preamble.
#[cfg(target_arch = "x86_64")]
#[inline]
fn preamble_len(ptr: *const u8, limit: usize, align: usize) -> usize {
    ptr.align_offset(align).min(limit)
}
203
204#[cfg(target_arch = "x86_64")]
205unsafe fn blend_solid_rgb8_movdir64b(dst: &mut [u8], color: [u8; 3], count: usize) {
220 #[repr(align(64))]
223 struct Tile([u8; 192]);
224
225 let byte_count = count * 3;
228 debug_assert!(
229 dst.len() >= byte_count,
230 "dst too short for movdir64b RGB fill: {} < {}",
231 dst.len(),
232 byte_count,
233 );
234 let dst_ptr = dst.as_mut_ptr();
235
236 let preamble = preamble_len(dst_ptr.cast_const(), byte_count, 64);
241 for i in 0..preamble {
242 dst[i] = color[i % 3];
243 }
244
245 let phase = preamble % 3;
250 let mut tile = Tile([0u8; 192]);
251 for (k, t) in tile.0.iter_mut().enumerate() {
252 *t = color[(phase + k) % 3];
253 }
254
255 let blocks_start = preamble;
256 debug_assert!(
257 blocks_start <= byte_count,
258 "preamble_len exceeded byte_count"
259 );
260 let remaining = byte_count - blocks_start;
261 let blocks = remaining / 192;
262
263 for blk in 0..blocks {
264 unsafe {
274 let dst_base = dst_ptr.add(blocks_start + blk * 192);
275 let src0 = tile.0.as_ptr();
276 let src1 = src0.add(64);
277 let src2 = src0.add(128);
278 std::arch::asm!(
279 "movdir64b {d0}, [{s0}]",
280 "movdir64b {d1}, [{s1}]",
281 "movdir64b {d2}, [{s2}]",
282 d0 = in(reg) dst_base,
283 d1 = in(reg) dst_base.add(64),
284 d2 = in(reg) dst_base.add(128),
285 s0 = in(reg) src0,
286 s1 = in(reg) src1,
287 s2 = in(reg) src2,
288 options(nostack, preserves_flags),
289 );
290 }
291 }
292
293 let tail_start = blocks_start + blocks * 192;
298 for off in tail_start..byte_count {
299 dst[off] = color[(phase + (off - blocks_start)) % 3];
300 }
301}
302
303#[cfg(target_arch = "x86_64")]
304unsafe fn blend_solid_gray8_movdir64b(dst: &mut [u8], color: u8, count: usize) {
314 #[repr(align(64))]
315 struct Tile([u8; 64]);
316
317 debug_assert!(
318 dst.len() >= count,
319 "dst too short for movdir64b gray fill: {} < {}",
320 dst.len(),
321 count,
322 );
323
324 let tile = Tile([color; 64]);
325 let dst_ptr = dst.as_mut_ptr();
326
327 let preamble = preamble_len(dst_ptr.cast_const(), count, 64);
329 dst[..preamble].fill(color);
330
331 debug_assert!(preamble <= count, "preamble_len exceeded count");
333 let blocks = (count - preamble) / 64;
334 for blk in 0..blocks {
335 unsafe {
342 let dst_blk = dst_ptr.add(preamble + blk * 64);
343 let src = tile.0.as_ptr();
344 std::arch::asm!(
345 "movdir64b {dst}, [{src}]",
346 dst = in(reg) dst_blk,
347 src = in(reg) src,
348 options(nostack, preserves_flags),
349 );
350 }
351 }
352
353 let tail_start = preamble + blocks * 64;
355 dst[tail_start..count].fill(color);
356}
357
/// NEON solid fill for RGB8: writes `count` pixels of `color` into `dst`.
///
/// Each channel is broadcast into its own 16-lane register and the three registers are
/// written with an interleaving store, producing 16 pixels (48 bytes) per iteration.
/// Pixels that do not fill a complete group of 16 go to [`blend_solid_rgb8_scalar`].
///
/// # Safety
///
/// * The executing CPU must support NEON.
/// * `dst` must hold at least `count * 3` bytes; the vector stores go through a raw
///   pointer and are only length-checked by a debug assertion.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn blend_solid_rgb8_neon(dst: &mut [u8], color: [u8; 3], count: usize) {
    use std::arch::aarch64::{uint8x16x3_t, vdupq_n_u8, vst3q_u8};

    let byte_len = count * 3;
    debug_assert!(
        dst.len() >= byte_len,
        "dst too short for NEON RGB fill: {} < {}",
        dst.len(),
        byte_len
    );

    // One register per channel; `vst3q_u8` interleaves them back into R,G,B order.
    let planes = uint8x16x3_t(
        vdupq_n_u8(color[0]),
        vdupq_n_u8(color[1]),
        vdupq_n_u8(color[2]),
    );

    let groups = count / 16;
    for group in 0..groups {
        let first_px = group * 16;
        // SAFETY: `first_px + 16 <= count`, so the 48 bytes starting at `first_px * 3`
        // end at or before `count * 3`, which the caller guarantees is within `dst`.
        unsafe { vst3q_u8(dst.as_mut_ptr().add(first_px * 3), planes) };
    }

    // Fewer than 16 pixels remain; finish them with the scalar kernel.
    let vector_px = groups * 16;
    blend_solid_rgb8_scalar(&mut dst[vector_px * 3..], color, count - vector_px);
}
397
/// NEON solid fill for 8-bit grayscale: writes `count` bytes of `color` into `dst`.
///
/// The colour is broadcast into one 16-lane register and stored 16 bytes at a time;
/// the remaining (fewer than 16) bytes go to [`blend_solid_gray8_scalar`].
///
/// # Safety
///
/// * The executing CPU must support NEON.
/// * `dst` must hold at least `count` bytes; the vector stores go through a raw pointer
///   and are only length-checked by a debug assertion.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn blend_solid_gray8_neon(dst: &mut [u8], color: u8, count: usize) {
    use std::arch::aarch64::{vdupq_n_u8, vst1q_u8};

    debug_assert!(
        count <= dst.len(),
        "dst too short for NEON gray fill: {} < {}",
        dst.len(),
        count
    );

    let splat = vdupq_n_u8(color);

    let groups = count / 16;
    for group in 0..groups {
        let offset = group * 16;
        // SAFETY: `offset + 16 <= count`, which the caller guarantees is within `dst`.
        unsafe { vst1q_u8(dst.as_mut_ptr().add(offset), splat) };
    }

    // Fewer than 16 bytes remain; finish them with the scalar kernel.
    let vector_px = groups * 16;
    blend_solid_gray8_scalar(&mut dst[vector_px..], color, count - vector_px);
}
426
427#[cfg(target_arch = "x86_64")]
430#[inline]
431fn dispatch_blend_rgb8(dst: &mut [u8], color: [u8; 3], count: usize) {
432 if count > MOVDIR64B_THRESHOLD_PX && has_movdir64b() {
433 unsafe { blend_solid_rgb8_movdir64b(dst, color, count) };
435 return;
436 }
437 #[cfg(feature = "simd-avx2")]
438 if count >= 32 && is_x86_feature_detected!("avx2") {
439 unsafe { blend_solid_rgb8_avx2(dst, color, count) };
441 return;
442 }
443 blend_solid_rgb8_scalar(dst, color, count);
444}
445
/// Picks the RGB8 fill kernel on aarch64: NEON for 16 or more pixels, scalar below.
///
/// No runtime feature detection is performed; the NEON kernel is called
/// unconditionally once the pixel count is large enough.
#[cfg(target_arch = "aarch64")]
#[inline]
fn dispatch_blend_rgb8(dst: &mut [u8], color: [u8; 3], count: usize) {
    if count < 16 {
        blend_solid_rgb8_scalar(dst, color, count);
        return;
    }
    // SAFETY: NOTE(review): relies on NEON being available on every aarch64 target this
    // crate supports, and on the caller having validated `dst.len() >= count * 3` (the
    // public `blend_solid_rgb8` wrapper asserts the latter).
    unsafe { blend_solid_rgb8_neon(dst, color, count) };
}
456
/// RGB8 dispatch for targets other than `x86_64` and aarch64: no SIMD kernel is
/// selected here, so the call goes straight to the scalar kernel.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
#[inline]
fn dispatch_blend_rgb8(dst: &mut [u8], color: [u8; 3], count: usize) {
    blend_solid_rgb8_scalar(dst, color, count);
}
462
463#[cfg(target_arch = "x86_64")]
464#[inline]
465fn dispatch_blend_gray8(dst: &mut [u8], color: u8, count: usize) {
466 if count > MOVDIR64B_THRESHOLD_PX && has_movdir64b() {
467 unsafe { blend_solid_gray8_movdir64b(dst, color, count) };
469 return;
470 }
471 #[cfg(feature = "simd-avx2")]
472 if count >= 32 && is_x86_feature_detected!("avx2") {
473 unsafe { blend_solid_gray8_avx2(dst, color, count) };
475 return;
476 }
477 blend_solid_gray8_scalar(dst, color, count);
478}
479
/// Picks the gray8 fill kernel on aarch64: NEON for 16 or more pixels, scalar below.
///
/// No runtime feature detection is performed; the NEON kernel is called
/// unconditionally once the pixel count is large enough.
#[cfg(target_arch = "aarch64")]
#[inline]
fn dispatch_blend_gray8(dst: &mut [u8], color: u8, count: usize) {
    if count < 16 {
        blend_solid_gray8_scalar(dst, color, count);
        return;
    }
    // SAFETY: NOTE(review): relies on NEON being available on every aarch64 target this
    // crate supports, and on the caller having validated `dst.len() >= count` (the
    // public `blend_solid_gray8` wrapper asserts the latter).
    unsafe { blend_solid_gray8_neon(dst, color, count) };
}
490
/// Gray8 dispatch for targets other than `x86_64` and aarch64: no SIMD kernel is
/// selected here, so the call goes straight to the scalar kernel.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
#[inline]
fn dispatch_blend_gray8(dst: &mut [u8], color: u8, count: usize) {
    blend_solid_gray8_scalar(dst, color, count);
}
496
497pub fn blend_solid_rgb8(dst: &mut [u8], color: [u8; 3], count: usize) {
511 assert!(
512 dst.len() >= count * 3,
513 "blend_solid_rgb8: dst too short ({} < {})",
514 dst.len(),
515 count * 3,
516 );
517 dispatch_blend_rgb8(dst, color, count);
518}
519
520pub fn blend_solid_gray8(dst: &mut [u8], color: u8, count: usize) {
532 assert!(
533 dst.len() >= count,
534 "blend_solid_gray8: dst too short ({} < {})",
535 dst.len(),
536 count,
537 );
538 dispatch_blend_gray8(dst, color, count);
539}
540
#[cfg(test)]
mod tests {
    use super::*;

    /// Reference output: `count` RGB pixels written by the scalar kernel into a
    /// zero-initialised buffer of exactly `count * 3` bytes.
    fn scalar_rgb(color: [u8; 3], count: usize) -> Vec<u8> {
        let mut reference = vec![0u8; count * 3];
        blend_solid_rgb8_scalar(&mut reference, color, count);
        reference
    }

    /// Reference output: `count` gray pixels written by the scalar kernel into a
    /// zero-initialised buffer of exactly `count` bytes.
    fn scalar_gray(color: u8, count: usize) -> Vec<u8> {
        let mut reference = vec![0u8; count];
        blend_solid_gray8_scalar(&mut reference, color, count);
        reference
    }

    /// Runs the public RGB entry point and compares it with the scalar reference.
    fn check_public_rgb(color: [u8; 3], count: usize, context: &str) {
        let mut got = vec![0u8; count * 3];
        blend_solid_rgb8(&mut got, color, count);
        assert_eq!(got, scalar_rgb(color, count), "{context}");
    }

    /// Runs the public gray entry point and compares it with the scalar reference.
    fn check_public_gray(color: u8, count: usize, context: &str) {
        let mut got = vec![0u8; count];
        blend_solid_gray8(&mut got, color, count);
        assert_eq!(got, scalar_gray(color, count), "{context}");
    }

    // ---- scalar kernels -------------------------------------------------------------

    #[test]
    fn scalar_rgb8_small() {
        let mut dst = vec![0u8; 9];
        blend_solid_rgb8_scalar(&mut dst, [10, 20, 30], 3);
        assert_eq!(dst, [10u8, 20, 30].repeat(3));
    }

    #[test]
    fn scalar_rgb8_zero_count() {
        let mut dst = vec![0u8; 3];
        blend_solid_rgb8_scalar(&mut dst, [1, 2, 3], 0);
        assert_eq!(dst, vec![0u8; 3]);
    }

    #[test]
    fn scalar_gray8() {
        let filled = scalar_gray(42, 5);
        assert!(filled.iter().all(|&byte| byte == 42));
    }

    // ---- public dispatch versus scalar ----------------------------------------------

    #[test]
    fn dispatch_rgb8_matches_scalar() {
        check_public_rgb([100, 150, 200], 64, "dispatch_rgb8 mismatch");
    }

    #[test]
    fn dispatch_gray8_matches_scalar() {
        check_public_gray(77, 128, "dispatch_gray8 mismatch");
    }

    #[test]
    fn dispatch_rgb8_tail_handled() {
        // 35 pixels: one full 32-pixel block plus a 3-pixel tail.
        check_public_rgb([7, 8, 9], 35, "tail mismatch");
    }

    #[test]
    fn dispatch_rgb8_exact_32_pixels() {
        check_public_rgb([255, 0, 128], 32, "exact 32-pixel mismatch");
    }

    // ---- AVX2 kernels called directly -----------------------------------------------

    #[cfg(all(target_arch = "x86_64", feature = "simd-avx2"))]
    #[test]
    fn avx2_rgb8_matches_scalar_direct() {
        if !is_x86_feature_detected!("avx2") {
            return;
        }
        let (color, count) = ([11u8, 22, 33], 96usize);
        let mut got = vec![0u8; count * 3];
        // SAFETY: AVX2 was detected above and `got` holds exactly `count * 3` bytes.
        unsafe { blend_solid_rgb8_avx2(&mut got, color, count) };
        assert_eq!(got, scalar_rgb(color, count), "AVX2 RGB path mismatch");
    }

    #[cfg(all(target_arch = "x86_64", feature = "simd-avx2"))]
    #[test]
    fn avx2_gray8_matches_scalar_direct() {
        if !is_x86_feature_detected!("avx2") {
            return;
        }
        let count = 128usize;
        let mut got = vec![0u8; count];
        // SAFETY: AVX2 was detected above and `got` holds exactly `count` bytes.
        unsafe { blend_solid_gray8_avx2(&mut got, 200, count) };
        assert_eq!(got, scalar_gray(200, count), "AVX2 gray path mismatch");
    }

    // ---- large fills through the public entry points (x86_64) -----------------------

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn dispatch_rgb8_large_matches_scalar() {
        check_public_rgb([77, 133, 211], 384, "large RGB dispatch mismatch");
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn dispatch_gray8_large_matches_scalar() {
        check_public_gray(99, 512, "large gray dispatch mismatch");
    }

    // ---- MOVDIR64B kernels called directly ------------------------------------------

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn movdir64b_rgb8_matches_scalar() {
        if !has_movdir64b() {
            return;
        }
        let (color, count) = ([11u8, 22, 33], 512usize);
        let mut got = vec![0u8; count * 3];
        // SAFETY: MOVDIR64B was detected above and `got` holds exactly `count * 3` bytes.
        unsafe { blend_solid_rgb8_movdir64b(&mut got, color, count) };
        assert_eq!(got, scalar_rgb(color, count), "movdir64b RGB mismatch");
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn movdir64b_gray8_matches_scalar() {
        if !has_movdir64b() {
            return;
        }
        let count = 512usize;
        let mut got = vec![0u8; count];
        // SAFETY: MOVDIR64B was detected above and `got` holds exactly `count` bytes.
        unsafe { blend_solid_gray8_movdir64b(&mut got, 200, count) };
        assert_eq!(got, scalar_gray(200, count), "movdir64b gray mismatch");
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn movdir64b_rgb8_odd_count() {
        if !has_movdir64b() {
            return;
        }
        let (color, count) = ([3u8, 7, 11], 257usize);
        let mut got = vec![0u8; count * 3];
        // SAFETY: MOVDIR64B was detected above and `got` holds exactly `count * 3` bytes.
        unsafe { blend_solid_rgb8_movdir64b(&mut got, color, count) };
        assert_eq!(
            got,
            scalar_rgb(color, count),
            "movdir64b RGB odd-count mismatch"
        );
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn movdir64b_gray8_odd_count() {
        if !has_movdir64b() {
            return;
        }
        let count = 259usize;
        let mut got = vec![0u8; count];
        // SAFETY: MOVDIR64B was detected above and `got` holds exactly `count` bytes.
        unsafe { blend_solid_gray8_movdir64b(&mut got, 17, count) };
        assert_eq!(
            got,
            scalar_gray(17, count),
            "movdir64b gray odd-count mismatch"
        );
    }

    // ---- NEON kernels called directly -----------------------------------------------

    #[cfg(target_arch = "aarch64")]
    #[test]
    fn neon_rgb8_exact_16_pixels() {
        let (color, count) = ([11u8, 22, 33], 16usize);
        let mut got = vec![0u8; count * 3];
        // SAFETY: `got` holds exactly `count * 3` bytes; the test runs on aarch64.
        unsafe { blend_solid_rgb8_neon(&mut got, color, count) };
        assert_eq!(got, scalar_rgb(color, count), "NEON RGB 16-pixel mismatch");
    }

    #[cfg(target_arch = "aarch64")]
    #[test]
    fn neon_rgb8_with_tail() {
        let (color, count) = ([100u8, 150, 200], 35usize);
        let mut got = vec![0u8; count * 3];
        // SAFETY: `got` holds exactly `count * 3` bytes; the test runs on aarch64.
        unsafe { blend_solid_rgb8_neon(&mut got, color, count) };
        assert_eq!(got, scalar_rgb(color, count), "NEON RGB tail mismatch");
    }

    #[cfg(target_arch = "aarch64")]
    #[test]
    fn neon_rgb8_small_count() {
        let (color, count) = ([7u8, 8, 9], 5usize);
        let mut got = vec![0u8; count * 3];
        // SAFETY: `got` holds exactly `count * 3` bytes; the test runs on aarch64.
        unsafe { blend_solid_rgb8_neon(&mut got, color, count) };
        assert_eq!(
            got,
            scalar_rgb(color, count),
            "NEON RGB small count mismatch"
        );
    }

    #[cfg(target_arch = "aarch64")]
    #[test]
    fn neon_gray8_exact_32_pixels() {
        let count = 32usize;
        let mut got = vec![0u8; count];
        // SAFETY: `got` holds exactly `count` bytes; the test runs on aarch64.
        unsafe { blend_solid_gray8_neon(&mut got, 42, count) };
        assert_eq!(got, scalar_gray(42, count), "NEON gray 32-pixel mismatch");
    }

    #[cfg(target_arch = "aarch64")]
    #[test]
    fn neon_gray8_with_tail() {
        let count = 19usize;
        let mut got = vec![0u8; count];
        // SAFETY: `got` holds exactly `count` bytes; the test runs on aarch64.
        unsafe { blend_solid_gray8_neon(&mut got, 77, count) };
        assert_eq!(got, scalar_gray(77, count), "NEON gray tail mismatch");
    }

    // ---- public API edge cases ------------------------------------------------------

    #[test]
    fn public_rgb8_zero_count() {
        let mut dst = vec![0xFFu8; 6];
        blend_solid_rgb8(&mut dst, [1, 2, 3], 0);
        assert!(
            dst.iter().all(|&byte| byte == 0xFF),
            "zero-count must not write"
        );
    }

    #[test]
    fn public_gray8_zero_count() {
        let mut dst = vec![0xFFu8; 4];
        blend_solid_gray8(&mut dst, 42, 0);
        assert!(
            dst.iter().all(|&byte| byte == 0xFF),
            "zero-count must not write"
        );
    }

    #[test]
    #[should_panic(expected = "blend_solid_rgb8: dst too short")]
    fn rgb8_panics_on_short_dst() {
        let mut too_small = vec![0u8; 5];
        blend_solid_rgb8(&mut too_small, [1, 2, 3], 10);
    }

    #[test]
    #[should_panic(expected = "blend_solid_gray8: dst too short")]
    fn gray8_panics_on_short_dst() {
        let mut too_small = vec![0u8; 5];
        blend_solid_gray8(&mut too_small, 42, 10);
    }
}