colorutils_rs/
concat_alpha.rs1#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
9use crate::avx::*;
10#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
11use crate::sse::*;
12#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
13use std::arch::aarch64::*;
14#[cfg(target_arch = "x86")]
15#[allow(unused_imports)]
16use std::arch::x86::*;
17#[cfg(target_arch = "x86_64")]
18#[allow(unused_imports)]
19use std::arch::x86_64::*;
20
21pub fn append_alpha(
23 dst: &mut [f32],
24 dst_stride: u32,
25 src: &[f32],
26 src_stride: u32,
27 a_plane: &[f32],
28 a_stride: u32,
29 width: u32,
30 height: u32,
31) {
32 let mut dst_offset = 0usize;
33 let mut src_offset = 0usize;
34 let mut a_offset = 0usize;
35
36 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
37 let mut _use_sse = std::arch::is_x86_feature_detected!("sse4.1");
38
39 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
40 let mut _use_avx = std::arch::is_x86_feature_detected!("avx2");
41
42 for _ in 0..height {
43 let mut _cx = 0usize;
44
45 let src_ptr = unsafe { (src.as_ptr() as *const u8).add(src_offset) as *const f32 };
46 let a_ptr = unsafe { (a_plane.as_ptr() as *const u8).add(a_offset) as *const f32 };
47 let dst_ptr = unsafe { (dst.as_mut_ptr() as *mut u8).add(dst_offset) as *mut f32 };
48
49 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
50 unsafe {
51 if _use_avx {
52 concat_alpha_avx(width, _cx, src_ptr, a_ptr, dst_ptr);
53 }
54 }
55
56 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
57 unsafe {
58 if _use_sse {
59 concat_alpha_sse(width, _cx, src_ptr, a_ptr, dst_ptr);
60 }
61 }
62
63 #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
64 unsafe {
65 while _cx + 4 < width as usize {
66 let xyz_pixel = vld3q_f32(src_ptr.add(_cx * 3usize));
67 let a_pixel = vld1q_f32(a_ptr.add(_cx));
68 let dst_pixel = float32x4x4_t(xyz_pixel.0, xyz_pixel.1, xyz_pixel.2, a_pixel);
69 vst4q_f32(dst_ptr.add(_cx * 4), dst_pixel);
70 _cx += 4;
71 }
72 }
73
74 for x in _cx..width as usize {
75 unsafe {
76 let px = x * 4;
77 let s_x = x * 3;
78 let dst = dst_ptr.add(px);
79 let src = src_ptr.add(s_x);
80 dst.write_unaligned(src.read_unaligned());
81 dst.add(1).write_unaligned(src.add(1).read_unaligned());
82 dst.add(2).write_unaligned(src.add(2).read_unaligned());
83 dst.add(3).write_unaligned(a_ptr.add(x).read_unaligned());
84 }
85 }
86
87 dst_offset += dst_stride as usize;
88 a_offset += a_stride as usize;
89 src_offset += src_stride as usize;
90 }
91}
92
93#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
94#[target_feature(enable = "sse4.1")]
95#[inline]
96unsafe fn concat_alpha_sse(
97 width: u32,
98 mut _cx: usize,
99 src_ptr: *const f32,
100 a_ptr: *const f32,
101 dst_ptr: *mut f32,
102) {
103 while _cx + 4 < width as usize {
104 let xyz_chan_ptr = src_ptr.add(_cx * 3usize);
105 let a_chan_ptr = a_ptr.add(_cx);
106 let xyz0 = _mm_loadu_ps(xyz_chan_ptr);
107 let xyz1 = _mm_loadu_ps(xyz_chan_ptr.add(4));
108 let xyz2 = _mm_loadu_ps(xyz_chan_ptr.add(8));
109 let a_pixel = _mm_loadu_ps(a_chan_ptr);
110 let (x_p, y_p, z_p) = sse_deinterleave_rgb_ps(xyz0, xyz1, xyz2);
111 let (xyza0, xyza1, xyza2, xyza3) = sse_interleave_ps_rgba(x_p, y_p, z_p, a_pixel);
112 let xyza_chan_ptr = dst_ptr.add(_cx * 4usize);
113 _mm_storeu_ps(xyza_chan_ptr, xyza0);
114 _mm_storeu_ps(xyza_chan_ptr.add(4), xyza1);
115 _mm_storeu_ps(xyza_chan_ptr.add(8), xyza2);
116 _mm_storeu_ps(xyza_chan_ptr.add(12), xyza3);
117 _cx += 4;
118 }
119}
120
121#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
122#[target_feature(enable = "avx2")]
123#[inline]
124unsafe fn concat_alpha_avx(
125 width: u32,
126 mut _cx: usize,
127 src_ptr: *const f32,
128 a_ptr: *const f32,
129 dst_ptr: *mut f32,
130) {
131 while _cx + 8 < width as usize {
132 let xyz_chan_ptr = src_ptr.add(_cx * 3usize);
133 let a_chan_ptr = a_ptr.add(_cx);
134 let xyz0 = _mm256_loadu_ps(xyz_chan_ptr);
135 let xyz1 = _mm256_loadu_ps(xyz_chan_ptr.add(8));
136 let xyz2 = _mm256_loadu_ps(xyz_chan_ptr.add(16));
137 let a_pixel = _mm256_loadu_ps(a_chan_ptr);
138 let (x_p, y_p, z_p) = avx2_deinterleave_rgb_ps(xyz0, xyz1, xyz2);
139
140 let xyza_chan_ptr = dst_ptr.add(_cx * 4usize);
141
142 let (xyza0, xyza1, xyza2, xyza3) = avx2_interleave_rgba_ps(x_p, y_p, z_p, a_pixel);
143 _mm256_store_ps(xyza_chan_ptr, xyza0);
144 _mm256_store_ps(xyza_chan_ptr.add(8), xyza1);
145 _mm256_store_ps(xyza_chan_ptr.add(16), xyza2);
146 _mm256_store_ps(xyza_chan_ptr.add(32), xyza3);
147 _cx += 8;
148 }
149}