// colorutils_rs/concat_alpha.rs

1/*
2 * // Copyright 2024 (c) the Radzivon Bartoshyk. All rights reserved.
3 * //
4 * // Use of this source code is governed by a BSD-style
5 * // license that can be found in the LICENSE file.
6 */
7
8#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
9use crate::avx::*;
10#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
11use crate::sse::*;
12#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
13use std::arch::aarch64::*;
14#[cfg(target_arch = "x86")]
15#[allow(unused_imports)]
16use std::arch::x86::*;
17#[cfg(target_arch = "x86_64")]
18#[allow(unused_imports)]
19use std::arch::x86_64::*;
20
21/// Adds alpha plane into an existing RGB/XYZ/LAB or other 3 plane image. Image will become RGBA, XYZa, LABa etc.
22pub fn append_alpha(
23    dst: &mut [f32],
24    dst_stride: u32,
25    src: &[f32],
26    src_stride: u32,
27    a_plane: &[f32],
28    a_stride: u32,
29    width: u32,
30    height: u32,
31) {
32    let mut dst_offset = 0usize;
33    let mut src_offset = 0usize;
34    let mut a_offset = 0usize;
35
36    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
37    let mut _use_sse = std::arch::is_x86_feature_detected!("sse4.1");
38
39    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
40    let mut _use_avx = std::arch::is_x86_feature_detected!("avx2");
41
42    for _ in 0..height {
43        let mut _cx = 0usize;
44
45        let src_ptr = unsafe { (src.as_ptr() as *const u8).add(src_offset) as *const f32 };
46        let a_ptr = unsafe { (a_plane.as_ptr() as *const u8).add(a_offset) as *const f32 };
47        let dst_ptr = unsafe { (dst.as_mut_ptr() as *mut u8).add(dst_offset) as *mut f32 };
48
49        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
50        unsafe {
51            if _use_avx {
52                concat_alpha_avx(width, _cx, src_ptr, a_ptr, dst_ptr);
53            }
54        }
55
56        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
57        unsafe {
58            if _use_sse {
59                concat_alpha_sse(width, _cx, src_ptr, a_ptr, dst_ptr);
60            }
61        }
62
63        #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
64        unsafe {
65            while _cx + 4 < width as usize {
66                let xyz_pixel = vld3q_f32(src_ptr.add(_cx * 3usize));
67                let a_pixel = vld1q_f32(a_ptr.add(_cx));
68                let dst_pixel = float32x4x4_t(xyz_pixel.0, xyz_pixel.1, xyz_pixel.2, a_pixel);
69                vst4q_f32(dst_ptr.add(_cx * 4), dst_pixel);
70                _cx += 4;
71            }
72        }
73
74        for x in _cx..width as usize {
75            unsafe {
76                let px = x * 4;
77                let s_x = x * 3;
78                let dst = dst_ptr.add(px);
79                let src = src_ptr.add(s_x);
80                dst.write_unaligned(src.read_unaligned());
81                dst.add(1).write_unaligned(src.add(1).read_unaligned());
82                dst.add(2).write_unaligned(src.add(2).read_unaligned());
83                dst.add(3).write_unaligned(a_ptr.add(x).read_unaligned());
84            }
85        }
86
87        dst_offset += dst_stride as usize;
88        a_offset += a_stride as usize;
89        src_offset += src_stride as usize;
90    }
91}
92
93#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
94#[target_feature(enable = "sse4.1")]
95#[inline]
96unsafe fn concat_alpha_sse(
97    width: u32,
98    mut _cx: usize,
99    src_ptr: *const f32,
100    a_ptr: *const f32,
101    dst_ptr: *mut f32,
102) {
103    while _cx + 4 < width as usize {
104        let xyz_chan_ptr = src_ptr.add(_cx * 3usize);
105        let a_chan_ptr = a_ptr.add(_cx);
106        let xyz0 = _mm_loadu_ps(xyz_chan_ptr);
107        let xyz1 = _mm_loadu_ps(xyz_chan_ptr.add(4));
108        let xyz2 = _mm_loadu_ps(xyz_chan_ptr.add(8));
109        let a_pixel = _mm_loadu_ps(a_chan_ptr);
110        let (x_p, y_p, z_p) = sse_deinterleave_rgb_ps(xyz0, xyz1, xyz2);
111        let (xyza0, xyza1, xyza2, xyza3) = sse_interleave_ps_rgba(x_p, y_p, z_p, a_pixel);
112        let xyza_chan_ptr = dst_ptr.add(_cx * 4usize);
113        _mm_storeu_ps(xyza_chan_ptr, xyza0);
114        _mm_storeu_ps(xyza_chan_ptr.add(4), xyza1);
115        _mm_storeu_ps(xyza_chan_ptr.add(8), xyza2);
116        _mm_storeu_ps(xyza_chan_ptr.add(12), xyza3);
117        _cx += 4;
118    }
119}
120
121#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
122#[target_feature(enable = "avx2")]
123#[inline]
124unsafe fn concat_alpha_avx(
125    width: u32,
126    mut _cx: usize,
127    src_ptr: *const f32,
128    a_ptr: *const f32,
129    dst_ptr: *mut f32,
130) {
131    while _cx + 8 < width as usize {
132        let xyz_chan_ptr = src_ptr.add(_cx * 3usize);
133        let a_chan_ptr = a_ptr.add(_cx);
134        let xyz0 = _mm256_loadu_ps(xyz_chan_ptr);
135        let xyz1 = _mm256_loadu_ps(xyz_chan_ptr.add(8));
136        let xyz2 = _mm256_loadu_ps(xyz_chan_ptr.add(16));
137        let a_pixel = _mm256_loadu_ps(a_chan_ptr);
138        let (x_p, y_p, z_p) = avx2_deinterleave_rgb_ps(xyz0, xyz1, xyz2);
139
140        let xyza_chan_ptr = dst_ptr.add(_cx * 4usize);
141
142        let (xyza0, xyza1, xyza2, xyza3) = avx2_interleave_rgba_ps(x_p, y_p, z_p, a_pixel);
143        _mm256_store_ps(xyza_chan_ptr, xyza0);
144        _mm256_store_ps(xyza_chan_ptr.add(8), xyza1);
145        _mm256_store_ps(xyza_chan_ptr.add(16), xyza2);
146        _mm256_store_ps(xyza_chan_ptr.add(32), xyza3);
147        _cx += 8;
148    }
149}