// qcms/transform_avx.rs — AVX-accelerated LUT + matrix pixel transforms.

use crate::transform::{qcms_transform, Format, BGRA, CLAMPMAXVAL, FLOATSCALE, RGB, RGBA};
#[cfg(target_arch = "x86")]
pub use std::arch::x86::{
    __m128, __m128i, __m256, __m256i, _mm256_add_ps, _mm256_broadcast_ps, _mm256_castps128_ps256,
    _mm256_castps256_ps128, _mm256_cvtps_epi32, _mm256_insertf128_ps, _mm256_max_ps, _mm256_min_ps,
    _mm256_mul_ps, _mm256_set1_ps, _mm256_setzero_ps, _mm256_store_si256, _mm_add_ps,
    _mm_broadcast_ss, _mm_cvtps_epi32, _mm_max_ps, _mm_min_ps, _mm_mul_ps, _mm_store_si128,
};
#[cfg(target_arch = "x86_64")]
pub use std::arch::x86_64::{
    __m128, __m128i, __m256, __m256i, _mm256_add_ps, _mm256_broadcast_ps, _mm256_castps128_ps256,
    _mm256_castps256_ps128, _mm256_cvtps_epi32, _mm256_insertf128_ps, _mm256_max_ps, _mm256_min_ps,
    _mm256_mul_ps, _mm256_set1_ps, _mm256_setzero_ps, _mm256_store_si256, _mm_add_ps,
    _mm_broadcast_ss, _mm_cvtps_epi32, _mm_max_ps, _mm_min_ps, _mm_mul_ps, _mm_store_si128,
};
16
/* Eight-u32 scratch buffer for the computed output-table indices. The
 * 32-byte alignment is required by the aligned `_mm256_store_si256`
 * store in the transform loop below. */
#[repr(align(32))]
struct Output([u32; 8]);
19
/* Shared AVX transform kernel: for each of `length` pixels, looks up the
 * per-channel input gamma tables, multiplies by the transform's color
 * matrix, clamps to [0, CLAMPMAXVAL], scales by FLOATSCALE, and uses the
 * resulting integer indices to look up the per-channel output tables.
 * Two pixels are processed at a time, one per 128-bit lane of a 256-bit
 * register. `F` selects the channel byte layout (RGB/RGBA/BGRA);
 * F::kAIndex == 0xff means "no alpha channel" and pixels are 3 bytes wide,
 * otherwise 4. Alpha bytes, when present, are copied through unchanged.
 *
 * Safety: the caller must guarantee AVX is available, `src` points to at
 * least `length * components` readable bytes, `dest` to the same number of
 * writable bytes, and that the transform's input gamma tables and output
 * tables are populated (they are unwrap()ed below). */
#[target_feature(enable = "avx")]
unsafe extern "C" fn qcms_transform_data_template_lut_avx<F: Format>(
    transform: &qcms_transform,
    mut src: *const u8,
    mut dest: *mut u8,
    mut length: usize,
) {
    let mat: *const [f32; 4] = (*transform).matrix.as_ptr();
    /* zeroed() is sound here: Output is a plain [u32; 8]. */
    let mut input: Output = std::mem::zeroed();
    /* share input and output locations to save having to keep the
     * locations in separate registers */
    let output: *const u32 = &mut input as *mut Output as *mut u32;
    /* deref *transform now to avoid it in loop */
    let igtbl_r: *const f32 = (*transform).input_gamma_table_r.as_ref().unwrap().as_ptr();
    let igtbl_g: *const f32 = (*transform).input_gamma_table_g.as_ref().unwrap().as_ptr();
    let igtbl_b: *const f32 = (*transform).input_gamma_table_b.as_ref().unwrap().as_ptr();
    /* deref *transform now to avoid it in loop */
    let otdata_r: *const u8 = (*transform)
        .output_table_r
        .as_deref()
        .unwrap()
        .data
        .as_ptr();
    let otdata_g: *const u8 = (*transform)
        .output_table_g
        .as_deref()
        .unwrap()
        .data
        .as_ptr();
    let otdata_b: *const u8 = (*transform)
        .output_table_b
        .as_deref()
        .unwrap()
        .data
        .as_ptr();
    /* input matrix values never change: broadcast each 4-float matrix row
     * into both 128-bit lanes so each lane transforms one pixel. */
    let mat0: __m256 = _mm256_broadcast_ps(&*((*mat.offset(0isize)).as_ptr() as *const __m128));
    let mat1: __m256 = _mm256_broadcast_ps(&*((*mat.offset(1isize)).as_ptr() as *const __m128));
    let mat2: __m256 = _mm256_broadcast_ps(&*((*mat.offset(2isize)).as_ptr() as *const __m128));
    /* these values don't change, either */
    let max: __m256 = _mm256_set1_ps(CLAMPMAXVAL);
    let min: __m256 = _mm256_setzero_ps();
    let scale: __m256 = _mm256_set1_ps(FLOATSCALE);
    /* bytes per pixel: 3 when there is no alpha channel, else 4 */
    let components: u32 = if F::kAIndex == 0xff { 3 } else { 4 } as u32;
    /* working variables */
    let mut vec_r: __m256 = _mm256_setzero_ps();
    let mut vec_g: __m256 = _mm256_setzero_ps();
    let mut vec_b: __m256 = _mm256_setzero_ps();
    let mut result: __m256;
    let mut vec_r0: __m128;
    let mut vec_g0: __m128;
    let mut vec_b0: __m128;
    let mut vec_r1: __m128;
    let mut vec_g1: __m128;
    let mut vec_b1: __m128;
    /* alpha bytes of the pixel pair currently in flight */
    let mut alpha1: u8 = 0;
    let mut alpha2: u8 = 0;
    /* CYA */
    if length == 0 {
        return;
    }
    /* If there are at least 2 pixels, then we can load their components into
    a single 256-bit register for processing. Each channel's gamma-corrected
    float is broadcast across a 128-bit lane; the two pixels' lanes are then
    joined into one 256-bit register per channel. */
    if length > 1 {
        vec_r0 = _mm_broadcast_ss(&*igtbl_r.offset(*src.add(F::kRIndex) as isize));
        vec_g0 = _mm_broadcast_ss(&*igtbl_g.offset(*src.add(F::kGIndex) as isize));
        vec_b0 = _mm_broadcast_ss(&*igtbl_b.offset(*src.add(F::kBIndex) as isize));
        vec_r1 =
            _mm_broadcast_ss(&*igtbl_r.offset(*src.add(F::kRIndex + components as usize) as isize));
        vec_g1 =
            _mm_broadcast_ss(&*igtbl_g.offset(*src.add(F::kGIndex + components as usize) as isize));
        vec_b1 =
            _mm_broadcast_ss(&*igtbl_b.offset(*src.add(F::kBIndex + components as usize) as isize));
        vec_r = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_r0), vec_r1, 1);
        vec_g = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_g0), vec_g1, 1);
        vec_b = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_b0), vec_b1, 1);
        if F::kAIndex != 0xff {
            alpha1 = *src.add(F::kAIndex);
            alpha2 = *src.add(F::kAIndex + components as usize)
        }
    }
    /* If there are at least 4 pixels, then we can iterate and preload the
    next 2 while we store the result of the current 2. The loop is software-
    pipelined: registers already hold gamma values for the current pair, so
    each iteration stores the previous result while loading the next inputs
    to hide table-lookup latency. */
    while length > 3 {
        /* Ensure we are pointing at the next 2 pixels for the next load. */
        src = src.offset((2 * components) as isize);
        /* gamma * matrix */
        vec_r = _mm256_mul_ps(vec_r, mat0);
        vec_g = _mm256_mul_ps(vec_g, mat1);
        vec_b = _mm256_mul_ps(vec_b, mat2);
        /* store alpha for these pixels; load alpha for next two */
        if F::kAIndex != 0xff {
            *dest.add(F::kAIndex) = alpha1;
            *dest.add(F::kAIndex + components as usize) = alpha2;
            alpha1 = *src.add(F::kAIndex);
            alpha2 = *src.add(F::kAIndex + components as usize)
        }
        /* crunch, crunch, crunch */
        vec_r = _mm256_add_ps(vec_r, _mm256_add_ps(vec_g, vec_b));
        vec_r = _mm256_max_ps(min, vec_r);
        vec_r = _mm256_min_ps(max, vec_r);
        result = _mm256_mul_ps(vec_r, scale);
        /* store calc'd output tables indices */
        _mm256_store_si256(output as *mut __m256i, _mm256_cvtps_epi32(result));
        /* load gamma values for next loop while store completes */
        vec_r0 = _mm_broadcast_ss(&*igtbl_r.offset(*src.add(F::kRIndex) as isize));
        vec_g0 = _mm_broadcast_ss(&*igtbl_g.offset(*src.add(F::kGIndex) as isize));
        vec_b0 = _mm_broadcast_ss(&*igtbl_b.offset(*src.add(F::kBIndex) as isize));
        vec_r1 =
            _mm_broadcast_ss(&*igtbl_r.offset(*src.add(F::kRIndex + components as usize) as isize));
        vec_g1 =
            _mm_broadcast_ss(&*igtbl_g.offset(*src.add(F::kGIndex + components as usize) as isize));
        vec_b1 =
            _mm_broadcast_ss(&*igtbl_b.offset(*src.add(F::kBIndex + components as usize) as isize));
        vec_r = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_r0), vec_r1, 1);
        vec_g = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_g0), vec_g1, 1);
        vec_b = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_b0), vec_b1, 1);
        /* use calc'd indices to output RGB values: pixel 0's indices are in
         * output[0..2] (lane 0), pixel 1's in output[4..6] (lane 1); slots
         * 3 and 7 come from the unused 4th matrix-row element. */
        *dest.add(F::kRIndex) = *otdata_r.offset(*output.offset(0isize) as isize);
        *dest.add(F::kGIndex) = *otdata_g.offset(*output.offset(1isize) as isize);
        *dest.add(F::kBIndex) = *otdata_b.offset(*output.offset(2isize) as isize);
        *dest.add(F::kRIndex + components as usize) =
            *otdata_r.offset(*output.offset(4isize) as isize);
        *dest.add(F::kGIndex + components as usize) =
            *otdata_g.offset(*output.offset(5isize) as isize);
        *dest.add(F::kBIndex + components as usize) =
            *otdata_b.offset(*output.offset(6isize) as isize);
        dest = dest.offset((2 * components) as isize);
        length -= 2
    }
    /* There are 0-3 pixels remaining. If there are 2-3 remaining, then we know
    we have already populated the necessary registers to start the transform
    (either from the pre-loop load or from the last loop iteration's preload). */
    if length > 1 {
        vec_r = _mm256_mul_ps(vec_r, mat0);
        vec_g = _mm256_mul_ps(vec_g, mat1);
        vec_b = _mm256_mul_ps(vec_b, mat2);
        if F::kAIndex != 0xff {
            *dest.add(F::kAIndex) = alpha1;
            *dest.add(F::kAIndex + components as usize) = alpha2
        }
        vec_r = _mm256_add_ps(vec_r, _mm256_add_ps(vec_g, vec_b));
        vec_r = _mm256_max_ps(min, vec_r);
        vec_r = _mm256_min_ps(max, vec_r);
        result = _mm256_mul_ps(vec_r, scale);
        _mm256_store_si256(output as *mut __m256i, _mm256_cvtps_epi32(result));
        *dest.add(F::kRIndex) = *otdata_r.offset(*output.offset(0isize) as isize);
        *dest.add(F::kGIndex) = *otdata_g.offset(*output.offset(1isize) as isize);
        *dest.add(F::kBIndex) = *otdata_b.offset(*output.offset(2isize) as isize);
        *dest.add(F::kRIndex + components as usize) =
            *otdata_r.offset(*output.offset(4isize) as isize);
        *dest.add(F::kGIndex + components as usize) =
            *otdata_g.offset(*output.offset(5isize) as isize);
        *dest.add(F::kBIndex + components as usize) =
            *otdata_b.offset(*output.offset(6isize) as isize);
        src = src.offset((2 * components) as isize);
        dest = dest.offset((2 * components) as isize);
        length -= 2
    }
    /* There may be 0-1 pixels remaining: process a single pixel with the
    128-bit (SSE-width) variants of the same operations. */
    if length == 1 {
        vec_r0 = _mm_broadcast_ss(&*igtbl_r.offset(*src.add(F::kRIndex) as isize));
        vec_g0 = _mm_broadcast_ss(&*igtbl_g.offset(*src.add(F::kGIndex) as isize));
        vec_b0 = _mm_broadcast_ss(&*igtbl_b.offset(*src.add(F::kBIndex) as isize));
        vec_r0 = _mm_mul_ps(vec_r0, _mm256_castps256_ps128(mat0));
        vec_g0 = _mm_mul_ps(vec_g0, _mm256_castps256_ps128(mat1));
        vec_b0 = _mm_mul_ps(vec_b0, _mm256_castps256_ps128(mat2));
        if F::kAIndex != 0xff {
            *dest.add(F::kAIndex) = *src.add(F::kAIndex)
        }
        vec_r0 = _mm_add_ps(vec_r0, _mm_add_ps(vec_g0, vec_b0));
        vec_r0 = _mm_max_ps(_mm256_castps256_ps128(min), vec_r0);
        vec_r0 = _mm_min_ps(_mm256_castps256_ps128(max), vec_r0);
        vec_r0 = _mm_mul_ps(vec_r0, _mm256_castps256_ps128(scale));
        _mm_store_si128(output as *mut __m128i, _mm_cvtps_epi32(vec_r0));
        *dest.add(F::kRIndex) = *otdata_r.offset(*output.offset(0isize) as isize);
        *dest.add(F::kGIndex) = *otdata_g.offset(*output.offset(1isize) as isize);
        *dest.add(F::kBIndex) = *otdata_b.offset(*output.offset(2isize) as isize)
    };
}
/* Public AVX entry point for RGB pixels (3 bytes per pixel, no alpha).
 * Safety: caller must ensure AVX support and that `src`/`dest` point to
 * at least `length * 3` readable/writable bytes respectively; the
 * transform's gamma and output tables must be populated. */
#[no_mangle]
#[target_feature(enable = "avx")]
pub unsafe fn qcms_transform_data_rgb_out_lut_avx(
    transform: &qcms_transform,
    src: *const u8,
    dest: *mut u8,
    length: usize,
) {
    qcms_transform_data_template_lut_avx::<RGB>(transform, src, dest, length);
}
/* Public AVX entry point for RGBA pixels (4 bytes per pixel; alpha is
 * copied through unchanged). Safety: caller must ensure AVX support and
 * that `src`/`dest` point to at least `length * 4` readable/writable
 * bytes; the transform's gamma and output tables must be populated. */
#[no_mangle]
#[target_feature(enable = "avx")]
pub unsafe fn qcms_transform_data_rgba_out_lut_avx(
    transform: &qcms_transform,
    src: *const u8,
    dest: *mut u8,
    length: usize,
) {
    qcms_transform_data_template_lut_avx::<RGBA>(transform, src, dest, length);
}
/* Public AVX entry point for BGRA pixels (4 bytes per pixel; alpha is
 * copied through unchanged). Safety: caller must ensure AVX support and
 * that `src`/`dest` point to at least `length * 4` readable/writable
 * bytes; the transform's gamma and output tables must be populated. */
#[no_mangle]
#[target_feature(enable = "avx")]
pub unsafe fn qcms_transform_data_bgra_out_lut_avx(
    transform: &qcms_transform,
    src: *const u8,
    dest: *mut u8,
    length: usize,
) {
    qcms_transform_data_template_lut_avx::<BGRA>(transform, src, dest, length);
}