use crate::transform::{qcms_transform, Format, BGRA, CLAMPMAXVAL, FLOATSCALE, RGB, RGBA};
#[cfg(target_arch = "x86")]
pub use std::arch::x86::{
    __m128, __m128i, __m256, __m256i, _mm256_add_ps, _mm256_broadcast_ps, _mm256_castps128_ps256,
    _mm256_castps256_ps128, _mm256_cvtps_epi32, _mm256_insertf128_ps, _mm256_max_ps, _mm256_min_ps,
    _mm256_mul_ps, _mm256_set1_ps, _mm256_setzero_ps, _mm256_store_si256, _mm_add_ps,
    _mm_broadcast_ss, _mm_cvtps_epi32, _mm_max_ps, _mm_min_ps, _mm_mul_ps, _mm_store_si128,
};
#[cfg(target_arch = "x86_64")]
pub use std::arch::x86_64::{
    __m128, __m128i, __m256, __m256i, _mm256_add_ps, _mm256_broadcast_ps, _mm256_castps128_ps256,
    _mm256_castps256_ps128, _mm256_cvtps_epi32, _mm256_insertf128_ps, _mm256_max_ps, _mm256_min_ps,
    _mm256_mul_ps, _mm256_set1_ps, _mm256_setzero_ps, _mm256_store_si256, _mm_add_ps,
    _mm_broadcast_ss, _mm_cvtps_epi32, _mm_max_ps, _mm_min_ps, _mm_mul_ps, _mm_store_si128,
};

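// Scratch buffer for the converted pixel values; the 32-byte alignment is required
// by the aligned `_mm256_store_si256` store below.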
#[repr(align(32))]
struct Output([u32; 8]);

#[target_feature(enable = "avx")]
unsafe extern "C" fn qcms_transform_data_template_lut_avx<F: Format>(
    transform: &qcms_transform,
    mut src: *const u8,
    mut dest: *mut u8,
    mut length: usize,
) {
    let mat: *const [f32; 4] = (*transform).matrix.as_ptr();
    let mut input: Output = std::mem::zeroed();
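    // `output` aliases `input`: the integer conversion results are stored into the
    // same 32-byte buffer and then read back one `u32` at a time.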
    let output: *const u32 = &mut input as *mut Output as *mut u32;
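    // Per-channel input gamma tables (8-bit source sample -> linear f32) and output
    // tables (scaled linear value -> 8-bit device value).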
    let igtbl_r: *const f32 = (*transform).input_gamma_table_r.as_ref().unwrap().as_ptr();
    let igtbl_g: *const f32 = (*transform).input_gamma_table_g.as_ref().unwrap().as_ptr();
    let igtbl_b: *const f32 = (*transform).input_gamma_table_b.as_ref().unwrap().as_ptr();
    let otdata_r: *const u8 = (*transform)
        .output_table_r
        .as_deref()
        .unwrap()
        .data
        .as_ptr();
    let otdata_g: *const u8 = (*transform)
        .output_table_g
        .as_deref()
        .unwrap()
        .data
        .as_ptr();
    let otdata_b: *const u8 = (*transform)
        .output_table_b
        .as_deref()
        .unwrap()
        .data
        .as_ptr();
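    // Broadcast each row of the color matrix into both 128-bit lanes so two pixels
    // can be transformed per 256-bit operation, and set up the clamp/scale constants
    // used before the float -> integer conversion.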
    let mat0: __m256 = _mm256_broadcast_ps(&*((*mat.offset(0isize)).as_ptr() as *const __m128));
    let mat1: __m256 = _mm256_broadcast_ps(&*((*mat.offset(1isize)).as_ptr() as *const __m128));
    let mat2: __m256 = _mm256_broadcast_ps(&*((*mat.offset(2isize)).as_ptr() as *const __m128));
    let max: __m256 = _mm256_set1_ps(CLAMPMAXVAL);
    let min: __m256 = _mm256_setzero_ps();
    let scale: __m256 = _mm256_set1_ps(FLOATSCALE);
    let components: u32 = if F::kAIndex == 0xff { 3 } else { 4 } as u32;
    let mut vec_r: __m256 = _mm256_setzero_ps();
    let mut vec_g: __m256 = _mm256_setzero_ps();
    let mut vec_b: __m256 = _mm256_setzero_ps();
    let mut result: __m256;
    let mut vec_r0: __m128;
    let mut vec_g0: __m128;
    let mut vec_b0: __m128;
    let mut vec_r1: __m128;
    let mut vec_g1: __m128;
    let mut vec_b1: __m128;
    let mut alpha1: u8 = 0;
    let mut alpha2: u8 = 0;
    if length == 0 {
        return;
    }
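    // Prologue: linearize the first two pixels (and stash their alpha bytes) so the
    // pipelined main loop always has a pair of inputs ready.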
    if length > 1 {
        vec_r0 = _mm_broadcast_ss(&*igtbl_r.offset(*src.add(F::kRIndex) as isize));
        vec_g0 = _mm_broadcast_ss(&*igtbl_g.offset(*src.add(F::kGIndex) as isize));
        vec_b0 = _mm_broadcast_ss(&*igtbl_b.offset(*src.add(F::kBIndex) as isize));
        vec_r1 =
            _mm_broadcast_ss(&*igtbl_r.offset(*src.add(F::kRIndex + components as usize) as isize));
        vec_g1 =
            _mm_broadcast_ss(&*igtbl_g.offset(*src.add(F::kGIndex + components as usize) as isize));
        vec_b1 =
            _mm_broadcast_ss(&*igtbl_b.offset(*src.add(F::kBIndex + components as usize) as isize));
        vec_r = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_r0), vec_r1, 1);
        vec_g = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_g0), vec_g1, 1);
        vec_b = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_b0), vec_b1, 1);
        if F::kAIndex != 0xff {
            alpha1 = *src.add(F::kAIndex);
            alpha2 = *src.add(F::kAIndex + components as usize)
        }
    }
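    // Main loop: two pixels per iteration. The matrix multiply, clamp, scale and
    // float -> integer conversion for the current pair are interleaved with the gamma
    // lookups for the next pair, which requires at least two further pixels beyond
    // the current pair (hence `length > 3`). The converted values are then written
    // out through the per-channel output tables.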
    while length > 3 {
        src = src.offset((2 * components) as isize);
        vec_r = _mm256_mul_ps(vec_r, mat0);
        vec_g = _mm256_mul_ps(vec_g, mat1);
        vec_b = _mm256_mul_ps(vec_b, mat2);
        if F::kAIndex != 0xff {
            *dest.add(F::kAIndex) = alpha1;
            *dest.add(F::kAIndex + components as usize) = alpha2;
            alpha1 = *src.add(F::kAIndex);
            alpha2 = *src.add(F::kAIndex + components as usize)
        }
        vec_r = _mm256_add_ps(vec_r, _mm256_add_ps(vec_g, vec_b));
        vec_r = _mm256_max_ps(min, vec_r);
        vec_r = _mm256_min_ps(max, vec_r);
        result = _mm256_mul_ps(vec_r, scale);
        _mm256_store_si256(output as *mut __m256i, _mm256_cvtps_epi32(result));
        vec_r0 = _mm_broadcast_ss(&*igtbl_r.offset(*src.add(F::kRIndex) as isize));
        vec_g0 = _mm_broadcast_ss(&*igtbl_g.offset(*src.add(F::kGIndex) as isize));
        vec_b0 = _mm_broadcast_ss(&*igtbl_b.offset(*src.add(F::kBIndex) as isize));
        vec_r1 =
            _mm_broadcast_ss(&*igtbl_r.offset(*src.add(F::kRIndex + components as usize) as isize));
        vec_g1 =
            _mm_broadcast_ss(&*igtbl_g.offset(*src.add(F::kGIndex + components as usize) as isize));
        vec_b1 =
            _mm_broadcast_ss(&*igtbl_b.offset(*src.add(F::kBIndex + components as usize) as isize));
        vec_r = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_r0), vec_r1, 1);
        vec_g = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_g0), vec_g1, 1);
        vec_b = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_b0), vec_b1, 1);
        *dest.add(F::kRIndex) = *otdata_r.offset(*output.offset(0isize) as isize);
        *dest.add(F::kGIndex) = *otdata_g.offset(*output.offset(1isize) as isize);
        *dest.add(F::kBIndex) = *otdata_b.offset(*output.offset(2isize) as isize);
        *dest.add(F::kRIndex + components as usize) =
            *otdata_r.offset(*output.offset(4isize) as isize);
        *dest.add(F::kGIndex + components as usize) =
            *otdata_g.offset(*output.offset(5isize) as isize);
        *dest.add(F::kBIndex + components as usize) =
            *otdata_b.offset(*output.offset(6isize) as isize);
        dest = dest.offset((2 * components) as isize);
        length -= 2
    }
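    // Epilogue: the last pair that was loaded (by the prologue or the final loop
    // iteration) still has to be transformed and written out.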
    if length > 1 {
        vec_r = _mm256_mul_ps(vec_r, mat0);
        vec_g = _mm256_mul_ps(vec_g, mat1);
        vec_b = _mm256_mul_ps(vec_b, mat2);
        if F::kAIndex != 0xff {
            *dest.add(F::kAIndex) = alpha1;
            *dest.add(F::kAIndex + components as usize) = alpha2
        }
        vec_r = _mm256_add_ps(vec_r, _mm256_add_ps(vec_g, vec_b));
        vec_r = _mm256_max_ps(min, vec_r);
        vec_r = _mm256_min_ps(max, vec_r);
        result = _mm256_mul_ps(vec_r, scale);
        _mm256_store_si256(output as *mut __m256i, _mm256_cvtps_epi32(result));
        *dest.add(F::kRIndex) = *otdata_r.offset(*output.offset(0isize) as isize);
        *dest.add(F::kGIndex) = *otdata_g.offset(*output.offset(1isize) as isize);
        *dest.add(F::kBIndex) = *otdata_b.offset(*output.offset(2isize) as isize);
        *dest.add(F::kRIndex + components as usize) =
            *otdata_r.offset(*output.offset(4isize) as isize);
        *dest.add(F::kGIndex + components as usize) =
            *otdata_g.offset(*output.offset(5isize) as isize);
        *dest.add(F::kBIndex + components as usize) =
            *otdata_b.offset(*output.offset(6isize) as isize);
        src = src.offset((2 * components) as isize);
        dest = dest.offset((2 * components) as isize);
        length -= 2
    }
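    // A single trailing pixel is handled with 128-bit operations on the low lane only.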
    if length == 1 {
        vec_r0 = _mm_broadcast_ss(&*igtbl_r.offset(*src.add(F::kRIndex) as isize));
        vec_g0 = _mm_broadcast_ss(&*igtbl_g.offset(*src.add(F::kGIndex) as isize));
        vec_b0 = _mm_broadcast_ss(&*igtbl_b.offset(*src.add(F::kBIndex) as isize));
        vec_r0 = _mm_mul_ps(vec_r0, _mm256_castps256_ps128(mat0));
        vec_g0 = _mm_mul_ps(vec_g0, _mm256_castps256_ps128(mat1));
        vec_b0 = _mm_mul_ps(vec_b0, _mm256_castps256_ps128(mat2));
        if F::kAIndex != 0xff {
            *dest.add(F::kAIndex) = *src.add(F::kAIndex)
        }
        vec_r0 = _mm_add_ps(vec_r0, _mm_add_ps(vec_g0, vec_b0));
        vec_r0 = _mm_max_ps(_mm256_castps256_ps128(min), vec_r0);
        vec_r0 = _mm_min_ps(_mm256_castps256_ps128(max), vec_r0);
        vec_r0 = _mm_mul_ps(vec_r0, _mm256_castps256_ps128(scale));
        _mm_store_si128(output as *mut __m128i, _mm_cvtps_epi32(vec_r0));
        *dest.add(F::kRIndex) = *otdata_r.offset(*output.offset(0isize) as isize);
        *dest.add(F::kGIndex) = *otdata_g.offset(*output.offset(1isize) as isize);
        *dest.add(F::kBIndex) = *otdata_b.offset(*output.offset(2isize) as isize)
    };
}
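// Monomorphized entry points for the supported pixel layouts.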
#[no_mangle]
#[target_feature(enable = "avx")]
pub unsafe fn qcms_transform_data_rgb_out_lut_avx(
    transform: &qcms_transform,
    src: *const u8,
    dest: *mut u8,
    length: usize,
) {
    qcms_transform_data_template_lut_avx::<RGB>(transform, src, dest, length);
}
#[no_mangle]
#[target_feature(enable = "avx")]
pub unsafe fn qcms_transform_data_rgba_out_lut_avx(
    transform: &qcms_transform,
    src: *const u8,
    dest: *mut u8,
    length: usize,
) {
    qcms_transform_data_template_lut_avx::<RGBA>(transform, src, dest, length);
}
#[no_mangle]
#[target_feature(enable = "avx")]
pub unsafe fn qcms_transform_data_bgra_out_lut_avx(
    transform: &qcms_transform,
    src: *const u8,
    dest: *mut u8,
    length: usize,
) {
    qcms_transform_data_template_lut_avx::<BGRA>(transform, src, dest, length);
}
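// Illustrative usage sketch (not part of qcms itself; the surrounding transform setup
// is assumed to gate on AVX support before selecting one of these entry points):
//
//     if is_x86_feature_detected!("avx") {
//         // Safety: AVX is available and `src`/`dest` cover `length` pixels in the
//         // layout expected by the chosen entry point.
//         unsafe { qcms_transform_data_rgba_out_lut_avx(&transform, src, dest, length) };
//     }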