#![cfg(target_endian = "little")]
#![allow(dead_code)]
#![allow(unused_macros)]

use super::*;

/// Unpacks a little-endian `0xAABBGGRR` pixel into `[r, g, b, a]` floats,
/// decoding the color channels to linear light with a cheap gamma-2
/// approximation of sRGB (`square` of the normalized value). Alpha stays
/// linear.
#[inline(always)]
pub fn u32_to_linear(pixel: u32) -> [f32; 4] {
    let r = square(((pixel & 0xFF) as f32) / 255.0);
    let g = square((((pixel >> 8) & 0xFF) as f32) / 255.0);
    let b = square((((pixel >> 16) & 0xFF) as f32) / 255.0);
    let a = (((pixel >> 24) & 0xFF) as f32) / 255.0;
    [r, g, b, a]
}

/// Packs linear `[r, g, b, a]` floats back into a little-endian
/// `0xAABBGGRR` pixel, re-encoding the color channels with `sqrt` (the
/// inverse of the gamma-2 decode) and rounding to the nearest 8-bit value.
#[inline(always)]
pub fn linear_to_u32(linear: [f32; 4]) -> u32 {
    let r = (sqrt(linear[0]) * 255.0 + 0.5) as u32;
    let g = (sqrt(linear[1]) * 255.0 + 0.5) as u32;
    let b = (sqrt(linear[2]) * 255.0 + 0.5) as u32;
    let a = (linear[3] * 255.0 + 0.5) as u32;
    rgba32!(r, g, b, a)
}

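// A minimal round-trip sketch (hedged: `rgba32!`, `square`, and `sqrt` come
// from the parent module, and `sqrt` is assumed accurate to f32 precision).
// Under that assumption the conversion pair is lossless for 8-bit channels,
// since the square/sqrt error is far below half a color step:
//
//     let pixel: u32 = 0xFF00_80FF; // a = 0xFF, b = 0x00, g = 0x80, r = 0xFF
//     assert_eq!(linear_to_u32(u32_to_linear(pixel)), pixel);
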
pub trait WritableImageU32Ext: WritableImage<u32> {
    /// Alpha-blends `src` onto `self` at `offset`, doing the math in linear
    /// light. Dispatches to the widest SIMD path available: AVX2, then SSE2,
    /// then the scalar fallback. With `std` the choice is made at runtime via
    /// CPU feature detection; without `std` it is fixed at compile time by
    /// `target_feature`.
    fn blit_blend_rectilinear<RI>(&mut self, src: &RI, offset: (isize, isize))
    where
        RI: ReadableImage<u32>,
    {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            #[cfg(feature = "std")]
            {
                if is_x86_feature_detected!("avx2") {
                    unsafe { blit_blend_rectilinear_avx2_explicit(self, src, offset) };
                } else if is_x86_feature_detected!("sse2") {
                    unsafe { blit_blend_rectilinear_sse2_explicit(self, src, offset) };
                } else {
                    unsafe { blit_blend_rectilinear_fully_unrolled_no_intrinsics(self, src, offset) };
                }
            }
            #[cfg(all(not(feature = "std"), target_feature = "avx2"))]
            {
                unsafe { blit_blend_rectilinear_avx2_explicit(self, src, offset) };
            }
            #[cfg(all(not(feature = "std"), not(target_feature = "avx2"), target_feature = "sse2"))]
            {
                unsafe { blit_blend_rectilinear_sse2_explicit(self, src, offset) };
            }
            #[cfg(all(not(feature = "std"), not(target_feature = "avx2"), not(target_feature = "sse2")))]
            {
                unsafe { blit_blend_rectilinear_fully_unrolled_no_intrinsics(self, src, offset) };
            }
        }
        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
        {
            unsafe { blit_blend_rectilinear_fully_unrolled_no_intrinsics(self, src, offset) };
        }
    }
}

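// Hypothetical call-site sketch (`canvas` and `sprite` are illustrative
// names, and the blanket impl that makes the method available is assumed to
// live elsewhere in the crate):
//
//     canvas.blit_blend_rectilinear(&sprite, (dest_x, dest_y));
//
// `determine_overlay!` clips the blit to the intersection of the two images,
// so offsets may be negative or partially off-screen.
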
/// Scalar fallback: the same blend as the SIMD paths, written as
/// `SSE_LANE_WIDTH`-wide unrolled loops over plain arrays so the optimizer
/// can auto-vectorize where possible.
unsafe fn blit_blend_rectilinear_fully_unrolled_no_intrinsics<WI, RI>(dest: &mut WI, src: &RI, offset: (isize, isize))
where
    WI: WritableImage<u32> + ?Sized,
    RI: ReadableImage<u32>,
{
    let (clip_width, clip_height, mut src_row_start_ptr, mut dest_row_start_ptr): (usize, usize, *const u32, *mut u32) =
        determine_overlay!(dest, src, offset);
    if clip_width > 0 && clip_height > 0 {
        let src_pitch = src.pitch();
        let dest_pitch = dest.pitch();
        let mut y = 0;
        while y < clip_height {
            let mut x = 0;
            let mut src_row_mid_ptr = src_row_start_ptr;
            let mut dest_row_mid_ptr = dest_row_start_ptr;
            while x < clip_width {
                let mut src_r = [0.0f32; SSE_LANE_WIDTH];
                let mut src_g = [0.0f32; SSE_LANE_WIDTH];
                let mut src_b = [0.0f32; SSE_LANE_WIDTH];
                let mut src_a = [0.0f32; SSE_LANE_WIDTH];
                let mut dest_r = [0.0f32; SSE_LANE_WIDTH];
                let mut dest_g = [0.0f32; SSE_LANE_WIDTH];
                let mut dest_b = [0.0f32; SSE_LANE_WIDTH];
                let mut dest_a = [0.0f32; SSE_LANE_WIDTH];
                let mut out_r = [0.0f32; SSE_LANE_WIDTH];
                let mut out_g = [0.0f32; SSE_LANE_WIDTH];
                let mut out_b = [0.0f32; SSE_LANE_WIDTH];
                let mut out_a = [0.0f32; SSE_LANE_WIDTH];
                // Unpack up to SSE_LANE_WIDTH pixels into linear channels.
                for lane in 0..SSE_LANE_WIDTH {
                    const INV255: f32 = 1.0 / 255.0;
                    let lane_i = lane as isize;
                    if x + lane < clip_width {
                        let src_pixel = src_row_mid_ptr.offset(lane_i);
                        src_r[lane] = square((*src_pixel & 0xFF) as f32 * INV255);
                        src_g[lane] = square(((*src_pixel >> 8) & 0xFF) as f32 * INV255);
                        src_b[lane] = square(((*src_pixel >> 16) & 0xFF) as f32 * INV255);
                        src_a[lane] = ((*src_pixel >> 24) & 0xFF) as f32 * INV255;
                        let dest_pixel = dest_row_mid_ptr.offset(lane_i);
                        dest_r[lane] = square((*dest_pixel & 0xFF) as f32 * INV255);
                        dest_g[lane] = square(((*dest_pixel >> 8) & 0xFF) as f32 * INV255);
                        dest_b[lane] = square(((*dest_pixel >> 16) & 0xFF) as f32 * INV255);
                        dest_a[lane] = ((*dest_pixel >> 24) & 0xFF) as f32 * INV255;
                    } else {
                        break;
                    }
                }
                // Lerp dest toward src by the source alpha.
                for lane in 0..SSE_LANE_WIDTH {
                    let toward = src_a[lane];
                    let one_minus_toward = 1.0 - toward;
                    out_r[lane] = one_minus_toward * dest_r[lane] + toward * src_r[lane];
                    out_g[lane] = one_minus_toward * dest_g[lane] + toward * src_g[lane];
                    out_b[lane] = one_minus_toward * dest_b[lane] + toward * src_b[lane];
                    out_a[lane] = one_minus_toward * dest_a[lane] + toward * src_a[lane];
                }
                // Re-encode and pack the surviving lanes back into u32s.
                for lane in 0..SSE_LANE_WIDTH {
                    let lane_i = lane as isize;
                    if x + lane < clip_width {
                        let out32 = (((out_a[lane] * 255.0 + 0.5) as u32) << 24)
                            | (((sqrt(out_b[lane]) * 255.0 + 0.5) as u32) << 16)
                            | (((sqrt(out_g[lane]) * 255.0 + 0.5) as u32) << 8)
                            | ((sqrt(out_r[lane]) * 255.0 + 0.5) as u32);
                        *dest_row_mid_ptr.offset(lane_i) = out32;
                    } else {
                        break;
                    }
                }
                x += SSE_LANE_WIDTH;
                src_row_mid_ptr = src_row_mid_ptr.offset(SSE_LANE_WIDTH_I);
                dest_row_mid_ptr = dest_row_mid_ptr.offset(SSE_LANE_WIDTH_I);
            }
            y += 1;
            src_row_start_ptr = src_row_start_ptr.offset(src_pitch);
            dest_row_start_ptr = dest_row_start_ptr.offset(dest_pitch);
        }
    }
}

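// Illustrative single-pixel reference (an addition, not original API): the
// same lerp-in-linear-space blend that all three paths above implement,
// written with the scalar helpers from the top of the file. Handy as a
// mental model or as an oracle in tests (see the sketch at the end of this
// file).
fn blend_pixel_scalar(src: u32, dest: u32) -> u32 {
    let s = u32_to_linear(src);
    let d = u32_to_linear(dest);
    // Source alpha decides how far to move dest toward src.
    let toward = s[3];
    let one_minus_toward = 1.0 - toward;
    linear_to_u32([
        one_minus_toward * d[0] + toward * s[0],
        one_minus_toward * d[1] + toward * s[1],
        one_minus_toward * d[2] + toward * s[2],
        one_minus_toward * d[3] + toward * s[3],
    ])
}
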
#[allow(unused_macros)]
macro_rules! m128_f32 {
    ($a:ident, $i:expr) => {
        *(((&mut $a) as *mut __m128) as *mut f32).offset($i as isize)
    };
}

#[allow(unused_macros)]
macro_rules! m128i_i32 {
    ($a:ident, $i:expr) => {
        *(((&mut $a) as *mut __m128i) as *mut i32).offset($i as isize)
    };
}

#[allow(unused_macros)]
macro_rules! mm_square {
    ($reg:expr) => {
        _mm_mul_ps($reg, $reg)
    };
}

#[allow(unused_macros)]
macro_rules! print_128 {
    ($reg:ident, i) => {{
        let arr = (&$reg as *const __m128i as *const [u32; 4]).as_ref().unwrap();
        println!("{}: i[{:08X},{:08X},{:08X},{:08X}]", stringify!($reg), arr[0], arr[1], arr[2], arr[3]);
    }};
    ($reg:ident, f) => {{
        let arr = (&$reg as *const __m128 as *const [f32; 4]).as_ref().unwrap();
        println!("{}: f[{:7.1},{:7.1},{:7.1},{:7.1}]", stringify!($reg), arr[0], arr[1], arr[2], arr[3]);
    }};
}

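// Hypothetical debug usage of the printer above (std-only, illustrative):
//
//     let v = _mm_set1_epi32(0x0011_2233);
//     print_128!(v, i); // prints: v: i[00112233,00112233,00112233,00112233]
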
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
macro_rules! sse2_do_pixel_work {
    ($src_pixel_x4:ident, $dest_pixel_x4:ident) => {{
        let twofivefive_4x = _mm_set1_ps(255.0);
        let inverse255_4x = _mm_set1_ps(1.0 / 255.0);
        let one_4x = _mm_set1_ps(1.0);
        let ff_4x = _mm_set1_epi32(0xFF);

        let src_r = _mm_cvtepi32_ps(_mm_and_si128($src_pixel_x4, ff_4x));
        let src_g = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32($src_pixel_x4, 8), ff_4x));
        let src_b = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32($src_pixel_x4, 16), ff_4x));
        let src_a = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32($src_pixel_x4, 24), ff_4x));

        let dest_r = _mm_cvtepi32_ps(_mm_and_si128($dest_pixel_x4, ff_4x));
        let dest_g = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32($dest_pixel_x4, 8), ff_4x));
        let dest_b = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32($dest_pixel_x4, 16), ff_4x));
        let dest_a = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32($dest_pixel_x4, 24), ff_4x));

        let src_linear_r = mm_square!(_mm_mul_ps(src_r, inverse255_4x));
        let src_linear_g = mm_square!(_mm_mul_ps(src_g, inverse255_4x));
        let src_linear_b = mm_square!(_mm_mul_ps(src_b, inverse255_4x));
        let src_linear_a = _mm_mul_ps(src_a, inverse255_4x);

        let dest_linear_r = mm_square!(_mm_mul_ps(dest_r, inverse255_4x));
        let dest_linear_g = mm_square!(_mm_mul_ps(dest_g, inverse255_4x));
        let dest_linear_b = mm_square!(_mm_mul_ps(dest_b, inverse255_4x));
        let dest_linear_a = _mm_mul_ps(dest_a, inverse255_4x);

        let toward = src_linear_a;
        let one_minus_toward = _mm_sub_ps(one_4x, toward);

        let out_r = _mm_add_ps(_mm_mul_ps(one_minus_toward, dest_linear_r), _mm_mul_ps(toward, src_linear_r));
        let out_g = _mm_add_ps(_mm_mul_ps(one_minus_toward, dest_linear_g), _mm_mul_ps(toward, src_linear_g));
        let out_b = _mm_add_ps(_mm_mul_ps(one_minus_toward, dest_linear_b), _mm_mul_ps(toward, src_linear_b));
        let out_a = _mm_add_ps(_mm_mul_ps(one_minus_toward, dest_linear_a), _mm_mul_ps(toward, src_linear_a));

        let out_r_i32 = _mm_cvtps_epi32(_mm_mul_ps(_mm_sqrt_ps(out_r), twofivefive_4x));
        let out_g_i32 = _mm_cvtps_epi32(_mm_mul_ps(_mm_sqrt_ps(out_g), twofivefive_4x));
        let out_b_i32 = _mm_cvtps_epi32(_mm_mul_ps(_mm_sqrt_ps(out_b), twofivefive_4x));
        let out_a_i32 = _mm_cvtps_epi32(_mm_mul_ps(out_a, twofivefive_4x));

        let out_xxgr_i32 = _mm_or_si128(_mm_slli_epi32(out_g_i32, 8), out_r_i32);
        let out_abxx_i32 = _mm_or_si128(_mm_slli_epi32(out_a_i32, 24), _mm_slli_epi32(out_b_i32, 16));
        _mm_or_si128(out_abxx_i32, out_xxgr_i32)
    }};
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
macro_rules! sse2_finish_off_row_aligned {
    ($x:ident, $clip_width:ident, $src_row_mid_ptr:expr, $dest_row_mid_ptr:expr) => {{
        debug_assert_eq!(
            check_misalign16!($src_row_mid_ptr),
            0,
            "sse2_finish_off_row_aligned, the src_row_mid_ptr isn't aligned: {} / {}",
            check_misalign16!($src_row_mid_ptr),
            $src_row_mid_ptr as usize
        );
        debug_assert_eq!(
            check_misalign16!($dest_row_mid_ptr),
            0,
            "sse2_finish_off_row_aligned, the dest_row_mid_ptr isn't aligned: {} / {}",
            check_misalign16!($dest_row_mid_ptr),
            $dest_row_mid_ptr as usize
        );
        if check_misalign4!($clip_width as isize - $x as isize) == 0 {
            while $x < $clip_width {
                let src_pixel_x4 = _mm_load_si128($src_row_mid_ptr as *const __m128i);
                let dest_pixel_x4 = _mm_load_si128($dest_row_mid_ptr as *const __m128i);
                let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
                _mm_store_si128($dest_row_mid_ptr as *mut __m128i, out_packed_x4);
                $x += SSE_LANE_WIDTH;
                $src_row_mid_ptr = $src_row_mid_ptr.offset(SSE_LANE_WIDTH_I);
                $dest_row_mid_ptr = $dest_row_mid_ptr.offset(SSE_LANE_WIDTH_I);
            }
        } else {
            while $x < $clip_width {
                match $clip_width as isize - $x as isize {
                    1 => {
                        let src_pixel_x4 = _mm_setr_epi32(*$src_row_mid_ptr, 0, 0, 0);
                        let dest_pixel_x4 = _mm_setr_epi32(*$dest_row_mid_ptr, 0, 0, 0);
                        let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
                        *$dest_row_mid_ptr = *(&out_packed_x4 as *const __m128i as *const i32);
                    }
                    2 => {
                        let src_pixel_x4 = _mm_setr_epi32(*$src_row_mid_ptr, *$src_row_mid_ptr.offset(1), 0, 0);
                        let dest_pixel_x4 = _mm_setr_epi32(*$dest_row_mid_ptr, *$dest_row_mid_ptr.offset(1), 0, 0);
                        let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
                        *$dest_row_mid_ptr = *(&out_packed_x4 as *const __m128i as *const i32);
                        *$dest_row_mid_ptr.offset(1) = *(&out_packed_x4 as *const __m128i as *const i32).offset(1);
                    }
                    3 => {
                        let src_pixel_x4 = _mm_setr_epi32(*$src_row_mid_ptr, *$src_row_mid_ptr.offset(1), *$src_row_mid_ptr.offset(2), 0);
                        let dest_pixel_x4 = _mm_setr_epi32(*$dest_row_mid_ptr, *$dest_row_mid_ptr.offset(1), *$dest_row_mid_ptr.offset(2), 0);
                        let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
                        *$dest_row_mid_ptr = *(&out_packed_x4 as *const __m128i as *const i32);
                        *$dest_row_mid_ptr.offset(1) = *(&out_packed_x4 as *const __m128i as *const i32).offset(1);
                        *$dest_row_mid_ptr.offset(2) = *(&out_packed_x4 as *const __m128i as *const i32).offset(2);
                    }
                    other => {
                        debug_assert!(other >= 4);
                        let src_pixel_x4 = _mm_load_si128($src_row_mid_ptr as *const __m128i);
                        let dest_pixel_x4 = _mm_load_si128($dest_row_mid_ptr as *const __m128i);
                        let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
                        _mm_store_si128($dest_row_mid_ptr as *mut __m128i, out_packed_x4);
                    }
                }
                $x += SSE_LANE_WIDTH;
                $src_row_mid_ptr = $src_row_mid_ptr.offset(SSE_LANE_WIDTH_I);
                $dest_row_mid_ptr = $dest_row_mid_ptr.offset(SSE_LANE_WIDTH_I);
            }
        }
    }};
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
macro_rules! sse2_finish_off_row_un_aligned {
    ($x:ident, $clip_width:ident, $src_row_mid_ptr:expr, $dest_row_mid_ptr:expr) => {{
        if check_misalign4!($clip_width as isize - $x as isize) == 0 {
            while $x < $clip_width {
                let src_pixel_x4 = _mm_loadu_si128($src_row_mid_ptr as *const __m128i);
                let dest_pixel_x4 = _mm_loadu_si128($dest_row_mid_ptr as *const __m128i);
                let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
                _mm_storeu_si128($dest_row_mid_ptr as *mut __m128i, out_packed_x4);
                $x += SSE_LANE_WIDTH;
                $src_row_mid_ptr = $src_row_mid_ptr.offset(SSE_LANE_WIDTH_I);
                $dest_row_mid_ptr = $dest_row_mid_ptr.offset(SSE_LANE_WIDTH_I);
            }
        } else {
            while $x < $clip_width {
                match $clip_width as isize - $x as isize {
                    1 => {
                        let src_pixel_x4 = _mm_setr_epi32(*$src_row_mid_ptr, 0, 0, 0);
                        let dest_pixel_x4 = _mm_setr_epi32(*$dest_row_mid_ptr, 0, 0, 0);
                        let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
                        *$dest_row_mid_ptr = *(&out_packed_x4 as *const __m128i as *const i32);
                    }
                    2 => {
                        let src_pixel_x4 = _mm_setr_epi32(*$src_row_mid_ptr, *$src_row_mid_ptr.offset(1), 0, 0);
                        let dest_pixel_x4 = _mm_setr_epi32(*$dest_row_mid_ptr, *$dest_row_mid_ptr.offset(1), 0, 0);
                        let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
                        *$dest_row_mid_ptr = *(&out_packed_x4 as *const __m128i as *const i32);
                        *$dest_row_mid_ptr.offset(1) = *(&out_packed_x4 as *const __m128i as *const i32).offset(1);
                    }
                    3 => {
                        let src_pixel_x4 = _mm_setr_epi32(*$src_row_mid_ptr, *$src_row_mid_ptr.offset(1), *$src_row_mid_ptr.offset(2), 0);
                        let dest_pixel_x4 = _mm_setr_epi32(*$dest_row_mid_ptr, *$dest_row_mid_ptr.offset(1), *$dest_row_mid_ptr.offset(2), 0);
                        let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
                        *$dest_row_mid_ptr = *(&out_packed_x4 as *const __m128i as *const i32);
                        *$dest_row_mid_ptr.offset(1) = *(&out_packed_x4 as *const __m128i as *const i32).offset(1);
                        *$dest_row_mid_ptr.offset(2) = *(&out_packed_x4 as *const __m128i as *const i32).offset(2);
                    }
                    other => {
                        debug_assert!(other >= 4);
                        let src_pixel_x4 = _mm_loadu_si128($src_row_mid_ptr as *const __m128i);
                        let dest_pixel_x4 = _mm_loadu_si128($dest_row_mid_ptr as *const __m128i);
                        let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
                        _mm_storeu_si128($dest_row_mid_ptr as *mut __m128i, out_packed_x4);
                    }
                }
                $x += SSE_LANE_WIDTH;
                $src_row_mid_ptr = $src_row_mid_ptr.offset(SSE_LANE_WIDTH_I);
                $dest_row_mid_ptr = $dest_row_mid_ptr.offset(SSE_LANE_WIDTH_I);
            }
        }
    }};
}

#[target_feature(enable = "sse2")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn blit_blend_rectilinear_sse2_explicit<WI, RI>(dest: &mut WI, src: &RI, offset: (isize, isize))
where
    WI: WritableImage<u32> + ?Sized,
    RI: ReadableImage<u32>,
{
    let (clip_width, clip_height, mut src_row_start_ptr, mut dest_row_start_ptr): (usize, usize, *const u32, *mut u32) =
        determine_overlay!(dest, src, offset);

    if clip_width > 0 && clip_height > 0 {
        let src_pitch = src.pitch();
        let dest_pitch = dest.pitch();
        let mut y = 0;
        while y < clip_height {
            let mut x = 0;
            let mut src_row_mid_ptr = src_row_start_ptr as *const i32;
            let mut dest_row_mid_ptr = dest_row_start_ptr as *mut i32;
            let src_misalign = check_misalign16!(src_row_mid_ptr);
            let dest_misalign = check_misalign16!(dest_row_mid_ptr);
            if src_misalign > 0 || dest_misalign > 0 {
                if src_misalign == dest_misalign {
                    // Both rows are off by the same byte count, so blending
                    // 1-3 leading pixels through a partial vector brings both
                    // pointers to 16-byte alignment together.
                    match src_misalign {
                        12 => {
                            let src_pixel_x4 = _mm_setr_epi32(*src_row_mid_ptr, 0, 0, 0);
                            let dest_pixel_x4 = _mm_setr_epi32(*dest_row_mid_ptr, 0, 0, 0);
                            let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
                            *dest_row_mid_ptr = *(&out_packed_x4 as *const __m128i as *const i32);
                            src_row_mid_ptr = src_row_mid_ptr.offset(1);
                            dest_row_mid_ptr = dest_row_mid_ptr.offset(1);
                            x += 1;
                        }
                        8 => {
                            let src_pixel_x4 = _mm_setr_epi32(*src_row_mid_ptr, *src_row_mid_ptr.offset(1), 0, 0);
                            let dest_pixel_x4 = _mm_setr_epi32(*dest_row_mid_ptr, *dest_row_mid_ptr.offset(1), 0, 0);
                            let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
                            *dest_row_mid_ptr = *(&out_packed_x4 as *const __m128i as *const i32);
                            *dest_row_mid_ptr.offset(1) = *(&out_packed_x4 as *const __m128i as *const i32).offset(1);
                            src_row_mid_ptr = src_row_mid_ptr.offset(2);
                            dest_row_mid_ptr = dest_row_mid_ptr.offset(2);
                            x += 2;
                        }
                        4 => {
                            let src_pixel_x4 = _mm_setr_epi32(*src_row_mid_ptr, *src_row_mid_ptr.offset(1), *src_row_mid_ptr.offset(2), 0);
                            let dest_pixel_x4 = _mm_setr_epi32(*dest_row_mid_ptr, *dest_row_mid_ptr.offset(1), *dest_row_mid_ptr.offset(2), 0);
                            let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
                            *dest_row_mid_ptr = *(&out_packed_x4 as *const __m128i as *const i32);
                            *dest_row_mid_ptr.offset(1) = *(&out_packed_x4 as *const __m128i as *const i32).offset(1);
                            *dest_row_mid_ptr.offset(2) = *(&out_packed_x4 as *const __m128i as *const i32).offset(2);
                            src_row_mid_ptr = src_row_mid_ptr.offset(3);
                            dest_row_mid_ptr = dest_row_mid_ptr.offset(3);
                            x += 3;
                        }
                        other => panic!("invalid src_misalign value: {}", other),
                    };
                    sse2_finish_off_row_aligned!(x, clip_width, src_row_mid_ptr, dest_row_mid_ptr);
                } else {
                    // Alignments differ; fall back to unaligned loads/stores.
                    sse2_finish_off_row_un_aligned!(x, clip_width, src_row_mid_ptr, dest_row_mid_ptr);
                }
            } else {
                sse2_finish_off_row_aligned!(x, clip_width, src_row_mid_ptr, dest_row_mid_ptr);
            }
            y += 1;
            src_row_start_ptr = src_row_start_ptr.offset(src_pitch);
            dest_row_start_ptr = dest_row_start_ptr.offset(dest_pitch);
        }
    }
}

#[allow(unused_macros)]
macro_rules! m256_f32 {
    ($a:ident, $i:expr) => {
        *(((&mut $a) as *mut __m256) as *mut f32).offset($i as isize)
    };
}

#[allow(unused_macros)]
macro_rules! m256i_i32 {
    ($a:ident, $i:expr) => {
        *(((&mut $a) as *mut __m256i) as *mut i32).offset($i as isize)
    };
}

macro_rules! mm256_square {
    ($reg:expr) => {
        _mm256_mul_ps($reg, $reg)
    };
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
macro_rules! avx2_do_pixel_work {
    ($src_pixel_x8:ident, $dest_pixel_x8:ident) => {{
        let twofivefive_8x = _mm256_set1_ps(255.0);
        let inverse255_8x = _mm256_set1_ps(1.0 / 255.0);
        let one_8x = _mm256_set1_ps(1.0);
        let ff_8x = _mm256_set1_epi32(0xFF);

        let src_r = _mm256_cvtepi32_ps(_mm256_and_si256($src_pixel_x8, ff_8x));
        let src_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32($src_pixel_x8, 8), ff_8x));
        let src_b = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32($src_pixel_x8, 16), ff_8x));
        let src_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32($src_pixel_x8, 24), ff_8x));

        let dest_r = _mm256_cvtepi32_ps(_mm256_and_si256($dest_pixel_x8, ff_8x));
        let dest_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32($dest_pixel_x8, 8), ff_8x));
        let dest_b = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32($dest_pixel_x8, 16), ff_8x));
        let dest_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32($dest_pixel_x8, 24), ff_8x));

        let src_linear_r = mm256_square!(_mm256_mul_ps(src_r, inverse255_8x));
        let src_linear_g = mm256_square!(_mm256_mul_ps(src_g, inverse255_8x));
        let src_linear_b = mm256_square!(_mm256_mul_ps(src_b, inverse255_8x));
        let src_linear_a = _mm256_mul_ps(src_a, inverse255_8x);

        let dest_linear_r = mm256_square!(_mm256_mul_ps(dest_r, inverse255_8x));
        let dest_linear_g = mm256_square!(_mm256_mul_ps(dest_g, inverse255_8x));
        let dest_linear_b = mm256_square!(_mm256_mul_ps(dest_b, inverse255_8x));
        let dest_linear_a = _mm256_mul_ps(dest_a, inverse255_8x);

        let toward = src_linear_a;
        let one_minus_toward = _mm256_sub_ps(one_8x, toward);

        let out_r = _mm256_add_ps(_mm256_mul_ps(one_minus_toward, dest_linear_r), _mm256_mul_ps(toward, src_linear_r));
        let out_g = _mm256_add_ps(_mm256_mul_ps(one_minus_toward, dest_linear_g), _mm256_mul_ps(toward, src_linear_g));
        let out_b = _mm256_add_ps(_mm256_mul_ps(one_minus_toward, dest_linear_b), _mm256_mul_ps(toward, src_linear_b));
        let out_a = _mm256_add_ps(_mm256_mul_ps(one_minus_toward, dest_linear_a), _mm256_mul_ps(toward, src_linear_a));

        let out_r_i32 = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_sqrt_ps(out_r), twofivefive_8x));
        let out_g_i32 = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_sqrt_ps(out_g), twofivefive_8x));
        let out_b_i32 = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_sqrt_ps(out_b), twofivefive_8x));
        let out_a_i32 = _mm256_cvtps_epi32(_mm256_mul_ps(out_a, twofivefive_8x));

        let out_xxgr_i32 = _mm256_or_si256(_mm256_slli_epi32(out_g_i32, 8), out_r_i32);
        let out_abxx_i32 = _mm256_or_si256(_mm256_slli_epi32(out_a_i32, 24), _mm256_slli_epi32(out_b_i32, 16));
        _mm256_or_si256(out_abxx_i32, out_xxgr_i32)
    }};
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
macro_rules! avx2_finish_off_row_aligned {
    ($x:ident, $clip_width:ident, $src_row_mid_ptr:expr, $dest_row_mid_ptr:expr) => {{
        debug_assert_eq!(
            check_misalign32!($src_row_mid_ptr),
            0,
            "avx2_finish_off_row_aligned, the src_row_mid_ptr isn't aligned: {} / {}",
            check_misalign32!($src_row_mid_ptr),
            $src_row_mid_ptr as usize
        );
        debug_assert_eq!(
            check_misalign32!($dest_row_mid_ptr),
            0,
            "avx2_finish_off_row_aligned, the dest_row_mid_ptr isn't aligned: {} / {}",
            check_misalign32!($dest_row_mid_ptr),
            $dest_row_mid_ptr as usize
        );
        if check_misalign8!($clip_width as isize - $x as isize) == 0 {
            while $x < $clip_width {
                let src_pixel_x8 = _mm256_load_si256($src_row_mid_ptr as *const __m256i);
                let dest_pixel_x8 = _mm256_load_si256($dest_row_mid_ptr as *const __m256i);
                let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
                _mm256_store_si256($dest_row_mid_ptr as *mut __m256i, out_packed_x8);
                $x += AVX_LANE_WIDTH;
                $src_row_mid_ptr = $src_row_mid_ptr.offset(AVX_LANE_WIDTH_I);
                $dest_row_mid_ptr = $dest_row_mid_ptr.offset(AVX_LANE_WIDTH_I);
            }
        } else {
            while $x < $clip_width {
                match $clip_width as isize - $x as isize {
                    1 => {
                        let read_write_mask = _mm256_setr_epi32(-1, 0, 0, 0, 0, 0, 0, 0);
                        let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
                        let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
                        let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
                        _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
                    }
                    2 => {
                        let read_write_mask = _mm256_setr_epi32(-1, -1, 0, 0, 0, 0, 0, 0);
                        let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
                        let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
                        let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
                        _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
                    }
                    3 => {
                        let read_write_mask = _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0);
                        let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
                        let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
                        let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
                        _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
                    }
                    4 => {
                        let read_write_mask = _mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0);
                        let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
                        let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
                        let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
                        _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
                    }
                    5 => {
                        let read_write_mask = _mm256_setr_epi32(-1, -1, -1, -1, -1, 0, 0, 0);
                        let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
                        let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
                        let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
                        _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
                    }
                    6 => {
                        let read_write_mask = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, 0, 0);
                        let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
                        let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
                        let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
                        _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
                    }
                    7 => {
                        let read_write_mask = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, -1, 0);
                        let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
                        let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
                        let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
                        _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
                    }
                    other => {
                        debug_assert!(other >= 8);
                        let src_pixel_x8 = _mm256_load_si256($src_row_mid_ptr as *const __m256i);
                        let dest_pixel_x8 = _mm256_load_si256($dest_row_mid_ptr as *const __m256i);
                        let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
                        _mm256_store_si256($dest_row_mid_ptr as *mut __m256i, out_packed_x8);
                    }
                }
                $x += AVX_LANE_WIDTH;
                $src_row_mid_ptr = $src_row_mid_ptr.offset(AVX_LANE_WIDTH_I);
                $dest_row_mid_ptr = $dest_row_mid_ptr.offset(AVX_LANE_WIDTH_I);
            }
        }
    }};
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
macro_rules! avx2_finish_off_row_un_aligned {
    ($x:ident, $clip_width:ident, $src_row_mid_ptr:expr, $dest_row_mid_ptr:expr) => {{
        if check_misalign8!($clip_width as isize - $x as isize) == 0 {
            while $x < $clip_width {
                let src_pixel_x8 = _mm256_loadu_si256($src_row_mid_ptr as *const __m256i);
                let dest_pixel_x8 = _mm256_loadu_si256($dest_row_mid_ptr as *const __m256i);
                let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
                _mm256_storeu_si256($dest_row_mid_ptr as *mut __m256i, out_packed_x8);
                $x += AVX_LANE_WIDTH;
                $src_row_mid_ptr = $src_row_mid_ptr.offset(AVX_LANE_WIDTH_I);
                $dest_row_mid_ptr = $dest_row_mid_ptr.offset(AVX_LANE_WIDTH_I);
            }
        } else {
            while $x < $clip_width {
                match $clip_width as isize - $x as isize {
                    1 => {
                        let read_write_mask = _mm256_setr_epi32(-1, 0, 0, 0, 0, 0, 0, 0);
                        let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
                        let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
                        let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
                        _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
                    }
                    2 => {
                        let read_write_mask = _mm256_setr_epi32(-1, -1, 0, 0, 0, 0, 0, 0);
                        let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
                        let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
                        let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
                        _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
                    }
                    3 => {
                        let read_write_mask = _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0);
                        let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
                        let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
                        let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
                        _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
                    }
                    4 => {
                        let read_write_mask = _mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0);
                        let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
                        let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
                        let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
                        _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
                    }
                    5 => {
                        let read_write_mask = _mm256_setr_epi32(-1, -1, -1, -1, -1, 0, 0, 0);
                        let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
                        let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
                        let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
                        _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
                    }
                    6 => {
                        let read_write_mask = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, 0, 0);
                        let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
                        let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
                        let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
                        _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
                    }
                    7 => {
                        let read_write_mask = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, -1, 0);
                        let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
                        let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
                        let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
                        _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
                    }
                    other => {
                        debug_assert!(other >= 8);
                        let src_pixel_x8 = _mm256_loadu_si256($src_row_mid_ptr as *const __m256i);
                        let dest_pixel_x8 = _mm256_loadu_si256($dest_row_mid_ptr as *const __m256i);
                        let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
                        _mm256_storeu_si256($dest_row_mid_ptr as *mut __m256i, out_packed_x8);
                    }
                }
                $x += AVX_LANE_WIDTH;
                $src_row_mid_ptr = $src_row_mid_ptr.offset(AVX_LANE_WIDTH_I);
                $dest_row_mid_ptr = $dest_row_mid_ptr.offset(AVX_LANE_WIDTH_I);
            }
        }
    }};
}

#[target_feature(enable = "avx2")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn blit_blend_rectilinear_avx2_explicit<WI, RI>(dest: &mut WI, src: &RI, offset: (isize, isize))
where
    WI: WritableImage<u32> + ?Sized,
    RI: ReadableImage<u32>,
{
    let (clip_width, clip_height, mut src_row_start_ptr, mut dest_row_start_ptr): (usize, usize, *const u32, *mut u32) =
        determine_overlay!(dest, src, offset);

    if clip_width > 0 && clip_height > 0 {
        let src_pitch = src.pitch();
        let dest_pitch = dest.pitch();
        let mut y = 0;
        while y < clip_height {
            let mut x = 0;
            let mut src_row_mid_ptr = src_row_start_ptr as *const i32;
            let mut dest_row_mid_ptr = dest_row_start_ptr as *mut i32;
            let src_misalign = check_misalign32!(src_row_mid_ptr);
            let dest_misalign = check_misalign32!(dest_row_mid_ptr);
            if src_misalign > 0 || dest_misalign > 0 {
                if src_misalign == dest_misalign {
                    // Both rows share the same misalignment, so one masked
                    // blend of the 1-7 leading pixels brings both pointers to
                    // 32-byte alignment together.
                    let (read_write_mask, pixel_jump) = match src_misalign {
                        28 => (_mm256_setr_epi32(-1, 0, 0, 0, 0, 0, 0, 0), 1),
                        24 => (_mm256_setr_epi32(-1, -1, 0, 0, 0, 0, 0, 0), 2),
                        20 => (_mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0), 3),
                        16 => (_mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0), 4),
                        12 => (_mm256_setr_epi32(-1, -1, -1, -1, -1, 0, 0, 0), 5),
                        8 => (_mm256_setr_epi32(-1, -1, -1, -1, -1, -1, 0, 0), 6),
                        4 => (_mm256_setr_epi32(-1, -1, -1, -1, -1, -1, -1, 0), 7),
                        other => panic!("invalid src_misalign value: {}", other),
                    };
                    let src_pixel_x8 = _mm256_maskload_epi32(src_row_mid_ptr, read_write_mask);
                    let dest_pixel_x8 = _mm256_maskload_epi32(dest_row_mid_ptr, read_write_mask);
                    let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
                    _mm256_maskstore_epi32(dest_row_mid_ptr, read_write_mask, out_packed_x8);
                    src_row_mid_ptr = src_row_mid_ptr.offset(pixel_jump);
                    dest_row_mid_ptr = dest_row_mid_ptr.offset(pixel_jump);
                    x += pixel_jump as usize;
                    avx2_finish_off_row_aligned!(x, clip_width, src_row_mid_ptr, dest_row_mid_ptr);
                } else {
                    // Alignments differ; use unaligned loads/stores for the
                    // whole row.
                    avx2_finish_off_row_un_aligned!(x, clip_width, src_row_mid_ptr, dest_row_mid_ptr);
                }
            } else {
                avx2_finish_off_row_aligned!(x, clip_width, src_row_mid_ptr, dest_row_mid_ptr);
            }
            y += 1;
            src_row_start_ptr = src_row_start_ptr.offset(src_pitch);
            dest_row_start_ptr = dest_row_start_ptr.offset(dest_pitch);
        }
    }
}
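
// A minimal test sketch for the blend math (an assumption-laden addition,
// not original to this file: it leans on the illustrative
// `blend_pixel_scalar` helper above, assumes tests run with `std`, and
// assumes the parent module's `sqrt` is accurate to f32 precision).
#[cfg(all(test, feature = "std"))]
mod blend_math_tests {
    use super::*;

    #[test]
    fn gamma2_conversion_round_trips() {
        // The square/sqrt error is far below half a color step, so the
        // u32 -> linear -> u32 trip is lossless for every channel value.
        for v in 0..=255u32 {
            let pixel = (v << 24) | (v << 16) | (v << 8) | v;
            assert_eq!(linear_to_u32(u32_to_linear(pixel)), pixel);
        }
    }

    #[test]
    fn opaque_src_wins() {
        // With src alpha = 255 the lerp factor is 1.0, so blending must
        // reproduce the source pixel exactly.
        let src: u32 = (0xFF << 24) | (30 << 16) | (200 << 8) | 10;
        let dest: u32 = (0xFF << 24) | (250 << 16) | (4 << 8) | 90;
        assert_eq!(blend_pixel_scalar(src, dest), src);
    }
}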