retro_pixel/
u32_ext.rs

//! Module for code specific to `u32` base images.
//!
//! The channel order of the pixel value processing functions here is assumed to
//! be `AABBGGRR`, which is what OpenGL uses. Note that other graphical systems
//! might use 32-bit color but with a different channel order (notably, Windows
//! GDI), so for those situations you'd have to shuffle the channels yourself.
//!
//! In the future, I will attempt to make this able to unpack, use, and repack
//! other color channel orderings.

#![cfg(target_endian = "little")]
#![allow(dead_code)]
#![allow(unused_macros)]

use super::*;

// THE PROBLEM
//
// We want to load aligned whenever possible. However, there are complications.
// The first is that the pointers might not be aligned. If they are not aligned
// we _might_ be able to do some work unaligned, and then work aligned for the
// rest of the row. We can only do this if they're both misaligned by the same
// amount (see the small sketch below). Also, the pitch values for each image
// might not be clean multiples of the SIMD width, causing us to go in and out
// of alignment from one row to the next.
//
// The answer? MACROS. Piles of them.

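// A small illustration of the pointer math described above, assuming that the
// `check_misalign16!` style macros from the parent module boil down to
// `ptr as usize % ALIGN`: a `u32` pointer that's 4, 8, or 12 bytes past a
// 16 byte boundary needs 3, 2, or 1 scalar pixels of pre-work before aligned
// 4-wide loads can begin. This helper is only a sketch of that relationship;
// the real blit code below does the same math inline via `match`.
#[inline]
fn pixels_until_16_byte_aligned(ptr: *const u32) -> usize {
  let misalign = ptr as usize % 16;
  debug_assert_eq!(misalign % 4, 0, "u32 pointers are always at least 4 byte aligned");
  ((16 - misalign) % 16) / 4
}
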
/// Converts an RGBA `u32` into a pseudo-linear brightness float array.
///
/// The RGB channels are squared as a cheap approximation, instead of being
/// raised to the usual 2.2 exponent. Alpha is left linear.
#[inline(always)]
pub fn u32_to_linear(pixel: u32) -> [f32; 4] {
  // ENHANCE: tests
  let r = square(((pixel & 0xFF) as f32) / 255.0);
  let g = square(((pixel >> 8 & 0xFF) as f32) / 255.0);
  let b = square(((pixel >> 16 & 0xFF) as f32) / 255.0);
  let a = ((pixel >> 24 & 0xFF) as f32) / 255.0;
  [r, g, b, a]
}

/// The inverse of `u32_to_linear`.
///
/// The RGB channels are converted back into sRGB using `sqrt` instead of the
/// usual 1/2.2 exponent. Each input channel should be in the `[0.0, 1.0]`
/// range; out-of-range values will give you very strange results.
#[inline(always)]
pub fn linear_to_u32(linear: [f32; 4]) -> u32 {
  // ENHANCE: tests
  let r = (sqrt(linear[0]) * 255.0 + 0.5) as u32;
  let g = (sqrt(linear[1]) * 255.0 + 0.5) as u32;
  let b = (sqrt(linear[2]) * 255.0 + 0.5) as u32;
  let a = (linear[3] * 255.0 + 0.5) as u32;
  rgba32!(r, g, b, a)
}
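
// A minimal sketch toward the "ENHANCE: tests" notes above. It only assumes
// that `square(x)` from the parent module is a plain `x * x`; nothing here
// relies on the exact behavior of `sqrt`.
#[cfg(test)]
mod u32_linear_tests {
  use super::*;

  #[test]
  fn u32_to_linear_unpacks_channels_in_abgr_order() {
    // fully transparent black unpacks to all zeroes
    assert_eq!(u32_to_linear(0x0000_0000), [0.0, 0.0, 0.0, 0.0]);
    // opaque pure red: red lives in the low byte, alpha in the high byte
    assert_eq!(u32_to_linear(0xFF00_00FF), [1.0, 0.0, 0.0, 1.0]);
    // opaque pure blue: blue lives in bits 16..24
    assert_eq!(u32_to_linear(0xFFFF_0000), [0.0, 0.0, 1.0, 1.0]);
  }
}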

/// Extra functionality that's only available to images of u32 values.
///
/// Uses `0xAABBGGRR` channel ordering.
pub trait WritableImageU32Ext: WritableImage<u32> {
  /// Performs a rectilinear blending blit at an integral pixel offset.
  ///
  /// Similar to the `WritableIndexmap::blit_generic` method, you can provide
  /// any offsets you like (even negative ones) and the affected area will be
  /// automatically clipped to be in bounds.
  ///
  /// This method will use "avx2" or "sse2" if available. If you compile with
  /// `std` it will select the best available version at runtime (the most
  /// portable binary). If you compile without the `std` feature it will pick
  /// the best version available given the compilation settings, and simply
  /// crash if you run the binary on a less capable machine.
  fn blit_blend_rectilinear<RI>(&mut self, src: &RI, offset: (isize, isize))
  where
    RI: ReadableImage<u32>,
  {
    // /*
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
      // We're on x86 or x86_64, so we'll use explicit SIMD versions as
      // appropriate, because the compiler just isn't smart enough to do this
      // on its own, so we do it by hand.
      #[cfg(feature = "std")]
      {
        if is_x86_feature_detected!("avx2") {
          unsafe { blit_blend_rectilinear_avx2_explicit(self, src, offset) };
        } else if is_x86_feature_detected!("sse2") {
          unsafe { blit_blend_rectilinear_sse2_explicit(self, src, offset) };
        } else {
          // holy cripes, how old is your CPU? SSE2 was added to x86 in 2001!
          unsafe { blit_blend_rectilinear_fully_unrolled_no_intrinsics(self, src, offset) };
        }
      }
      #[cfg(all(not(feature = "std"), target_feature = "avx2"))]
      {
        unsafe { blit_blend_rectilinear_avx2_explicit(self, src, offset) };
      }
      #[cfg(all(not(feature = "std"), not(target_feature = "avx2"), target_feature = "sse2"))]
      {
        unsafe { blit_blend_rectilinear_sse2_explicit(self, src, offset) };
      }
      #[cfg(all(not(feature = "std"), not(target_feature = "avx2"), not(target_feature = "sse2")))]
      {
        unsafe { blit_blend_rectilinear_fully_unrolled_no_intrinsics(self, src, offset) };
      }
    }
    // */
    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    {
      // We're NOT on x86 or x86_64, so we just do it using a fully unrolled
      // loop, which is at least faster than using blit_generic.
      unsafe { blit_blend_rectilinear_fully_unrolled_no_intrinsics(self, src, offset) };
    }
  }
}
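
// Hypothetical usage sketch (the concrete image types live elsewhere in the
// crate, so the names below are placeholders rather than real API):
//
//   let mut dest: SomeU32Image = ...;
//   let src: SomeU32Image = ...;
//   // Blend `src` over `dest` with its top left corner at (10, 5); the
//   // affected region is clipped automatically, so offsets may be negative
//   // or run off the edge.
//   dest.blit_blend_rectilinear(&src, (10, 5));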

/// This is what the `blit_blend_rectilinear` operation looks like when you
/// fully unroll all of the work.
///
/// We use this when we can't use one of the hand-tuned SIMD versions.
unsafe fn blit_blend_rectilinear_fully_unrolled_no_intrinsics<WI, RI>(dest: &mut WI, src: &RI, offset: (isize, isize))
where
  WI: WritableImage<u32> + ?Sized,
  RI: ReadableImage<u32>,
{
  let (clip_width, clip_height, mut src_row_start_ptr, mut dest_row_start_ptr): (usize, usize, *const u32, *mut u32) =
    determine_overlay!(dest, src, offset);
  if clip_width > 0 && clip_height > 0 {
    let src_pitch = src.pitch();
    let dest_pitch = dest.pitch();
    let mut y = 0;
    while y < clip_height {
      let mut x = 0;
      let mut src_row_mid_ptr = src_row_start_ptr;
      let mut dest_row_mid_ptr = dest_row_start_ptr;
      while x < clip_width {
        // If we aren't using sse2 or avx2 it's likely because we're on an ARM
        // processor. NEON might still be available in that case, but we can't
        // hand-code it given the current state of Rust's stdsimd library.
        // Given this, we'll still write the process out as being 4 lanes at a
        // time, and we can hope that LLVM will maybe kinda see what we're
        // doing if the user compiles with `target-cpu=native`.
        let mut src_r: [f32; SSE_LANE_WIDTH] = [0.0; SSE_LANE_WIDTH];
        let mut src_g: [f32; SSE_LANE_WIDTH] = [0.0; SSE_LANE_WIDTH];
        let mut src_b: [f32; SSE_LANE_WIDTH] = [0.0; SSE_LANE_WIDTH];
        let mut src_a: [f32; SSE_LANE_WIDTH] = [0.0; SSE_LANE_WIDTH];
        let mut dest_r: [f32; SSE_LANE_WIDTH] = [0.0; SSE_LANE_WIDTH];
        let mut dest_g: [f32; SSE_LANE_WIDTH] = [0.0; SSE_LANE_WIDTH];
        let mut dest_b: [f32; SSE_LANE_WIDTH] = [0.0; SSE_LANE_WIDTH];
        let mut dest_a: [f32; SSE_LANE_WIDTH] = [0.0; SSE_LANE_WIDTH];
        let mut out_r: [f32; SSE_LANE_WIDTH] = [0.0; SSE_LANE_WIDTH];
        let mut out_g: [f32; SSE_LANE_WIDTH] = [0.0; SSE_LANE_WIDTH];
        let mut out_b: [f32; SSE_LANE_WIDTH] = [0.0; SSE_LANE_WIDTH];
        let mut out_a: [f32; SSE_LANE_WIDTH] = [0.0; SSE_LANE_WIDTH];
        // load (packed sRGB u32 -> linear f32 channels)
        for lane in 0..SSE_LANE_WIDTH {
          const INV255: f32 = 1.0 / 255.0;
          let lane_i = lane as isize;
          if x + lane < clip_width {
            let src_pixel = src_row_mid_ptr.offset(lane_i);
            src_r[lane] = square((*src_pixel & 0xFF) as f32 * INV255);
            src_g[lane] = square((*src_pixel >> 8 & 0xFF) as f32 * INV255);
            src_b[lane] = square((*src_pixel >> 16 & 0xFF) as f32 * INV255);
            src_a[lane] = (*src_pixel >> 24 & 0xFF) as f32 * INV255;
            //
            let dest_pixel = dest_row_mid_ptr.offset(lane_i);
            dest_r[lane] = square((*dest_pixel & 0xFF) as f32 * INV255);
            dest_g[lane] = square((*dest_pixel >> 8 & 0xFF) as f32 * INV255);
            dest_b[lane] = square((*dest_pixel >> 16 & 0xFF) as f32 * INV255);
            dest_a[lane] = (*dest_pixel >> 24 & 0xFF) as f32 * INV255;
          } else {
            break;
          }
        }
        // work (lerp toward src by src alpha)
        for lane in 0..SSE_LANE_WIDTH {
          let toward = src_a[lane];
          let one_minus_toward = 1.0 - toward;
          out_r[lane] = one_minus_toward * dest_r[lane] + toward * src_r[lane];
          out_g[lane] = one_minus_toward * dest_g[lane] + toward * src_g[lane];
          out_b[lane] = one_minus_toward * dest_b[lane] + toward * src_b[lane];
          out_a[lane] = one_minus_toward * dest_a[lane] + toward * src_a[lane];
        }
        // store results (linear f32 channels -> packed sRGB u32)
        for lane in 0..SSE_LANE_WIDTH {
          let lane_i = lane as isize;
          if x + lane < clip_width {
            let out32 = (((out_a[lane] * 255.0 + 0.5) as u32) << 24) | (((sqrt(out_b[lane]) * 255.0 + 0.5) as u32) << 16)
              | (((sqrt(out_g[lane]) * 255.0 + 0.5) as u32) << 8) | ((sqrt(out_r[lane]) * 255.0 + 0.5) as u32);
            *dest_row_mid_ptr.offset(lane_i) = out32;
          } else {
            break;
          }
        }
        x += SSE_LANE_WIDTH;
        src_row_mid_ptr = src_row_mid_ptr.offset(SSE_LANE_WIDTH_I);
        dest_row_mid_ptr = dest_row_mid_ptr.offset(SSE_LANE_WIDTH_I);
      }
      y += 1;
      src_row_start_ptr = src_row_start_ptr.offset(src_pitch);
      dest_row_start_ptr = dest_row_start_ptr.offset(dest_pitch);
    }
  }
}
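
// Reference sketch (not used by any of the fast paths in this file): the same
// "lerp toward src by src alpha" blend that the unrolled loop above and the
// SIMD macros below perform, written one pixel at a time with the scalar
// helpers from the top of this module. Handy for spot-checking the SIMD
// output against a simple implementation.
#[inline]
fn blend_pixel_reference(src: u32, dest: u32) -> u32 {
  let s = u32_to_linear(src);
  let d = u32_to_linear(dest);
  let toward = s[3]; // source alpha drives the blend
  let one_minus_toward = 1.0 - toward;
  linear_to_u32([
    one_minus_toward * d[0] + toward * s[0],
    one_minus_toward * d[1] + toward * s[1],
    one_minus_toward * d[2] + toward * s[2],
    one_minus_toward * d[3] + toward * s[3],
  ])
}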

//
//
// SSE2
//
//

// TODO: doc-tests, docs
// Gives lane `$i` of an `__m128` variable as an `f32` place expression.
#[allow(unused_macros)]
macro_rules! m128_f32 {
  ($a:ident, $i:expr) => {
    *(((&mut $a) as *mut __m128) as *mut f32).offset($i as isize)
  };
}

// TODO: doc-tests, docs
// Gives lane `$i` of an `__m128i` variable as an `i32` place expression.
#[allow(unused_macros)]
macro_rules! m128i_i32 {
  ($a:ident, $i:expr) => {
    *(((&mut $a) as *mut __m128i) as *mut i32).offset($i as isize)
  };
}

// TODO: doc-tests, docs
// Multiplies each `f32` lane of the register by itself.
#[allow(unused_macros)]
macro_rules! mm_square {
  ($reg:expr) => {
    _mm_mul_ps($reg, $reg)
  };
}

// TODO: doc-tests, docs
// Debug-prints a 128-bit register, either as four hex `u32` lanes (`,i`) or
// as four `f32` lanes (`,f`).
#[allow(unused_macros)]
macro_rules! print_128 {
  ($reg:ident,i) => {{
    let arr = (&$reg as *const __m128i as *const [u32; 4]).as_ref().unwrap();
    println!("{}: i[{:08X},{:08X},{:08X},{:08X}]", stringify!($reg), arr[0], arr[1], arr[2], arr[3]);
  }};
  ($reg:ident,f) => {{
    let arr = (&$reg as *const __m128 as *const [f32; 4]).as_ref().unwrap();
    println!("{}: f[{:7.1},{:7.1},{:7.1},{:7.1}]", stringify!($reg), arr[0], arr[1], arr[2], arr[3]);
  }};
}

// TODO: doc-tests, docs
// Given four packed `0xAABBGGRR` src pixels and four dest pixels, unpacks the
// channels, blends in linear space (lerp toward src by src alpha), and repacks
// the four result pixels.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
macro_rules! sse2_do_pixel_work {
  ($src_pixel_x4:ident, $dest_pixel_x4:ident) => {{
    let twofivefive_4x = _mm_set1_ps(255.0);
    let inverse255_4x = _mm_set1_ps(1.0 / 255.0);
    let one_4x = _mm_set1_ps(1.0);
    let ff_4x = _mm_set1_epi32(0xFF);

    // unpack into channels
    let src_r = _mm_cvtepi32_ps(_mm_and_si128($src_pixel_x4, ff_4x));
    let src_g = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32($src_pixel_x4, 8), ff_4x));
    let src_b = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32($src_pixel_x4, 16), ff_4x));
    let src_a = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32($src_pixel_x4, 24), ff_4x));

    let dest_r = _mm_cvtepi32_ps(_mm_and_si128($dest_pixel_x4, ff_4x));
    let dest_g = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32($dest_pixel_x4, 8), ff_4x));
    let dest_b = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32($dest_pixel_x4, 16), ff_4x));
    let dest_a = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32($dest_pixel_x4, 24), ff_4x));

    // do work in SIMD
    let src_linear_r = mm_square!(_mm_mul_ps(src_r, inverse255_4x));
    let src_linear_g = mm_square!(_mm_mul_ps(src_g, inverse255_4x));
    let src_linear_b = mm_square!(_mm_mul_ps(src_b, inverse255_4x));
    let src_linear_a = _mm_mul_ps(src_a, inverse255_4x); // do not square alpha

    let dest_linear_r = mm_square!(_mm_mul_ps(dest_r, inverse255_4x));
    let dest_linear_g = mm_square!(_mm_mul_ps(dest_g, inverse255_4x));
    let dest_linear_b = mm_square!(_mm_mul_ps(dest_b, inverse255_4x));
    let dest_linear_a = _mm_mul_ps(dest_a, inverse255_4x); // do not square alpha

    let toward = src_linear_a;
    let one_minus_toward = _mm_sub_ps(one_4x, toward);

    let out_r = _mm_add_ps(_mm_mul_ps(one_minus_toward, dest_linear_r), _mm_mul_ps(toward, src_linear_r));
    let out_g = _mm_add_ps(_mm_mul_ps(one_minus_toward, dest_linear_g), _mm_mul_ps(toward, src_linear_g));
    let out_b = _mm_add_ps(_mm_mul_ps(one_minus_toward, dest_linear_b), _mm_mul_ps(toward, src_linear_b));
    let out_a = _mm_add_ps(_mm_mul_ps(one_minus_toward, dest_linear_a), _mm_mul_ps(toward, src_linear_a));

    // remove linear status
    let out_r_i32 = _mm_cvtps_epi32(_mm_mul_ps(_mm_sqrt_ps(out_r), twofivefive_4x));
    let out_g_i32 = _mm_cvtps_epi32(_mm_mul_ps(_mm_sqrt_ps(out_g), twofivefive_4x));
    let out_b_i32 = _mm_cvtps_epi32(_mm_mul_ps(_mm_sqrt_ps(out_b), twofivefive_4x));
    let out_a_i32 = _mm_cvtps_epi32(_mm_mul_ps(out_a, twofivefive_4x));

    // pack up the results
    let out_xxgr_i32 = _mm_or_si128(_mm_slli_epi32(out_g_i32, 8), out_r_i32);
    let out_abxx_i32 = _mm_or_si128(_mm_slli_epi32(out_a_i32, 24), _mm_slli_epi32(out_b_i32, 16));
    _mm_or_si128(out_abxx_i32, out_xxgr_i32)
  }};
}

// TODO: doc-tests, docs
// Processes the rest of a row 4 pixels at a time using aligned loads/stores,
// with a narrower final pass when the remaining width isn't a multiple of 4.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
macro_rules! sse2_finish_off_row_aligned {
  ($x:ident, $clip_width:ident, $src_row_mid_ptr:expr, $dest_row_mid_ptr:expr) => {{
    debug_assert_eq!(
      check_misalign16!($src_row_mid_ptr),
      0,
      "sse2_finish_off_row_aligned, the src_row_mid_ptr isn't aligned: {} / {}",
      check_misalign16!($src_row_mid_ptr),
      $src_row_mid_ptr as usize
    );
    debug_assert_eq!(
      check_misalign16!($dest_row_mid_ptr),
      0,
      "sse2_finish_off_row_aligned, the dest_row_mid_ptr isn't aligned: {} / {}",
      check_misalign16!($dest_row_mid_ptr),
      $dest_row_mid_ptr as usize
    );
    if check_misalign4!($clip_width as isize - $x as isize) == 0 {
      // we'll _always_ be working 4 lanes at a time for the rest of this
      // row, maximum speed!
      while $x < $clip_width {
        let src_pixel_x4 = _mm_load_si128($src_row_mid_ptr as *const __m128i);
        let dest_pixel_x4 = _mm_load_si128($dest_row_mid_ptr as *const __m128i);
        let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
        _mm_store_si128($dest_row_mid_ptr as *mut __m128i, out_packed_x4);
        $x += SSE_LANE_WIDTH;
        $src_row_mid_ptr = $src_row_mid_ptr.offset(SSE_LANE_WIDTH_I);
        $dest_row_mid_ptr = $dest_row_mid_ptr.offset(SSE_LANE_WIDTH_I);
      }
    } else {
      // we'll have to do fewer than 4 lanes for the final pass of this
      // row. Still, we can at least do aligned loads for all the
      // "complete" passes.
      while $x < $clip_width {
        match $clip_width as isize - $x as isize {
          1 => {
            let src_pixel_x4 = _mm_setr_epi32(*$src_row_mid_ptr, 0, 0, 0);
            let dest_pixel_x4 = _mm_setr_epi32(*$dest_row_mid_ptr, 0, 0, 0);
            let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
            *$dest_row_mid_ptr = *(&out_packed_x4 as *const __m128i as *const i32);
          }
          2 => {
            let src_pixel_x4 = _mm_setr_epi32(*$src_row_mid_ptr, *$src_row_mid_ptr.offset(1), 0, 0);
            let dest_pixel_x4 = _mm_setr_epi32(*$dest_row_mid_ptr, *$dest_row_mid_ptr.offset(1), 0, 0);
            let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
            *$dest_row_mid_ptr = *(&out_packed_x4 as *const __m128i as *const i32);
            *$dest_row_mid_ptr.offset(1) = *(&out_packed_x4 as *const __m128i as *const i32).offset(1);
          }
          3 => {
            let src_pixel_x4 = _mm_setr_epi32(*$src_row_mid_ptr, *$src_row_mid_ptr.offset(1), *$src_row_mid_ptr.offset(2), 0);
            let dest_pixel_x4 = _mm_setr_epi32(*$dest_row_mid_ptr, *$dest_row_mid_ptr.offset(1), *$dest_row_mid_ptr.offset(2), 0);
            let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
            *$dest_row_mid_ptr = *(&out_packed_x4 as *const __m128i as *const i32);
            *$dest_row_mid_ptr.offset(1) = *(&out_packed_x4 as *const __m128i as *const i32).offset(1);
            *$dest_row_mid_ptr.offset(2) = *(&out_packed_x4 as *const __m128i as *const i32).offset(2);
          }
          other => {
            debug_assert!(other >= 4);
            let src_pixel_x4 = _mm_load_si128($src_row_mid_ptr as *const __m128i);
            let dest_pixel_x4 = _mm_load_si128($dest_row_mid_ptr as *const __m128i);
            let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
            _mm_store_si128($dest_row_mid_ptr as *mut __m128i, out_packed_x4);
          }
        }
        $x += SSE_LANE_WIDTH;
        $src_row_mid_ptr = $src_row_mid_ptr.offset(SSE_LANE_WIDTH_I);
        $dest_row_mid_ptr = $dest_row_mid_ptr.offset(SSE_LANE_WIDTH_I);
      }
    }
  }};
}

// TODO: doc-tests, docs
// Same as `sse2_finish_off_row_aligned!`, but with unaligned loads/stores for
// when the two pointers can't both be brought to 16 byte alignment.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
macro_rules! sse2_finish_off_row_un_aligned {
  ($x:ident, $clip_width:ident, $src_row_mid_ptr:expr, $dest_row_mid_ptr:expr) => {{
    if check_misalign4!($clip_width as isize - $x as isize) == 0 {
      // we'll _always_ be working 4 lanes at a time for the rest of this
      // row, maximum speed!
      while $x < $clip_width {
        let src_pixel_x4 = _mm_loadu_si128($src_row_mid_ptr as *const __m128i);
        let dest_pixel_x4 = _mm_loadu_si128($dest_row_mid_ptr as *const __m128i);
        let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
        _mm_storeu_si128($dest_row_mid_ptr as *mut __m128i, out_packed_x4);
        $x += SSE_LANE_WIDTH;
        $src_row_mid_ptr = $src_row_mid_ptr.offset(SSE_LANE_WIDTH_I);
        $dest_row_mid_ptr = $dest_row_mid_ptr.offset(SSE_LANE_WIDTH_I);
      }
    } else {
      // we'll have to do fewer than 4 lanes for the final pass of this
      // row. Still, we can at least do full-width (unaligned) loads for all
      // the "complete" passes.
      while $x < $clip_width {
        match $clip_width as isize - $x as isize {
          1 => {
            let src_pixel_x4 = _mm_setr_epi32(*$src_row_mid_ptr, 0, 0, 0);
            let dest_pixel_x4 = _mm_setr_epi32(*$dest_row_mid_ptr, 0, 0, 0);
            let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
            *$dest_row_mid_ptr = *(&out_packed_x4 as *const __m128i as *const i32);
          }
          2 => {
            let src_pixel_x4 = _mm_setr_epi32(*$src_row_mid_ptr, *$src_row_mid_ptr.offset(1), 0, 0);
            let dest_pixel_x4 = _mm_setr_epi32(*$dest_row_mid_ptr, *$dest_row_mid_ptr.offset(1), 0, 0);
            let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
            *$dest_row_mid_ptr = *(&out_packed_x4 as *const __m128i as *const i32);
            *$dest_row_mid_ptr.offset(1) = *(&out_packed_x4 as *const __m128i as *const i32).offset(1);
          }
          3 => {
            let src_pixel_x4 = _mm_setr_epi32(*$src_row_mid_ptr, *$src_row_mid_ptr.offset(1), *$src_row_mid_ptr.offset(2), 0);
            let dest_pixel_x4 = _mm_setr_epi32(*$dest_row_mid_ptr, *$dest_row_mid_ptr.offset(1), *$dest_row_mid_ptr.offset(2), 0);
            let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
            *$dest_row_mid_ptr = *(&out_packed_x4 as *const __m128i as *const i32);
            *$dest_row_mid_ptr.offset(1) = *(&out_packed_x4 as *const __m128i as *const i32).offset(1);
            *$dest_row_mid_ptr.offset(2) = *(&out_packed_x4 as *const __m128i as *const i32).offset(2);
          }
          other => {
            debug_assert!(other >= 4);
            let src_pixel_x4 = _mm_loadu_si128($src_row_mid_ptr as *const __m128i);
            let dest_pixel_x4 = _mm_loadu_si128($dest_row_mid_ptr as *const __m128i);
            let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
            _mm_storeu_si128($dest_row_mid_ptr as *mut __m128i, out_packed_x4);
          }
        }
        $x += SSE_LANE_WIDTH;
        $src_row_mid_ptr = $src_row_mid_ptr.offset(SSE_LANE_WIDTH_I);
        $dest_row_mid_ptr = $dest_row_mid_ptr.offset(SSE_LANE_WIDTH_I);
      }
    }
  }};
}

/// The SSE2 enabled version of the `blit_blend_rectilinear` operation. The
/// helpers expanded into this function don't have "sse2" enabled themselves,
/// but this function does, so once they're inlined here they get compiled in
/// an "sse2" form. Sounds silly, but that's the suggested approach in the std
/// simd docs.
#[target_feature(enable = "sse2")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn blit_blend_rectilinear_sse2_explicit<WI, RI>(dest: &mut WI, src: &RI, offset: (isize, isize))
where
  WI: WritableImage<u32> + ?Sized,
  RI: ReadableImage<u32>,
{
  let (clip_width, clip_height, mut src_row_start_ptr, mut dest_row_start_ptr): (usize, usize, *const u32, *mut u32) =
    determine_overlay!(dest, src, offset);

  if clip_width > 0 && clip_height > 0 {
    let src_pitch = src.pitch();
    let dest_pitch = dest.pitch();
    let mut y = 0;
    while y < clip_height {
      let mut x = 0;
      let mut src_row_mid_ptr = src_row_start_ptr as *const i32;
      let mut dest_row_mid_ptr = dest_row_start_ptr as *mut i32;
      let src_misalign = check_misalign16!(src_row_mid_ptr);
      let dest_misalign = check_misalign16!(dest_row_mid_ptr);
      if src_misalign > 0 || dest_misalign > 0 {
        // we're somehow off alignment.
        if src_misalign == dest_misalign {
          // both are mis-aligned, but in phase with each other, so we start by
          // stepping forward 1-3 pixels so that we're at an aligned point.
          match src_misalign {
            12 => {
              // jump forward 1 pixel
              let src_pixel_x4 = _mm_setr_epi32(*src_row_mid_ptr, 0, 0, 0);
              let dest_pixel_x4 = _mm_setr_epi32(*dest_row_mid_ptr, 0, 0, 0);
              let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
              *dest_row_mid_ptr = *(&out_packed_x4 as *const __m128i as *const i32);
              src_row_mid_ptr = src_row_mid_ptr.offset(1);
              dest_row_mid_ptr = dest_row_mid_ptr.offset(1);
              x += 1;
            }
            8 => {
              // jump forward 2 pixels
              let src_pixel_x4 = _mm_setr_epi32(*src_row_mid_ptr, *src_row_mid_ptr.offset(1), 0, 0);
              let dest_pixel_x4 = _mm_setr_epi32(*dest_row_mid_ptr, *dest_row_mid_ptr.offset(1), 0, 0);
              let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
              *dest_row_mid_ptr = *(&out_packed_x4 as *const __m128i as *const i32);
              *dest_row_mid_ptr.offset(1) = *(&out_packed_x4 as *const __m128i as *const i32).offset(1);
              src_row_mid_ptr = src_row_mid_ptr.offset(2);
              dest_row_mid_ptr = dest_row_mid_ptr.offset(2);
              x += 2;
            }
            4 => {
              // jump forward 3 pixels
              let src_pixel_x4 = _mm_setr_epi32(*src_row_mid_ptr, *src_row_mid_ptr.offset(1), *src_row_mid_ptr.offset(2), 0);
              let dest_pixel_x4 = _mm_setr_epi32(*dest_row_mid_ptr, *dest_row_mid_ptr.offset(1), *dest_row_mid_ptr.offset(2), 0);
              let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
              *dest_row_mid_ptr = *(&out_packed_x4 as *const __m128i as *const i32);
              *dest_row_mid_ptr.offset(1) = *(&out_packed_x4 as *const __m128i as *const i32).offset(1);
              *dest_row_mid_ptr.offset(2) = *(&out_packed_x4 as *const __m128i as *const i32).offset(2);
              src_row_mid_ptr = src_row_mid_ptr.offset(3);
              dest_row_mid_ptr = dest_row_mid_ptr.offset(3);
              x += 3;
            }
            other => panic!("invalid src_misalign value: {}", other),
          };
          // Now we finish off the rest of the row fully aligned.
          sse2_finish_off_row_aligned!(x, clip_width, src_row_mid_ptr, dest_row_mid_ptr);
        } else {
          // at least one of them is mis-aligned, and they're out of phase with
          // each other, so we have no hope of getting them both aligned during
          // this row.
          sse2_finish_off_row_un_aligned!(x, clip_width, src_row_mid_ptr, dest_row_mid_ptr);
        }
      } else {
        // both pointers are totally aligned without doing a startup set.
        sse2_finish_off_row_aligned!(x, clip_width, src_row_mid_ptr, dest_row_mid_ptr);
      }
      y += 1;
      src_row_start_ptr = src_row_start_ptr.offset(src_pitch);
      dest_row_start_ptr = dest_row_start_ptr.offset(dest_pitch);
    }
  }
}


//
//
// AVX2
//
//

// TODO: doc-tests, docs
// Gives lane `$i` of an `__m256` variable as an `f32` place expression.
macro_rules! m256_f32 {
  ($a:ident, $i:expr) => {
    *(((&mut $a) as *mut __m256) as *mut f32).offset($i as isize)
  };
}

// TODO: doc-tests, docs
// Gives lane `$i` of an `__m256i` variable as an `i32` place expression.
macro_rules! m256i_i32 {
  ($a:ident, $i:expr) => {
    *(((&mut $a) as *mut __m256i) as *mut i32).offset($i as isize)
  };
}

// TODO: doc-tests, docs
// Multiplies each `f32` lane of the register by itself.
macro_rules! mm256_square {
  ($reg:expr) => {
    _mm256_mul_ps($reg, $reg)
  };
}

// TODO: doc-tests, docs
// Given eight packed `0xAABBGGRR` src pixels and eight dest pixels, unpacks
// the channels, blends in linear space (lerp toward src by src alpha), and
// repacks the eight result pixels.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
macro_rules! avx2_do_pixel_work {
  ($src_pixel_x8:ident, $dest_pixel_x8:ident) => {{
    // Convert from sRGB packed u32 to linear 0.0 to 1.0
    let twofivefive_8x = _mm256_set1_ps(255.0);
    let inverse255_8x = _mm256_set1_ps(1.0 / 255.0);
    let one_8x = _mm256_set1_ps(1.0);
    let ff_8x = _mm256_set1_epi32(0xFF);

    // unpack into channels
    let src_r = _mm256_cvtepi32_ps(_mm256_and_si256($src_pixel_x8, ff_8x));
    let src_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32($src_pixel_x8, 8), ff_8x));
    let src_b = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32($src_pixel_x8, 16), ff_8x));
    let src_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32($src_pixel_x8, 24), ff_8x));

    let dest_r = _mm256_cvtepi32_ps(_mm256_and_si256($dest_pixel_x8, ff_8x));
    let dest_g = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32($dest_pixel_x8, 8), ff_8x));
    let dest_b = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32($dest_pixel_x8, 16), ff_8x));
    let dest_a = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32($dest_pixel_x8, 24), ff_8x));

    // do work in SIMD
    let src_linear_r = mm256_square!(_mm256_mul_ps(src_r, inverse255_8x));
    let src_linear_g = mm256_square!(_mm256_mul_ps(src_g, inverse255_8x));
    let src_linear_b = mm256_square!(_mm256_mul_ps(src_b, inverse255_8x));
    let src_linear_a = _mm256_mul_ps(src_a, inverse255_8x); // do not square alpha

    let dest_linear_r = mm256_square!(_mm256_mul_ps(dest_r, inverse255_8x));
    let dest_linear_g = mm256_square!(_mm256_mul_ps(dest_g, inverse255_8x));
    let dest_linear_b = mm256_square!(_mm256_mul_ps(dest_b, inverse255_8x));
    let dest_linear_a = _mm256_mul_ps(dest_a, inverse255_8x); // do not square alpha

    let toward = src_linear_a;
    let one_minus_toward = _mm256_sub_ps(one_8x, toward);

    let out_r = _mm256_add_ps(_mm256_mul_ps(one_minus_toward, dest_linear_r), _mm256_mul_ps(toward, src_linear_r));
    let out_g = _mm256_add_ps(_mm256_mul_ps(one_minus_toward, dest_linear_g), _mm256_mul_ps(toward, src_linear_g));
    let out_b = _mm256_add_ps(_mm256_mul_ps(one_minus_toward, dest_linear_b), _mm256_mul_ps(toward, src_linear_b));
    let out_a = _mm256_add_ps(_mm256_mul_ps(one_minus_toward, dest_linear_a), _mm256_mul_ps(toward, src_linear_a));

    // remove linear status
    let out_r_i32 = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_sqrt_ps(out_r), twofivefive_8x));
    let out_g_i32 = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_sqrt_ps(out_g), twofivefive_8x));
    let out_b_i32 = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_sqrt_ps(out_b), twofivefive_8x));
    let out_a_i32 = _mm256_cvtps_epi32(_mm256_mul_ps(out_a, twofivefive_8x));

    // pack up the results
    let out_xxgr_i32 = _mm256_or_si256(_mm256_slli_epi32(out_g_i32, 8), out_r_i32);
    let out_abxx_i32 = _mm256_or_si256(_mm256_slli_epi32(out_a_i32, 24), _mm256_slli_epi32(out_b_i32, 16));
    _mm256_or_si256(out_abxx_i32, out_xxgr_i32)
  }};
}

// TODO: doc-tests, docs
// Processes the rest of a row 8 pixels at a time using aligned loads/stores,
// with a masked final pass when the remaining width isn't a multiple of 8.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
macro_rules! avx2_finish_off_row_aligned {
  ($x:ident, $clip_width:ident, $src_row_mid_ptr:expr, $dest_row_mid_ptr:expr) => {{
    debug_assert_eq!(
      check_misalign32!($src_row_mid_ptr),
      0,
      "avx2_finish_off_row_aligned, the src_row_mid_ptr isn't aligned: {} / {}",
      check_misalign32!($src_row_mid_ptr),
      $src_row_mid_ptr as usize
    );
    debug_assert_eq!(
      check_misalign32!($dest_row_mid_ptr),
      0,
      "avx2_finish_off_row_aligned, the dest_row_mid_ptr isn't aligned: {} / {}",
      check_misalign32!($dest_row_mid_ptr),
      $dest_row_mid_ptr as usize
    );
    if check_misalign8!($clip_width as isize - $x as isize) == 0 {
      // we'll _always_ be working 8 lanes at a time for the rest of this
      // row, maximum speed!
      while $x < $clip_width {
        let src_pixel_x8 = _mm256_load_si256($src_row_mid_ptr as *const __m256i);
        let dest_pixel_x8 = _mm256_load_si256($dest_row_mid_ptr as *const __m256i);
        let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
        _mm256_store_si256($dest_row_mid_ptr as *mut __m256i, out_packed_x8);
        $x += AVX_LANE_WIDTH;
        $src_row_mid_ptr = $src_row_mid_ptr.offset(AVX_LANE_WIDTH_I);
        $dest_row_mid_ptr = $dest_row_mid_ptr.offset(AVX_LANE_WIDTH_I);
      }
    } else {
      // we'll have to do fewer than 8 lanes for the final pass of this
      // row. Still, we can at least do aligned loads for all the
      // "complete" passes.
      while $x < $clip_width {
        match $clip_width as isize - $x as isize {
          1 => {
            let read_write_mask = _mm256_setr_epi32(-1, 0, 0, 0, 0, 0, 0, 0);
            let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
            let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
            let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
            _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
          }
          2 => {
            let read_write_mask = _mm256_setr_epi32(-1, -1, 0, 0, 0, 0, 0, 0);
            let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
            let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
            let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
            _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
          }
          3 => {
            let read_write_mask = _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0);
            let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
            let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
            let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
            _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
          }
          4 => {
            let read_write_mask = _mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0);
            let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
            let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
            let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
            _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
          }
          5 => {
            let read_write_mask = _mm256_setr_epi32(-1, -1, -1, -1, -1, 0, 0, 0);
            let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
            let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
            let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
            _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
          }
          6 => {
            let read_write_mask = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, 0, 0);
            let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
            let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
            let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
            _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
          }
          7 => {
            let read_write_mask = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, -1, 0);
            let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
            let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
            let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
            _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
          }
          other => {
            debug_assert!(other >= 8);
            let src_pixel_x8 = _mm256_load_si256($src_row_mid_ptr as *const __m256i);
            let dest_pixel_x8 = _mm256_load_si256($dest_row_mid_ptr as *const __m256i);
            let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
            _mm256_store_si256($dest_row_mid_ptr as *mut __m256i, out_packed_x8);
          }
        }
        $x += AVX_LANE_WIDTH;
        $src_row_mid_ptr = $src_row_mid_ptr.offset(AVX_LANE_WIDTH_I);
        $dest_row_mid_ptr = $dest_row_mid_ptr.offset(AVX_LANE_WIDTH_I);
      }
    }
  }};
}

// TODO: doc-tests, docs
// Same as `avx2_finish_off_row_aligned!`, but with unaligned loads/stores for
// when the two pointers can't both be brought to 32 byte alignment.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
macro_rules! avx2_finish_off_row_un_aligned {
  ($x:ident, $clip_width:ident, $src_row_mid_ptr:expr, $dest_row_mid_ptr:expr) => {{
    if check_misalign8!($clip_width as isize - $x as isize) == 0 {
      // we'll _always_ be working 8 lanes at a time for the rest of this
      // row, maximum speed!
      while $x < $clip_width {
        let src_pixel_x8 = _mm256_loadu_si256($src_row_mid_ptr as *const __m256i);
        let dest_pixel_x8 = _mm256_loadu_si256($dest_row_mid_ptr as *const __m256i);
        let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
        _mm256_storeu_si256($dest_row_mid_ptr as *mut __m256i, out_packed_x8);
        $x += AVX_LANE_WIDTH;
        $src_row_mid_ptr = $src_row_mid_ptr.offset(AVX_LANE_WIDTH_I);
        $dest_row_mid_ptr = $dest_row_mid_ptr.offset(AVX_LANE_WIDTH_I);
      }
    } else {
      // we'll have to do fewer than 8 lanes for the final pass of this
      // row. Still, we can at least do full-width (unaligned) loads for all
      // the "complete" passes.
      while $x < $clip_width {
        match $clip_width as isize - $x as isize {
          1 => {
            let read_write_mask = _mm256_setr_epi32(-1, 0, 0, 0, 0, 0, 0, 0);
            let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
            let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
            let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
            _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
          }
          2 => {
            let read_write_mask = _mm256_setr_epi32(-1, -1, 0, 0, 0, 0, 0, 0);
            let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
            let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
            let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
            _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
          }
          3 => {
            let read_write_mask = _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0);
            let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
            let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
            let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
            _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
          }
          4 => {
            let read_write_mask = _mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0);
            let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
            let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
            let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
            _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
          }
          5 => {
            let read_write_mask = _mm256_setr_epi32(-1, -1, -1, -1, -1, 0, 0, 0);
            let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
            let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
            let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
            _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
          }
          6 => {
            let read_write_mask = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, 0, 0);
            let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
            let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
            let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
            _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
          }
          7 => {
            let read_write_mask = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, -1, 0);
            let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
            let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
            let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
            _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
          }
          other => {
            debug_assert!(other >= 8);
            let src_pixel_x8 = _mm256_loadu_si256($src_row_mid_ptr as *const __m256i);
            let dest_pixel_x8 = _mm256_loadu_si256($dest_row_mid_ptr as *const __m256i);
            let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
            _mm256_storeu_si256($dest_row_mid_ptr as *mut __m256i, out_packed_x8);
          }
        }
        $x += AVX_LANE_WIDTH;
        $src_row_mid_ptr = $src_row_mid_ptr.offset(AVX_LANE_WIDTH_I);
        $dest_row_mid_ptr = $dest_row_mid_ptr.offset(AVX_LANE_WIDTH_I);
      }
    }
  }};
}

// TODO: doc-tests, docs
/// The AVX2 enabled version of the `blit_blend_rectilinear` operation. See the
/// notes on the SSE2 version for why the `#[target_feature]` wrapper trick is
/// used.
#[target_feature(enable = "avx2")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn blit_blend_rectilinear_avx2_explicit<WI, RI>(dest: &mut WI, src: &RI, offset: (isize, isize))
where
  WI: WritableImage<u32> + ?Sized,
  RI: ReadableImage<u32>,
{
  let (clip_width, clip_height, mut src_row_start_ptr, mut dest_row_start_ptr): (usize, usize, *const u32, *mut u32) =
    determine_overlay!(dest, src, offset);

  if clip_width > 0 && clip_height > 0 {
    let src_pitch = src.pitch();
    let dest_pitch = dest.pitch();
    let mut y = 0;
    while y < clip_height {
      let mut x = 0;
      let mut src_row_mid_ptr = src_row_start_ptr as *const i32;
      let mut dest_row_mid_ptr = dest_row_start_ptr as *mut i32;
      let src_misalign = check_misalign32!(src_row_mid_ptr);
      let dest_misalign = check_misalign32!(dest_row_mid_ptr);
      if src_misalign > 0 || dest_misalign > 0 {
        // we're somehow off alignment.
        if src_misalign == dest_misalign {
          // both are mis-aligned, but in phase with each other. We'll process a
          // few pixels so that we can have both be aligned.
          let (read_write_mask, pixel_jump) = match src_misalign {
            28 => (_mm256_setr_epi32(-1, 0, 0, 0, 0, 0, 0, 0), 1),
            24 => (_mm256_setr_epi32(-1, -1, 0, 0, 0, 0, 0, 0), 2),
            20 => (_mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0), 3),
            16 => (_mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0), 4),
            12 => (_mm256_setr_epi32(-1, -1, -1, -1, -1, 0, 0, 0), 5),
            8 => (_mm256_setr_epi32(-1, -1, -1, -1, -1, -1, 0, 0), 6),
            4 => (_mm256_setr_epi32(-1, -1, -1, -1, -1, -1, -1, 0), 7),
            other => panic!("invalid src_misalign value: {}", other),
          };
          let src_pixel_x8 = _mm256_maskload_epi32(src_row_mid_ptr, read_write_mask);
          let dest_pixel_x8 = _mm256_maskload_epi32(dest_row_mid_ptr, read_write_mask);
          let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
          _mm256_maskstore_epi32(dest_row_mid_ptr, read_write_mask, out_packed_x8);
          src_row_mid_ptr = src_row_mid_ptr.offset(pixel_jump);
          dest_row_mid_ptr = dest_row_mid_ptr.offset(pixel_jump);
          x += pixel_jump as usize;
          // Now we finish off the rest of the row fully aligned.
          avx2_finish_off_row_aligned!(x, clip_width, src_row_mid_ptr, dest_row_mid_ptr);
        } else {
          // at least one of them is mis-aligned, and they're out of phase with
          // each other, so we have no hope of getting them both aligned during
          // this row.
          avx2_finish_off_row_un_aligned!(x, clip_width, src_row_mid_ptr, dest_row_mid_ptr);
        }
      } else {
        // both pointers are totally aligned without doing a startup set.
        avx2_finish_off_row_aligned!(x, clip_width, src_row_mid_ptr, dest_row_mid_ptr);
      }
      y += 1;
      src_row_start_ptr = src_row_start_ptr.offset(src_pitch);
      dest_row_start_ptr = dest_row_start_ptr.offset(dest_pitch);
    }
  }
}