//! Module for code specific to `u32` base images.
//!
//! The channel order of the pixel value processing functions here is assumed to
//! be `AABBGGRR`, which is what OpenGL uses. Note that other graphical systems
//! might use 32 bit color but with a different channel order (notably, Windows
//! GDI), so for those situations you'd have to shuffle the channels yourself
//! or something.
//!
//! In the future, I will attempt to make this able to unpack, use, and repack
//! other color channel orderings.

#![cfg(target_endian = "little")]
#![allow(dead_code)]
#![allow(unused_macros)]

use super::*;

// THE PROBLEM
//
// We want to be able to load aligned whenever possible. However, there are
// complications. The first is that the pointers might not be aligned. If they
// are not aligned we _might_ be able to do some work unaligned, and then after
// that work aligned for the rest of the row. We can only do this if both
// pointers are misaligned by the same amount. Also, the pitch values for each
// image might not be clean multiples of the alignment, causing us to go in and
// out of alignment with each row.
//
// The answer? MACROS. Piles of them.

/// Unpacks an `0xAABBGGRR` pixel into pseudo-linear `[r, g, b, a]` floats.
///
/// Every channel is scaled from `0..=255` down to `0.0..=1.0`. The three color
/// channels are then squared, which stands in for the normal 2.2 exponent; the
/// alpha channel is left as it is.
#[inline(always)]
pub fn u32_to_linear(pixel: u32) -> [f32; 4] {
  // ENHANCE: tests
  // Pulls out the byte that starts `shift` bits up and scales it to 0.0..=1.0.
  let unit_channel = |shift: u32| ((pixel >> shift) & 0xFF) as f32 / 255.0;
  [square(unit_channel(0)), square(unit_channel(8)), square(unit_channel(16)), unit_channel(24)]
}

/// Packs pseudo-linear `[r, g, b, a]` floats back into an `0xAABBGGRR` pixel.
///
/// This undoes `u32_to_linear`: the color channels go through `sqrt` (instead
/// of the normal -2.2 exponent) and every channel is scaled up to `0..=255`.
/// Each input channel should be in the `[0.0, 1.0]` range, out of range values
/// will give you very strange results.
#[inline(always)]
pub fn linear_to_u32(linear: [f32; 4]) -> u32 {
  // ENHANCE: tests
  // Scales 0.0..=1.0 up to 0..=255; the 0.5 makes the truncating cast round
  // to the nearest integer.
  let to_byte = |unit: f32| (unit * 255.0 + 0.5) as u32;
  let a = to_byte(linear[3]);
  let b = to_byte(sqrt(linear[2]));
  let g = to_byte(sqrt(linear[1]));
  let r = to_byte(sqrt(linear[0]));
  rgba32!(r, g, b, a)
}

/// Extra functionality that's only available to images of u32 values.
///
/// Uses `0xAABBGGRR` channel ordering.
pub trait WritableImageU32Ext: WritableImage<u32> {
  /// Performs a rectilinear blending blit at an integral pixel offset.
  ///
  /// Every source pixel in the overlapping area is blended over the
  /// destination pixel, using the source pixel's alpha as the blend weight.
  ///
  /// Similar to the `WritableIndexmap::blit_generic` method, you can provide
  /// any offsets you like (even negative ones) and the affected area will be
  /// automatically clipped to be in bounds.
  ///
  /// This method will use "avx2" or "sse2" if available. If you compile with
  /// `std` it will select the best available version at runtime (the most
  /// portable binary). If you compile with `no_std` it will pick the best
  /// version available given the compilation settings and simply crash if you
  /// move the binary to a less capable machine.
  ///
  /// * `src`: the image to read from.
  /// * `offset`: where `src` is placed relative to `self`, in pixels.
  fn blit_blend_rectilinear<RI>(&mut self, src: &RI, offset: (isize, isize))
  where
    RI: ReadableImage<u32>,
  {
    // x86 / x86_64: pick one of the explicit SIMD implementations.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
      // We're on x86 or x86_64, so we'll use explicit SIMD versions as
      // appropriate, rather than relying on the compiler to vectorize the
      // loop for us.
      #[cfg(feature = "std")]
      {
        // With `std` the CPU is asked at runtime, best feature first, so each
        // SIMD helper only runs after its feature has been detected.
        if is_x86_feature_detected!("avx2") {
          unsafe { blit_blend_rectilinear_avx2_explicit(self, src, offset) };
        } else if is_x86_feature_detected!("sse2") {
          unsafe { blit_blend_rectilinear_sse2_explicit(self, src, offset) };
        } else {
          // holy cripes how old is your CPU? these were added to x86 in 2001!
          unsafe { blit_blend_rectilinear_fully_unrolled_no_intrinsics(self, src, offset) };
        }
      }
      // Without `std` there is no runtime detection: exactly one of the three
      // blocks below is compiled in, chosen by the features enabled at build
      // time.
      #[cfg(all(not(feature = "std"), target_feature = "avx2"))]
      {
        unsafe { blit_blend_rectilinear_avx2_explicit(self, src, offset) };
      }
      #[cfg(all(not(feature = "std"), not(target_feature = "avx2"), target_feature = "sse2"))]
      {
        unsafe { blit_blend_rectilinear_sse2_explicit(self, src, offset) };
      }
      #[cfg(all(not(feature = "std"), not(target_feature = "avx2"), not(target_feature = "sse2")))]
      {
        unsafe { blit_blend_rectilinear_fully_unrolled_no_intrinsics(self, src, offset) };
      }
    }
    // Every other architecture: portable fallback.
    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    {
      // We're NOT on x86 or x86_64, so we just do it using a fully unrolled
      // loop, which is faster than using blit_generic at least.
      unsafe { blit_blend_rectilinear_fully_unrolled_no_intrinsics(self, src, offset) };
    }
  }
}

/// This is what the blit_blend_rectilinear looks like when you fully unroll all
/// of the work.
///
/// We do this if we can't do a hand tuned version.
///
/// The overlapping region is walked row by row, `SSE_LANE_WIDTH` pixels per
/// pass. Each pass unpacks the `0xAABBGGRR` pixels into floats (color channels
/// squared, alpha not), blends the source over the destination using the
/// source alpha as the weight, and packs the result back (color channels
/// through `sqrt`). Lanes past the end of the row are neither read nor written.
///
/// * `dest`: the image that is blended into.
/// * `src`: the image that is blended from.
/// * `offset`: where `src` is placed relative to `dest`, in pixels.
///
/// # Safety
///
/// All pixel access goes through the raw pointers produced by
/// `determine_overlay!`. NOTE(review): that macro is defined elsewhere; this
/// function assumes the pointers and clip size it yields describe memory that
/// is valid for `clip_height` rows of `clip_width` pixels at each image's
/// pitch. Confirm against the macro.
unsafe fn blit_blend_rectilinear_fully_unrolled_no_intrinsics<WI, RI>(dest: &mut WI, src: &RI, offset: (isize, isize))
where
  WI: WritableImage<u32> + ?Sized,
  RI: ReadableImage<u32>,
{
  let (clip_width, clip_height, mut src_row_start_ptr, mut dest_row_start_ptr): (usize, usize, *const u32, *mut u32) =
    determine_overlay!(dest, src, offset);
  if clip_width > 0 && clip_height > 0 {
    let src_pitch = src.pitch();
    let dest_pitch = dest.pitch();
    let mut y = 0;
    while y < clip_height {
      let mut x = 0;
      let mut src_row_mid_ptr = src_row_start_ptr;
      let mut dest_row_mid_ptr = dest_row_start_ptr;
      while x < clip_width {
        // If we aren't using sse2 or avx2 it's likely because we're on an ARM
        // processor. Using NEON might still be available in that case, we just
        // can't hand code that, given the current state of rust's stdsimd
        // library. Given this, we'll still write the process out as being 4
        // lanes at a time, and we can hope that LLVM will maybe kinda see what
        // we're doing if the user compiles with `target-cpu=native`.
        let mut src_r: [f32; SSE_LANE_WIDTH] = [0.0; SSE_LANE_WIDTH];
        let mut src_g: [f32; SSE_LANE_WIDTH] = [0.0; SSE_LANE_WIDTH];
        let mut src_b: [f32; SSE_LANE_WIDTH] = [0.0; SSE_LANE_WIDTH];
        let mut src_a: [f32; SSE_LANE_WIDTH] = [0.0; SSE_LANE_WIDTH];
        let mut dest_r: [f32; SSE_LANE_WIDTH] = [0.0; SSE_LANE_WIDTH];
        let mut dest_g: [f32; SSE_LANE_WIDTH] = [0.0; SSE_LANE_WIDTH];
        let mut dest_b: [f32; SSE_LANE_WIDTH] = [0.0; SSE_LANE_WIDTH];
        let mut dest_a: [f32; SSE_LANE_WIDTH] = [0.0; SSE_LANE_WIDTH];
        let mut out_r: [f32; SSE_LANE_WIDTH] = [0.0; SSE_LANE_WIDTH];
        let mut out_g: [f32; SSE_LANE_WIDTH] = [0.0; SSE_LANE_WIDTH];
        let mut out_b: [f32; SSE_LANE_WIDTH] = [0.0; SSE_LANE_WIDTH];
        let mut out_a: [f32; SSE_LANE_WIDTH] = [0.0; SSE_LANE_WIDTH];
        // load: only lanes that are still inside the row are read, the rest
        // keep their zero fill.
        for lane in 0..SSE_LANE_WIDTH {
          const INV255: f32 = 1.0 / 255.0;
          let lane_i = lane as isize;
          if x + lane < clip_width {
            let src_pixel = src_row_mid_ptr.offset(lane_i);
            src_r[lane] = square((*src_pixel & 0xFF) as f32 * INV255);
            src_g[lane] = square((*src_pixel >> 8 & 0xFF) as f32 * INV255);
            src_b[lane] = square((*src_pixel >> 16 & 0xFF) as f32 * INV255);
            src_a[lane] = (*src_pixel >> 24 & 0xFF) as f32 * INV255;
            //
            let dest_pixel = dest_row_mid_ptr.offset(lane_i);
            dest_r[lane] = square((*dest_pixel & 0xFF) as f32 * INV255);
            dest_g[lane] = square((*dest_pixel >> 8 & 0xFF) as f32 * INV255);
            dest_b[lane] = square((*dest_pixel >> 16 & 0xFF) as f32 * INV255);
            dest_a[lane] = (*dest_pixel >> 24 & 0xFF) as f32 * INV255;
          } else {
            break;
          }
        }
        // work: move from dest toward src by the source alpha.
        for lane in 0..SSE_LANE_WIDTH {
          let toward = src_a[lane];
          let one_minus_toward = 1.0 - toward;
          out_r[lane] = one_minus_toward * dest_r[lane] + toward * src_r[lane];
          out_g[lane] = one_minus_toward * dest_g[lane] + toward * src_g[lane];
          out_b[lane] = one_minus_toward * dest_b[lane] + toward * src_b[lane];
          out_a[lane] = one_minus_toward * dest_a[lane] + toward * src_a[lane];
        }
        // store results: the `+ 0.5` makes the truncating cast round to the
        // nearest integer.
        for lane in 0..SSE_LANE_WIDTH {
          let lane_i = lane as isize;
          if x + lane < clip_width {
            let out32 = (((out_a[lane] * 255.0 + 0.5) as u32) << 24) | (((sqrt(out_b[lane]) * 255.0 + 0.5) as u32) << 16)
              | (((sqrt(out_g[lane]) * 255.0 + 0.5) as u32) << 8) | ((sqrt(out_r[lane]) * 255.0 + 0.5) as u32);
            *dest_row_mid_ptr.offset(lane_i) = out32;
          } else {
            break;
          }
        }
        x += SSE_LANE_WIDTH;
        // `wrapping_offset`, not `offset`: after a short final pass the
        // stepped pointer can land beyond the image's memory. It is never
        // dereferenced there, but merely computing it with `offset` would be
        // undefined behavior.
        src_row_mid_ptr = src_row_mid_ptr.wrapping_offset(SSE_LANE_WIDTH_I);
        dest_row_mid_ptr = dest_row_mid_ptr.wrapping_offset(SSE_LANE_WIDTH_I);
      }
      y += 1;
      // Same reasoning: after the last row this points a whole pitch further
      // on, which may be outside the image.
      src_row_start_ptr = src_row_start_ptr.wrapping_offset(src_pitch);
      dest_row_start_ptr = dest_row_start_ptr.wrapping_offset(dest_pitch);
    }
  }
}

//
//
// SSE2
//
//

// Yields lane `$i` of the `__m128` variable `$a` as an `f32` place expression,
// so the result can be read or assigned to.
//
// `$a` must be a mutable binding because its address is taken with `&mut`.
// The expansion dereferences a raw pointer, so it has to be used inside an
// `unsafe` context, and the lane index is not bounds checked.
//
// TODO: doc-tests
#[allow(unused_macros)]
macro_rules! m128_f32 {
  ($a:ident, $i:expr) => {
    *(((&mut $a) as *mut __m128) as *mut f32).offset($i as isize)
  };
}

// Yields lane `$i` of the `__m128i` variable `$a` as an `i32` place
// expression, so the result can be read or assigned to.
//
// `$a` must be a mutable binding because its address is taken with `&mut`.
// The expansion dereferences a raw pointer, so it has to be used inside an
// `unsafe` context, and the lane index is not bounds checked.
//
// TODO: doc-tests
#[allow(unused_macros)]
macro_rules! m128i_i32 {
  ($a:ident, $i:expr) => {
    *(((&mut $a) as *mut __m128i) as *mut i32).offset($i as isize)
  };
}

// Squares every `f32` lane of a `__m128` by multiplying the value with itself.
//
// `$reg` is written out twice in the expansion, so an argument expression is
// evaluated twice.
//
// TODO: doc-tests
#[allow(unused_macros)]
macro_rules! mm_square {
  ($reg:expr) => {
    _mm_mul_ps($reg, $reg)
  };
}

// Debug helper that prints the variable name and the four lanes of a 128-bit
// register.
//
// * `print_128!(reg, i)` treats `reg` as a `__m128i` and prints each lane as
//   an eight digit hexadecimal `u32`.
// * `print_128!(reg, f)` treats `reg` as a `__m128` and prints each lane as an
//   `f32` with one decimal place.
//
// The expansion calls `as_ref` on a raw pointer, so it has to be used inside
// an `unsafe` context. It also uses `println!`, so it is only usable where
// that macro is available.
//
// TODO: doc-tests
#[allow(unused_macros)]
macro_rules! print_128 {
  ($reg:ident,i) => {{
    let arr = (&$reg as *const __m128i as *const [u32; 4]).as_ref().unwrap();
    println!("{}: i[{:08X},{:08X},{:08X},{:08X}]", stringify!($reg), arr[0], arr[1], arr[2], arr[3]);
  }};
  ($reg:ident,f) => {{
    let arr = (&$reg as *const __m128 as *const [f32; 4]).as_ref().unwrap();
    println!("{}: f[{:7.1},{:7.1},{:7.1},{:7.1}]", stringify!($reg), arr[0], arr[1], arr[2], arr[3]);
  }};
}

// Blends four packed `0xAABBGGRR` source pixels over four destination pixels
// and evaluates to the four packed results as a `__m128i`.
//
// Both arguments are `__m128i` variables holding one pixel per 32-bit lane.
// Per lane: every channel is scaled to 0.0..=1.0, the color channels are
// squared, the destination is moved toward the source by the source alpha,
// the color channels are square-rooted again and everything is scaled back to
// 0..=255 and repacked. The expansion calls SSE2 intrinsics, so it must be
// used where those are in scope and allowed.
//
// TODO: doc-tests
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
macro_rules! sse2_do_pixel_work {
  ($src_pixel_x4:ident, $dest_pixel_x4:ident) => {{
    let low_byte = _mm_set1_epi32(0xFF);
    let byte_to_unit = _mm_set1_ps(1.0 / 255.0);
    let unit_to_byte = _mm_set1_ps(255.0);

    // source channels, each scaled to 0.0..=1.0
    let s_red = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128($src_pixel_x4, low_byte)), byte_to_unit);
    let s_green = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32($src_pixel_x4, 8), low_byte)), byte_to_unit);
    let s_blue = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32($src_pixel_x4, 16), low_byte)), byte_to_unit);
    let s_alpha = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32($src_pixel_x4, 24), low_byte)), byte_to_unit);

    // destination channels, each scaled to 0.0..=1.0
    let d_red = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128($dest_pixel_x4, low_byte)), byte_to_unit);
    let d_green = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32($dest_pixel_x4, 8), low_byte)), byte_to_unit);
    let d_blue = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32($dest_pixel_x4, 16), low_byte)), byte_to_unit);
    let d_alpha = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32($dest_pixel_x4, 24), low_byte)), byte_to_unit);

    // the source alpha decides how far each channel moves toward the source
    let src_weight = s_alpha;
    let dest_weight = _mm_sub_ps(_mm_set1_ps(1.0), src_weight);

    // color channels are blended squared and square-rooted afterwards
    let red = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(dest_weight, _mm_mul_ps(d_red, d_red)), _mm_mul_ps(src_weight, _mm_mul_ps(s_red, s_red))));
    let green =
      _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(dest_weight, _mm_mul_ps(d_green, d_green)), _mm_mul_ps(src_weight, _mm_mul_ps(s_green, s_green))));
    let blue =
      _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(dest_weight, _mm_mul_ps(d_blue, d_blue)), _mm_mul_ps(src_weight, _mm_mul_ps(s_blue, s_blue))));
    // alpha is blended as it is, no squaring
    let alpha = _mm_add_ps(_mm_mul_ps(dest_weight, d_alpha), _mm_mul_ps(src_weight, s_alpha));

    // back to integers in 0..=255
    let red_bits = _mm_cvtps_epi32(_mm_mul_ps(red, unit_to_byte));
    let green_bits = _mm_cvtps_epi32(_mm_mul_ps(green, unit_to_byte));
    let blue_bits = _mm_cvtps_epi32(_mm_mul_ps(blue, unit_to_byte));
    let alpha_bits = _mm_cvtps_epi32(_mm_mul_ps(alpha, unit_to_byte));

    // shift each channel into its byte and merge
    let high_half = _mm_or_si128(_mm_slli_epi32(alpha_bits, 24), _mm_slli_epi32(blue_bits, 16));
    let low_half = _mm_or_si128(_mm_slli_epi32(green_bits, 8), red_bits);
    _mm_or_si128(high_half, low_half)
  }};
}

// Blends the rest of the current row when BOTH row pointers are 16-byte
// aligned (checked with `debug_assert_eq!` in debug builds).
//
// * `$x` / `$clip_width`: current column and row width, in pixels. `$x` is
//   advanced as pixels are consumed.
// * `$src_row_mid_ptr` / `$dest_row_mid_ptr`: the pointer variables for the
//   current position in each row. Both are reassigned by the macro, so the
//   caller has to pass assignable places.
//
// Full groups of four pixels use aligned loads and stores. A final group of
// one to three pixels is read and written one lane at a time, so nothing past
// the end of the row is touched. The expansion dereferences raw pointers and
// calls SSE2 intrinsics, so it must be used inside an `unsafe` context where
// those are allowed.
//
// TODO: doc-tests
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
macro_rules! sse2_finish_off_row_aligned {
  ($x:ident, $clip_width:ident, $src_row_mid_ptr:expr, $dest_row_mid_ptr:expr) => {{
    debug_assert_eq!(
      check_misalign16!($src_row_mid_ptr),
      0,
      "sse2_finish_off_row_aligned, the src_row_mid_ptr isn't aligned: {} / {}",
      check_misalign16!($src_row_mid_ptr),
      $src_row_mid_ptr as usize
    );
    debug_assert_eq!(
      check_misalign16!($dest_row_mid_ptr),
      0,
      "sse2_finish_off_row_aligned, the dest_row_mid_ptr isn't aligned: {} / {}",
      check_misalign16!($dest_row_mid_ptr),
      $dest_row_mid_ptr as usize
    );
    if check_misalign4!($clip_width as isize - $x as isize) == 0 {
      // we'll _always_ be working 4 lanes at a time for the rest of this
      // row, maximum speed!
      while $x < $clip_width {
        let src_pixel_x4 = _mm_load_si128($src_row_mid_ptr as *const __m128i);
        let dest_pixel_x4 = _mm_load_si128($dest_row_mid_ptr as *const __m128i);
        let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
        _mm_store_si128($dest_row_mid_ptr as *mut __m128i, out_packed_x4);
        $x += SSE_LANE_WIDTH;
        $src_row_mid_ptr = $src_row_mid_ptr.offset(SSE_LANE_WIDTH_I);
        $dest_row_mid_ptr = $dest_row_mid_ptr.offset(SSE_LANE_WIDTH_I);
      }
    } else {
      // we'll have to do less than 4 lanes for the final pass of this
      // row. Still, we can at least do aligned loads for all the
      // "complete" passes.
      while $x < $clip_width {
        match $clip_width as isize - $x as isize {
          1 => {
            let src_pixel_x4 = _mm_setr_epi32(*$src_row_mid_ptr, 0, 0, 0);
            let dest_pixel_x4 = _mm_setr_epi32(*$dest_row_mid_ptr, 0, 0, 0);
            let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
            *$dest_row_mid_ptr = *(&out_packed_x4 as *const __m128i as *const i32);
          }
          2 => {
            let src_pixel_x4 = _mm_setr_epi32(*$src_row_mid_ptr, *$src_row_mid_ptr.offset(1), 0, 0);
            let dest_pixel_x4 = _mm_setr_epi32(*$dest_row_mid_ptr, *$dest_row_mid_ptr.offset(1), 0, 0);
            let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
            *$dest_row_mid_ptr = *(&out_packed_x4 as *const __m128i as *const i32);
            *$dest_row_mid_ptr.offset(1) = *(&out_packed_x4 as *const __m128i as *const i32).offset(1);
          }
          3 => {
            let src_pixel_x4 = _mm_setr_epi32(*$src_row_mid_ptr, *$src_row_mid_ptr.offset(1), *$src_row_mid_ptr.offset(2), 0);
            let dest_pixel_x4 = _mm_setr_epi32(*$dest_row_mid_ptr, *$dest_row_mid_ptr.offset(1), *$dest_row_mid_ptr.offset(2), 0);
            let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
            *$dest_row_mid_ptr = *(&out_packed_x4 as *const __m128i as *const i32);
            *$dest_row_mid_ptr.offset(1) = *(&out_packed_x4 as *const __m128i as *const i32).offset(1);
            *$dest_row_mid_ptr.offset(2) = *(&out_packed_x4 as *const __m128i as *const i32).offset(2);
          }
          other => {
            debug_assert!(other >= 4);
            let src_pixel_x4 = _mm_load_si128($src_row_mid_ptr as *const __m128i);
            let dest_pixel_x4 = _mm_load_si128($dest_row_mid_ptr as *const __m128i);
            let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
            _mm_store_si128($dest_row_mid_ptr as *mut __m128i, out_packed_x4);
          }
        }
        $x += SSE_LANE_WIDTH;
        // `wrapping_offset`, not `offset`: after a 1-3 pixel tail this still
        // steps four lanes, which can land beyond the image's memory. The
        // pointer is not dereferenced there, but computing it with `offset`
        // would be undefined behavior.
        $src_row_mid_ptr = $src_row_mid_ptr.wrapping_offset(SSE_LANE_WIDTH_I);
        $dest_row_mid_ptr = $dest_row_mid_ptr.wrapping_offset(SSE_LANE_WIDTH_I);
      }
    }
  }};
}

// Blends the rest of the current row without assuming anything about the
// alignment of the two row pointers.
//
// * `$x` / `$clip_width`: current column and row width, in pixels. `$x` is
//   advanced as pixels are consumed.
// * `$src_row_mid_ptr` / `$dest_row_mid_ptr`: the pointer variables for the
//   current position in each row. Both are reassigned by the macro, so the
//   caller has to pass assignable places.
//
// Full groups of four pixels use unaligned loads and stores. A final group of
// one to three pixels is read and written one lane at a time, so nothing past
// the end of the row is touched. The expansion dereferences raw pointers and
// calls SSE2 intrinsics, so it must be used inside an `unsafe` context where
// those are allowed.
//
// TODO: doc-tests
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
macro_rules! sse2_finish_off_row_un_aligned {
  ($x:ident, $clip_width:ident, $src_row_mid_ptr:expr, $dest_row_mid_ptr:expr) => {{
    if check_misalign4!($clip_width as isize - $x as isize) == 0 {
      // we'll _always_ be working 4 lanes at a time for the rest of this
      // row, maximum speed!
      while $x < $clip_width {
        let src_pixel_x4 = _mm_loadu_si128($src_row_mid_ptr as *const __m128i);
        let dest_pixel_x4 = _mm_loadu_si128($dest_row_mid_ptr as *const __m128i);
        let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
        _mm_storeu_si128($dest_row_mid_ptr as *mut __m128i, out_packed_x4);
        $x += SSE_LANE_WIDTH;
        $src_row_mid_ptr = $src_row_mid_ptr.offset(SSE_LANE_WIDTH_I);
        $dest_row_mid_ptr = $dest_row_mid_ptr.offset(SSE_LANE_WIDTH_I);
      }
    } else {
      // we'll have to do less than 4 lanes for the final pass of this
      // row. Still, we can at least do full-width (unaligned) loads for all
      // the "complete" passes.
      while $x < $clip_width {
        match $clip_width as isize - $x as isize {
          1 => {
            let src_pixel_x4 = _mm_setr_epi32(*$src_row_mid_ptr, 0, 0, 0);
            let dest_pixel_x4 = _mm_setr_epi32(*$dest_row_mid_ptr, 0, 0, 0);
            let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
            *$dest_row_mid_ptr = *(&out_packed_x4 as *const __m128i as *const i32);
          }
          2 => {
            let src_pixel_x4 = _mm_setr_epi32(*$src_row_mid_ptr, *$src_row_mid_ptr.offset(1), 0, 0);
            let dest_pixel_x4 = _mm_setr_epi32(*$dest_row_mid_ptr, *$dest_row_mid_ptr.offset(1), 0, 0);
            let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
            *$dest_row_mid_ptr = *(&out_packed_x4 as *const __m128i as *const i32);
            *$dest_row_mid_ptr.offset(1) = *(&out_packed_x4 as *const __m128i as *const i32).offset(1);
          }
          3 => {
            let src_pixel_x4 = _mm_setr_epi32(*$src_row_mid_ptr, *$src_row_mid_ptr.offset(1), *$src_row_mid_ptr.offset(2), 0);
            let dest_pixel_x4 = _mm_setr_epi32(*$dest_row_mid_ptr, *$dest_row_mid_ptr.offset(1), *$dest_row_mid_ptr.offset(2), 0);
            let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
            *$dest_row_mid_ptr = *(&out_packed_x4 as *const __m128i as *const i32);
            *$dest_row_mid_ptr.offset(1) = *(&out_packed_x4 as *const __m128i as *const i32).offset(1);
            *$dest_row_mid_ptr.offset(2) = *(&out_packed_x4 as *const __m128i as *const i32).offset(2);
          }
          other => {
            debug_assert!(other >= 4);
            let src_pixel_x4 = _mm_loadu_si128($src_row_mid_ptr as *const __m128i);
            let dest_pixel_x4 = _mm_loadu_si128($dest_row_mid_ptr as *const __m128i);
            let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
            _mm_storeu_si128($dest_row_mid_ptr as *mut __m128i, out_packed_x4);
          }
        }
        $x += SSE_LANE_WIDTH;
        // `wrapping_offset`, not `offset`: after a 1-3 pixel tail this still
        // steps four lanes, which can land beyond the image's memory. The
        // pointer is not dereferenced there, but computing it with `offset`
        // would be undefined behavior.
        $src_row_mid_ptr = $src_row_mid_ptr.wrapping_offset(SSE_LANE_WIDTH_I);
        $dest_row_mid_ptr = $dest_row_mid_ptr.wrapping_offset(SSE_LANE_WIDTH_I);
      }
    }
  }};
}

/// SSE2 implementation of the `blit_blend_rectilinear` operation.
///
/// The function itself is compiled with "sse2" enabled, so the intrinsics in
/// the macros expanded inside it are compiled in their "sse2" form. Sounds
/// silly, but that's the suggested way in the std simd docs.
///
/// Each row is handled in one of three ways:
///
/// * both row pointers are 16-byte aligned: the whole row uses aligned loads;
/// * both are misaligned by the same amount: one to three lead-in pixels are
///   blended first, which brings both pointers onto a 16-byte boundary, and
///   the rest of the row uses aligned loads;
/// * anything else: the whole row uses unaligned loads.
///
/// The lead-in never covers more than `clip_width` pixels, so a blit that is
/// narrower than the distance to the next boundary only touches its own
/// pixels.
///
/// * `dest`: the image that is blended into.
/// * `src`: the image that is blended from.
/// * `offset`: where `src` is placed relative to `dest`, in pixels.
///
/// # Panics
///
/// Panics if the two row pointers share a misalignment other than 4, 8 or 12
/// bytes.
///
/// # Safety
///
/// The CPU must support "sse2". All pixel access goes through the raw pointers
/// produced by `determine_overlay!`. NOTE(review): that macro is defined
/// elsewhere; this function assumes the pointers and clip size it yields
/// describe memory that is valid for `clip_height` rows of `clip_width` pixels
/// at each image's pitch. Confirm against the macro.
#[target_feature(enable = "sse2")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn blit_blend_rectilinear_sse2_explicit<WI, RI>(dest: &mut WI, src: &RI, offset: (isize, isize))
where
  WI: WritableImage<u32> + ?Sized,
  RI: ReadableImage<u32>,
{
  let (clip_width, clip_height, mut src_row_start_ptr, mut dest_row_start_ptr): (usize, usize, *const u32, *mut u32) =
    determine_overlay!(dest, src, offset);

  if clip_width > 0 && clip_height > 0 {
    let src_pitch = src.pitch();
    let dest_pitch = dest.pitch();
    let mut y = 0;
    while y < clip_height {
      let mut x = 0;
      let mut src_row_mid_ptr = src_row_start_ptr as *const i32;
      let mut dest_row_mid_ptr = dest_row_start_ptr as *mut i32;
      let src_misalign = check_misalign16!(src_row_mid_ptr);
      let dest_misalign = check_misalign16!(dest_row_mid_ptr);
      if src_misalign > 0 || dest_misalign > 0 {
        // we're somehow off alignment.
        if src_misalign == dest_misalign {
          // both are mis-aligned, but in phase with each other, so we start by
          // stepping forward 1-3 pixels so that we're at an aligned point.
          let pixels_to_boundary: usize = match src_misalign {
            12 => 1,
            8 => 2,
            4 => 3,
            other => panic!("invalid src_misalign value: {}", other),
          };
          // Never step past the end of the row: a blit narrower than the
          // distance to the boundary must not read or write its neighbours.
          let lead_in = if pixels_to_boundary < clip_width { pixels_to_boundary } else { clip_width };
          // Gather only the lead-in pixels; unused lanes stay zero.
          let mut src_lanes = [0i32; 4];
          let mut dest_lanes = [0i32; 4];
          for lane in 0..lead_in {
            src_lanes[lane] = *src_row_mid_ptr.offset(lane as isize);
            dest_lanes[lane] = *dest_row_mid_ptr.offset(lane as isize);
          }
          let src_pixel_x4 = _mm_setr_epi32(src_lanes[0], src_lanes[1], src_lanes[2], src_lanes[3]);
          let dest_pixel_x4 = _mm_setr_epi32(dest_lanes[0], dest_lanes[1], dest_lanes[2], dest_lanes[3]);
          let out_packed_x4 = sse2_do_pixel_work!(src_pixel_x4, dest_pixel_x4);
          // Write back only the lanes that were actually gathered.
          let out_lanes = &out_packed_x4 as *const __m128i as *const i32;
          for lane in 0..lead_in {
            *dest_row_mid_ptr.offset(lane as isize) = *out_lanes.offset(lane as isize);
          }
          src_row_mid_ptr = src_row_mid_ptr.offset(lead_in as isize);
          dest_row_mid_ptr = dest_row_mid_ptr.offset(lead_in as isize);
          x += lead_in;
          // Now we finish off the rest of the row fully aligned. If the
          // lead-in already used up the row we must skip this: the pointers
          // may not have reached the boundary, which the aligned macro
          // asserts on.
          if x < clip_width {
            sse2_finish_off_row_aligned!(x, clip_width, src_row_mid_ptr, dest_row_mid_ptr);
          }
        } else {
          // either of them is mis-aligned, but they're out of phase with each
          // other, so we have no hope to get them both aligned during this row.
          sse2_finish_off_row_un_aligned!(x, clip_width, src_row_mid_ptr, dest_row_mid_ptr);
        }
      } else {
        // both pointers are totally aligned without doing a startup set.
        sse2_finish_off_row_aligned!(x, clip_width, src_row_mid_ptr, dest_row_mid_ptr);
      }
      y += 1;
      // `wrapping_offset`, not `offset`: after the last row this points a
      // whole pitch further on, which may be outside the image. It is never
      // dereferenced there, but computing it with `offset` would be undefined
      // behavior.
      src_row_start_ptr = src_row_start_ptr.wrapping_offset(src_pitch);
      dest_row_start_ptr = dest_row_start_ptr.wrapping_offset(dest_pitch);
    }
  }
}

//
//
// AVX
//
//

// Yields lane `$i` of the `__m256` variable `$a` as an `f32` place expression,
// so the result can be read or assigned to.
//
// `$a` must be a mutable binding because its address is taken with `&mut`.
// The expansion dereferences a raw pointer, so it has to be used inside an
// `unsafe` context, and the lane index is not bounds checked.
//
// TODO: doc-tests
macro_rules! m256_f32 {
  ($a:ident, $i:expr) => {
    *(((&mut $a) as *mut __m256) as *mut f32).offset($i as isize)
  };
}

// Yields lane `$i` of the `__m256i` variable `$a` as an `i32` place
// expression, so the result can be read or assigned to.
//
// `$a` must be a mutable binding because its address is taken with `&mut`.
// The expansion dereferences a raw pointer, so it has to be used inside an
// `unsafe` context, and the lane index is not bounds checked.
//
// TODO: doc-tests
macro_rules! m256i_i32 {
  ($a:ident, $i:expr) => {
    *(((&mut $a) as *mut __m256i) as *mut i32).offset($i as isize)
  };
}

// Squares every `f32` lane of a `__m256` by multiplying the value with itself.
//
// `$reg` is written out twice in the expansion, so an argument expression is
// evaluated twice.
//
// TODO: doc-tests
macro_rules! mm256_square {
  ($reg:expr) => {
    _mm256_mul_ps($reg, $reg)
  };
}

// Blends eight packed `0xAABBGGRR` source pixels over eight destination
// pixels and evaluates to the eight packed results as a `__m256i`.
//
// Both arguments are `__m256i` variables holding one pixel per 32-bit lane.
// Per lane: every channel is scaled to 0.0..=1.0, the color channels are
// squared, the destination is moved toward the source by the source alpha,
// the color channels are square-rooted again and everything is scaled back to
// 0..=255 and repacked. The expansion calls 256-bit intrinsics, so it must be
// used where those are in scope and allowed.
//
// TODO: doc-tests
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
macro_rules! avx2_do_pixel_work {
  ($src_pixel_x8:ident, $dest_pixel_x8:ident) => {{
    let low_byte = _mm256_set1_epi32(0xFF);
    let byte_to_unit = _mm256_set1_ps(1.0 / 255.0);
    let unit_to_byte = _mm256_set1_ps(255.0);

    // source channels, each scaled to 0.0..=1.0
    let s_red = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256($src_pixel_x8, low_byte)), byte_to_unit);
    let s_green = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32($src_pixel_x8, 8), low_byte)), byte_to_unit);
    let s_blue = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32($src_pixel_x8, 16), low_byte)), byte_to_unit);
    let s_alpha = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32($src_pixel_x8, 24), low_byte)), byte_to_unit);

    // destination channels, each scaled to 0.0..=1.0
    let d_red = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256($dest_pixel_x8, low_byte)), byte_to_unit);
    let d_green = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32($dest_pixel_x8, 8), low_byte)), byte_to_unit);
    let d_blue = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32($dest_pixel_x8, 16), low_byte)), byte_to_unit);
    let d_alpha = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32($dest_pixel_x8, 24), low_byte)), byte_to_unit);

    // the source alpha decides how far each channel moves toward the source
    let src_weight = s_alpha;
    let dest_weight = _mm256_sub_ps(_mm256_set1_ps(1.0), src_weight);

    // color channels are blended squared and square-rooted afterwards
    let red = _mm256_sqrt_ps(_mm256_add_ps(
      _mm256_mul_ps(dest_weight, _mm256_mul_ps(d_red, d_red)),
      _mm256_mul_ps(src_weight, _mm256_mul_ps(s_red, s_red))
    ));
    let green = _mm256_sqrt_ps(_mm256_add_ps(
      _mm256_mul_ps(dest_weight, _mm256_mul_ps(d_green, d_green)),
      _mm256_mul_ps(src_weight, _mm256_mul_ps(s_green, s_green))
    ));
    let blue = _mm256_sqrt_ps(_mm256_add_ps(
      _mm256_mul_ps(dest_weight, _mm256_mul_ps(d_blue, d_blue)),
      _mm256_mul_ps(src_weight, _mm256_mul_ps(s_blue, s_blue))
    ));
    // alpha is blended as it is, no squaring
    let alpha = _mm256_add_ps(_mm256_mul_ps(dest_weight, d_alpha), _mm256_mul_ps(src_weight, s_alpha));

    // back to integers in 0..=255
    let red_bits = _mm256_cvtps_epi32(_mm256_mul_ps(red, unit_to_byte));
    let green_bits = _mm256_cvtps_epi32(_mm256_mul_ps(green, unit_to_byte));
    let blue_bits = _mm256_cvtps_epi32(_mm256_mul_ps(blue, unit_to_byte));
    let alpha_bits = _mm256_cvtps_epi32(_mm256_mul_ps(alpha, unit_to_byte));

    // shift each channel into its byte and merge
    let high_half = _mm256_or_si256(_mm256_slli_epi32(alpha_bits, 24), _mm256_slli_epi32(blue_bits, 16));
    let low_half = _mm256_or_si256(_mm256_slli_epi32(green_bits, 8), red_bits);
    _mm256_or_si256(high_half, low_half)
  }};
}

// Blends the rest of the current row when BOTH row pointers are 32-byte
// aligned (checked with `debug_assert_eq!` in debug builds).
//
// * `$x` / `$clip_width`: current column and row width, in pixels. `$x` is
//   advanced as pixels are consumed.
// * `$src_row_mid_ptr` / `$dest_row_mid_ptr`: the pointer variables for the
//   current position in each row. Both are reassigned by the macro, so the
//   caller has to pass assignable places.
//
// Full groups of eight pixels use aligned loads and stores. A final group of
// one to seven pixels uses a masked load and store that only enables the
// lanes still inside the row. The expansion dereferences raw pointers and
// calls AVX2 intrinsics, so it must be used inside an `unsafe` context where
// those are allowed.
//
// TODO: doc-tests
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
macro_rules! avx2_finish_off_row_aligned {
  ($x:ident, $clip_width:ident, $src_row_mid_ptr:expr, $dest_row_mid_ptr:expr) => {{
    debug_assert_eq!(
      check_misalign32!($src_row_mid_ptr),
      0,
      "avx2_finish_off_row_aligned, the src_row_mid_ptr isn't aligned: {} / {}",
      check_misalign32!($src_row_mid_ptr),
      $src_row_mid_ptr as usize
    );
    debug_assert_eq!(
      check_misalign32!($dest_row_mid_ptr),
      0,
      "avx2_finish_off_row_aligned, the dest_row_mid_ptr isn't aligned: {} / {}",
      check_misalign32!($dest_row_mid_ptr),
      $dest_row_mid_ptr as usize
    );
    if check_misalign8!($clip_width as isize - $x as isize) == 0 {
      // we'll _always_ be working 8 lanes at a time for the rest of this
      // row, maximum speed!
      while $x < $clip_width {
        let src_pixel_x8 = _mm256_load_si256($src_row_mid_ptr as *const __m256i);
        let dest_pixel_x8 = _mm256_load_si256($dest_row_mid_ptr as *const __m256i);
        let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
        _mm256_store_si256($dest_row_mid_ptr as *mut __m256i, out_packed_x8);
        $x += AVX_LANE_WIDTH;
        $src_row_mid_ptr = $src_row_mid_ptr.offset(AVX_LANE_WIDTH_I);
        $dest_row_mid_ptr = $dest_row_mid_ptr.offset(AVX_LANE_WIDTH_I);
      }
    } else {
      // we'll have to do less than 8 lanes for the final pass of this
      // row. Still, we can at least do aligned loads for all the
      // "complete" passes.
      while $x < $clip_width {
        // Always at least 1 here, because the loop only runs while
        // `$x < $clip_width`.
        let remaining = $clip_width as isize - $x as isize;
        if remaining >= 8 {
          let src_pixel_x8 = _mm256_load_si256($src_row_mid_ptr as *const __m256i);
          let dest_pixel_x8 = _mm256_load_si256($dest_row_mid_ptr as *const __m256i);
          let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
          _mm256_store_si256($dest_row_mid_ptr as *mut __m256i, out_packed_x8);
        } else {
          // 1 to 7 pixels left. A lane is enabled (all bits set) exactly when
          // its index is below `remaining`, which gives the same mask as
          // writing out -1 for the first `remaining` lanes and 0 for the rest.
          let lane_index = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
          let read_write_mask = _mm256_cmpgt_epi32(_mm256_set1_epi32(remaining as i32), lane_index);
          let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
          let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
          let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
          _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
        }
        $x += AVX_LANE_WIDTH;
        // `wrapping_offset`, not `offset`: after a 1-7 pixel tail this still
        // steps eight lanes, which can land beyond the image's memory. The
        // pointer is not dereferenced there, but computing it with `offset`
        // would be undefined behavior.
        $src_row_mid_ptr = $src_row_mid_ptr.wrapping_offset(AVX_LANE_WIDTH_I);
        $dest_row_mid_ptr = $dest_row_mid_ptr.wrapping_offset(AVX_LANE_WIDTH_I);
      }
    }
  }};
}

// TODO: doc-tests
//
// Finishes the rest of one row using UNALIGNED 256-bit loads and stores, for
// rows where the source and destination pointers cannot both be brought onto
// a 32-byte boundary.
//
// * `$x`: the current pixel column within the clip region. It is advanced by
//   `AVX_LANE_WIDTH` after every pass, including the final partial one, so it
//   may end up greater than `$clip_width`.
// * `$clip_width`: the number of pixels in the row to process.
// * `$src_row_mid_ptr` / `$dest_row_mid_ptr`: mutable `i32` pixel pointers
//   for the current position in the row; both are advanced by
//   `AVX_LANE_WIDTH_I` pixels after every pass.
//
// Must be expanded inside an `unsafe` context where AVX2 is enabled. The
// caller is responsible for the pointers covering at least the remaining
// `$clip_width - $x` pixels.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
macro_rules! avx2_finish_off_row_un_aligned {
  ($x:ident, $clip_width:ident, $src_row_mid_ptr:expr, $dest_row_mid_ptr:expr) => {{
    // Complete passes: while at least 8 pixels remain we work all 8 lanes at
    // a time with plain unaligned loads and stores.
    while $clip_width as isize - $x as isize >= AVX_LANE_WIDTH_I {
      let src_pixel_x8 = _mm256_loadu_si256($src_row_mid_ptr as *const __m256i);
      let dest_pixel_x8 = _mm256_loadu_si256($dest_row_mid_ptr as *const __m256i);
      let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
      _mm256_storeu_si256($dest_row_mid_ptr as *mut __m256i, out_packed_x8);
      $x += AVX_LANE_WIDTH;
      $src_row_mid_ptr = $src_row_mid_ptr.offset(AVX_LANE_WIDTH_I);
      $dest_row_mid_ptr = $dest_row_mid_ptr.offset(AVX_LANE_WIDTH_I);
    }
    // Final partial pass: fewer than 8 pixels are left, so a lane mask keeps
    // the load/store from touching anything past the end of the row. A lane
    // set to -1 is active, a lane set to 0 is skipped.
    if $x < $clip_width {
      let read_write_mask = match $clip_width as isize - $x as isize {
        1 => _mm256_setr_epi32(-1, 0, 0, 0, 0, 0, 0, 0),
        2 => _mm256_setr_epi32(-1, -1, 0, 0, 0, 0, 0, 0),
        3 => _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0),
        4 => _mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0),
        5 => _mm256_setr_epi32(-1, -1, -1, -1, -1, 0, 0, 0),
        6 => _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, 0, 0),
        7 => _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, -1, 0),
        // The loop above only exits with fewer than 8 pixels remaining and
        // the `if` guarantees at least 1.
        other => unreachable!("partial pass with {} pixels remaining", other),
      };
      let src_pixel_x8 = _mm256_maskload_epi32($src_row_mid_ptr, read_write_mask);
      let dest_pixel_x8 = _mm256_maskload_epi32($dest_row_mid_ptr, read_write_mask);
      let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
      _mm256_maskstore_epi32($dest_row_mid_ptr, read_write_mask, out_packed_x8);
      $x += AVX_LANE_WIDTH;
      // Advancing a full lane width after a partial pass can step beyond the
      // end of the image buffer, which `offset` does not permit.
      // `wrapping_offset` computes the same address without that
      // requirement; the pointers are not dereferenced again for this row.
      $src_row_mid_ptr = $src_row_mid_ptr.wrapping_offset(AVX_LANE_WIDTH_I);
      $dest_row_mid_ptr = $dest_row_mid_ptr.wrapping_offset(AVX_LANE_WIDTH_I);
    }
  }};
}

// TODO: doc-tests
/// Blends `src` over `dest` at the given `offset`, 8 pixels at a time, using
/// explicit AVX2 intrinsics.
///
/// The overlapping region and the starting row pointers come from
/// `determine_overlay!`. Each row is then handled in one of three ways
/// depending on how the row pointers sit relative to a 32-byte boundary:
///
/// * both already aligned: the whole row goes through the aligned path;
/// * both misaligned by the same amount: a short masked "head" pass brings
///   both pointers onto a boundary, then the rest goes through the aligned
///   path;
/// * misaligned by different amounts: the whole row goes through the
///   unaligned path.
///
/// The per-pixel combination itself is whatever `avx2_do_pixel_work!` does
/// (presumably the alpha blend, going by this function's name).
///
/// # Safety
///
/// * The CPU must support AVX2 (this function is compiled with
///   `#[target_feature(enable = "avx2")]`).
/// * NOTE(review): this also relies on `determine_overlay!` producing
///   pointers and a clip size that stay inside both images, and on `pitch()`
///   being the row stride in pixels -- confirm against those definitions.
#[target_feature(enable = "avx2")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn blit_blend_rectilinear_avx2_explicit<WI, RI>(dest: &mut WI, src: &RI, offset: (isize, isize))
where
  WI: WritableImage<u32> + ?Sized,
  RI: ReadableImage<u32>,
{
  let (clip_width, clip_height, mut src_row_start_ptr, mut dest_row_start_ptr): (usize, usize, *const u32, *mut u32) =
    determine_overlay!(dest, src, offset);

  if clip_width > 0 && clip_height > 0 {
    let src_pitch = src.pitch();
    let dest_pitch = dest.pitch();
    for _row in 0..clip_height {
      let mut x = 0;
      let mut src_row_mid_ptr = src_row_start_ptr as *const i32;
      let mut dest_row_mid_ptr = dest_row_start_ptr as *mut i32;
      let src_misalign = check_misalign32!(src_row_mid_ptr);
      let dest_misalign = check_misalign32!(dest_row_mid_ptr);
      if src_misalign > 0 || dest_misalign > 0 {
        // we're somehow off alignment.
        if src_misalign == dest_misalign {
          // both are mis-aligned, but in phase with each other. We'll process a
          // few pixels so that we can have both be aligned.
          //
          // The misalignment is matched in steps of 4 (one `u32` pixel); the
          // number of pixels needed to reach the next 32-byte boundary is
          // `(32 - misalign) / 4`.
          let pixels_to_boundary: usize = match src_misalign {
            28 => 1,
            24 => 2,
            20 => 3,
            16 => 4,
            12 => 5,
            8 => 6,
            4 => 7,
            other => panic!("invalid src_misalign value: {}", other),
          };
          // Never process more pixels than the clip region holds. Without this
          // clamp a narrow clip (fewer pixels than the distance to the
          // boundary) would get pixels outside the clip rectangle blended and
          // written, possibly outside the image entirely.
          let head_pixels = pixels_to_boundary.min(clip_width);
          // A lane set to -1 is loaded/stored, a lane set to 0 is skipped.
          let read_write_mask = match head_pixels {
            1 => _mm256_setr_epi32(-1, 0, 0, 0, 0, 0, 0, 0),
            2 => _mm256_setr_epi32(-1, -1, 0, 0, 0, 0, 0, 0),
            3 => _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0),
            4 => _mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0),
            5 => _mm256_setr_epi32(-1, -1, -1, -1, -1, 0, 0, 0),
            6 => _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, 0, 0),
            7 => _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, -1, 0),
            // `clip_width > 0` was checked above and `pixels_to_boundary` is
            // in `1..=7`, so the clamped value is always in `1..=7` too.
            other => unreachable!("invalid head pixel count: {}", other),
          };
          let src_pixel_x8 = _mm256_maskload_epi32(src_row_mid_ptr, read_write_mask);
          let dest_pixel_x8 = _mm256_maskload_epi32(dest_row_mid_ptr, read_write_mask);
          let out_packed_x8 = avx2_do_pixel_work!(src_pixel_x8, dest_pixel_x8);
          _mm256_maskstore_epi32(dest_row_mid_ptr, read_write_mask, out_packed_x8);
          src_row_mid_ptr = src_row_mid_ptr.offset(head_pixels as isize);
          dest_row_mid_ptr = dest_row_mid_ptr.offset(head_pixels as isize);
          x += head_pixels;
          // Now we finish off the rest of the row fully aligned. If the head
          // pass already consumed the whole row then `x == clip_width` and
          // this does nothing.
          avx2_finish_off_row_aligned!(x, clip_width, src_row_mid_ptr, dest_row_mid_ptr);
        } else {
          // at least one of them is mis-aligned, and they're out of phase with
          // each other, so we have no hope to get them both aligned during
          // this row.
          avx2_finish_off_row_un_aligned!(x, clip_width, src_row_mid_ptr, dest_row_mid_ptr);
        }
      } else {
        // both pointers are totally aligned without doing a startup set.
        avx2_finish_off_row_aligned!(x, clip_width, src_row_mid_ptr, dest_row_mid_ptr);
      }
      // After the final row this steps to a row that does not exist, which can
      // be past the end of the buffer; `wrapping_offset` yields the same
      // address as `offset` without requiring the result to be in bounds. The
      // pointers are only dereferenced for rows inside the clip region.
      src_row_start_ptr = src_row_start_ptr.wrapping_offset(src_pitch);
      dest_row_start_ptr = dest_row_start_ptr.wrapping_offset(dest_pitch);
    }
  }
}