zoomvtools 2.0.0

#![allow(clippy::undocumented_unsafe_blocks)]
#![allow(unsafe_op_in_unsafe_fn)]

#[cfg(test)]
use std::mem::size_of;
use std::{arch::x86_64::*, num::NonZeroUsize};

#[cfg(test)]
use crate::util::Pixel;
use cpudetect::target_family;
use semisafe::slice::get_mut as semisafe_get_mut;

/// Test-only dispatcher that routes a runtime `(pixel size, width, height)` triple to the
/// matching const-generic SAD kernel (`get_sad_u8` for 1-byte pixels, `get_sad_u16` for
/// 2-byte pixels).
///
/// The pitches are forwarded to the kernels unchanged.
///
/// # Panics
/// Hits `unreachable!` when the combination of pixel size and block dimensions is not one of
/// the sizes listed in the table below.
///
/// # Safety
/// The kernels read through raw pointers taken from `src` and `ref_` without any bounds
/// checks, so both slices must cover every row of the requested block at the given pitch.
/// NOTE(review): `target_family("x86_64_v3")` presumably also requires the CPU to support
/// that feature level — confirm against `cpudetect`.
#[cfg(test)]
#[must_use]
#[target_family("x86_64_v3")]
pub(super) unsafe fn get_sad<T: Pixel>(
    width: NonZeroUsize,
    height: NonZeroUsize,
    src: &[T],
    src_pitch: NonZeroUsize,
    ref_: &[T],
    ref_pitch: NonZeroUsize,
) -> u64 {
    // Both kernels take byte pointers regardless of the pixel type.
    let src_ptr: *const u8 = src.as_ptr().cast();
    let ref_ptr: *const u8 = ref_.as_ptr().cast();

    // Expands every `(width, height)` entry into one u8 arm and one u16 arm.
    macro_rules! dispatch {
        ($(($w:literal, $h:literal)),+ $(,)?) => {
            match (size_of::<T>(), width.get(), height.get()) {
                $(
                    (1, $w, $h) => get_sad_u8::<$w, $h>(src_ptr, src_pitch, ref_ptr, ref_pitch),
                    (2, $w, $h) => get_sad_u16::<$w, $h>(src_ptr, src_pitch, ref_ptr, ref_pitch),
                )+
                _ => unreachable!("unsupported block size"),
            }
        };
    }

    dispatch![
        (2, 2),
        (2, 4),
        (4, 2),
        (4, 4),
        (4, 8),
        (8, 1),
        (8, 2),
        (8, 4),
        (8, 8),
        (8, 16),
        (16, 1),
        (16, 2),
        (16, 4),
        (16, 8),
        (16, 16),
        (16, 32),
        (32, 8),
        (32, 16),
        (32, 32),
        (32, 64),
        (64, 16),
        (64, 32),
        (64, 64),
        (64, 128),
        (128, 32),
        (128, 64),
        (128, 128),
    ]
}

/// Builds a 256-bit vector whose lower 128-bit lane is `lo` and whose upper lane is `hi`.
#[target_family("x86_64_v3")]
unsafe fn combine_m128i(lo: __m128i, hi: __m128i) -> __m256i {
    // The cast does not define the upper lane; the insert below overwrites it with `hi`.
    let widened = _mm256_castsi128_si256(lo);
    _mm256_inserti128_si256::<1>(widened, hi)
}

/// Gathers up to `32 / WIDTH` rows of `WIDTH` bytes each into a single 256-bit vector.
///
/// Row `k` is read from `ptr + k * pitch` and lands at byte offset `k * WIDTH` of the result.
/// Slots belonging to rows at index `rows` or beyond are zero. Supported widths are 16, 8, 4
/// and 2; any other width reaches `unreachable!`.
///
/// # Safety
/// For every `k < rows`, `WIDTH` bytes starting at `ptr + k * pitch` must be readable, and
/// `rows` must lie in `1..=32 / WIDTH` (only checked by `debug_assert!`).
#[target_family("x86_64_v3")]
unsafe fn pack_u8_rows<const WIDTH: usize>(ptr: *const u8, pitch: usize, rows: usize) -> __m256i {
    debug_assert!(rows > 0);
    debug_assert!(rows <= 32 / WIDTH);

    match WIDTH {
        16 => {
            // One 128-bit lane per row; the first row is always loaded.
            let first = _mm_loadu_si128(ptr.cast());
            let second = match rows {
                0 | 1 => _mm_setzero_si128(),
                _ => _mm_loadu_si128(ptr.add(pitch).cast()),
            };
            combine_m128i(first, second)
        }
        8 => {
            // Four 64-bit rows; the first row is always loaded, the rest only when requested.
            let mut lanes = [_mm_setzero_si128(); 4];
            lanes[0] = _mm_loadl_epi64(ptr.cast());
            for (k, lane) in lanes.iter_mut().enumerate().take(rows).skip(1) {
                *lane = _mm_loadl_epi64(ptr.add(k * pitch).cast());
            }
            let [r0, r1, r2, r3] = lanes;
            combine_m128i(_mm_unpacklo_epi64(r0, r1), _mm_unpacklo_epi64(r2, r3))
        }
        4 => {
            // Eight 32-bit rows staged in memory order, then loaded as one vector.
            let mut words = [0u32; 8];
            words[0] = ptr.cast::<u32>().read_unaligned();
            for (k, word) in words.iter_mut().enumerate().take(rows).skip(1) {
                *word = ptr.add(k * pitch).cast::<u32>().read_unaligned();
            }
            _mm256_loadu_si256(words.as_ptr().cast())
        }
        2 => {
            // Zero-fill unused rows so partial groups do not affect the total.
            let mut pairs = [0u16; 16];
            let mut k = 0;
            while k < rows {
                *semisafe_get_mut(&mut pairs, k) = ptr.add(k * pitch).cast::<u16>().read_unaligned();
                k += 1;
            }
            _mm256_loadu_si256(pairs.as_ptr().cast())
        }
        _ => unreachable!(),
    }
}

/// Gathers up to `16 / WIDTH` rows of `WIDTH` 16-bit pixels each into a single 256-bit vector.
///
/// Row `k` is read from `ptr + k * pitch` (with `pitch` counted in `u16` elements) and lands at
/// element offset `k * WIDTH` of the result. Slots belonging to rows at index `rows` or beyond
/// are zero. Supported widths are 4 and 2; any other width reaches `unreachable!`.
///
/// # Safety
/// For every `k < rows`, `WIDTH` pixels starting at `ptr + k * pitch` must be readable, and
/// `rows` must lie in `1..=16 / WIDTH` (only checked by `debug_assert!`).
#[target_family("x86_64_v3")]
unsafe fn pack_u16_rows<const WIDTH: usize>(ptr: *const u16, pitch: usize, rows: usize) -> __m256i {
    debug_assert!(rows > 0);
    debug_assert!(rows <= 16 / WIDTH);

    match WIDTH {
        4 => {
            // Four 64-bit rows; the first row is always loaded, the rest only when requested.
            let mut lanes = [_mm_setzero_si128(); 4];
            lanes[0] = _mm_loadl_epi64(ptr.cast());
            for (k, lane) in lanes.iter_mut().enumerate().take(rows).skip(1) {
                *lane = _mm_loadl_epi64(ptr.add(k * pitch).cast());
            }
            let [r0, r1, r2, r3] = lanes;
            combine_m128i(_mm_unpacklo_epi64(r0, r1), _mm_unpacklo_epi64(r2, r3))
        }
        2 => {
            // Zero-fill unused rows so partial groups do not affect the total.
            let mut words = [0u32; 8];
            for (k, word) in words.iter_mut().enumerate().take(rows) {
                *word = ptr.add(k * pitch).cast::<u32>().read_unaligned();
            }
            _mm256_loadu_si256(words.as_ptr().cast())
        }
        _ => unreachable!(),
    }
}

/// Sum of absolute differences over a `WIDTH`x`HEIGHT` block of 8-bit pixels.
///
/// `src_pitch` and `ref_pitch` are the byte distances between the starts of consecutive rows
/// of `src` and `ref_`. Widths 128 and 64 are summed row by row with 32-byte loads; any other
/// width of at least 32 takes the single-load 32-wide path. Narrower widths (16, 8, 4, 2) are
/// first packed several rows per vector by `pack_u8_rows`.
///
/// Made pub(crate) to allow direct access for pre-selected function pointers.
///
/// # Safety
/// Every row of both blocks must be readable for the number of bytes the selected path loads.
/// NOTE(review): `target_family("x86_64_v3")` presumably also requires the CPU to support
/// that feature level — confirm against `cpudetect`.
#[must_use]
#[target_family("x86_64_v3")]
// NOTE: Ported from C implementation
pub(crate) unsafe fn get_sad_u8<const WIDTH: usize, const HEIGHT: usize>(
    src: *const u8,
    src_pitch: NonZeroUsize,
    ref_: *const u8,
    ref_pitch: NonZeroUsize,
) -> u64 {
    let src_stride = src_pitch.get();
    let ref_stride = ref_pitch.get();
    let zero = _mm256_setzero_si256();

    // Each arm yields four 64-bit partial sums; they are folded to a scalar at the end.
    let lanes = match WIDTH {
        128 => {
            // One independent accumulator per 32-byte column chunk.
            let mut acc = [zero; 4];
            for row in 0..HEIGHT {
                let s = src.add(row * src_stride);
                let r = ref_.add(row * ref_stride);
                for (k, slot) in acc.iter_mut().enumerate() {
                    let sad = _mm256_sad_epu8(
                        _mm256_loadu_si256(s.add(k * 32).cast()),
                        _mm256_loadu_si256(r.add(k * 32).cast()),
                    );
                    *slot = _mm256_add_epi64(*slot, sad);
                }
            }
            let [a0, a1, a2, a3] = acc;
            _mm256_add_epi64(_mm256_add_epi64(a0, a1), _mm256_add_epi64(a2, a3))
        }
        64 => {
            let mut acc = [zero; 2];
            for row in 0..HEIGHT {
                let s = src.add(row * src_stride);
                let r = ref_.add(row * ref_stride);
                for (k, slot) in acc.iter_mut().enumerate() {
                    let sad = _mm256_sad_epu8(
                        _mm256_loadu_si256(s.add(k * 32).cast()),
                        _mm256_loadu_si256(r.add(k * 32).cast()),
                    );
                    *slot = _mm256_add_epi64(*slot, sad);
                }
            }
            let [a0, a1] = acc;
            _mm256_add_epi64(a0, a1)
        }
        _ if WIDTH >= 32 => {
            // A single load per row; even and odd rows feed separate accumulators.
            let mut even = zero;
            let mut odd = zero;
            for row in 0..HEIGHT {
                let sad = _mm256_sad_epu8(
                    _mm256_loadu_si256(src.add(row * src_stride).cast()),
                    _mm256_loadu_si256(ref_.add(row * ref_stride).cast()),
                );
                if row % 2 == 1 {
                    odd = _mm256_add_epi64(odd, sad);
                } else {
                    even = _mm256_add_epi64(even, sad);
                }
            }
            _mm256_add_epi64(even, odd)
        }
        _ => {
            // Narrow blocks: pack `group` rows into each vector, then handle the leftover rows
            // with a zero-padded final vector.
            let group = 32 / WIDTH;
            let leftover = HEIGHT % group;
            let whole = HEIGHT - leftover;
            let mut acc = zero;

            let mut row = 0;
            while row < whole {
                let s = pack_u8_rows::<WIDTH>(src.add(row * src_stride), src_stride, group);
                let r = pack_u8_rows::<WIDTH>(ref_.add(row * ref_stride), ref_stride, group);
                acc = _mm256_add_epi64(acc, _mm256_sad_epu8(s, r));
                row += group;
            }

            if leftover != 0 {
                let s = pack_u8_rows::<WIDTH>(src.add(whole * src_stride), src_stride, leftover);
                let r = pack_u8_rows::<WIDTH>(ref_.add(whole * ref_stride), ref_stride, leftover);
                acc = _mm256_add_epi64(acc, _mm256_sad_epu8(s, r));
            }

            acc
        }
    };

    // Fold 4x u64 -> 2x u64 -> scalar.
    let folded = _mm_add_epi64(
        _mm256_castsi256_si128(lanes),
        _mm256_extracti128_si256::<1>(lanes),
    );
    let total = _mm_add_epi64(folded, _mm_unpackhi_epi64(folded, folded));
    _mm_cvtsi128_si64(total) as u64
}

/// Sum of absolute differences over a `WIDTH`x`HEIGHT` block of 16-bit pixels.
///
/// `src` and `ref_` arrive as byte pointers and are reinterpreted as `u16` pointers;
/// `src_pitch` and `ref_pitch` are then applied in units of `u16` elements. Widths of 16 and
/// above are summed row by row in 16-pixel steps, width 8 is processed two rows per vector
/// (plus a single-row tail when `HEIGHT` is odd), and widths 4 and 2 are packed several rows
/// per vector by `pack_u16_rows`. Any other width reaches `unreachable!`.
///
/// Partial sums are kept in 32-bit lanes and the final 32-bit total is zero-extended to `u64`.
///
/// Made pub(crate) to allow direct access for pre-selected function pointers.
///
/// # Safety
/// Every row of both blocks must be readable for the number of pixels the selected path loads.
/// NOTE(review): `target_family("x86_64_v3")` presumably also requires the CPU to support
/// that feature level — confirm against `cpudetect`.
#[must_use]
#[target_family("x86_64_v3")]
// NOTE: Ported from C implementation
pub(crate) unsafe fn get_sad_u16<const WIDTH: usize, const HEIGHT: usize>(
    src: *const u8,
    src_pitch: NonZeroUsize,
    ref_: *const u8,
    ref_pitch: NonZeroUsize,
) -> u64 {
    let src = src.cast::<u16>();
    let ref_ = ref_.cast::<u16>();
    let src_stride = src_pitch.get();
    let ref_stride = ref_pitch.get();

    let zero = _mm256_setzero_si256();
    // 32-bit partial sums of the low / high halves of each 128-bit lane.
    let mut lo = zero;
    let mut hi = zero;
    // Extra 4x u32 term, only non-zero for the odd final row of an 8-wide block.
    let mut tail = _mm_setzero_si128();

    // Adds |a - b| per 16-bit lane, widened to 32 bits, into `lo` and `hi`.
    macro_rules! accumulate {
        ($a:expr, $b:expr) => {{
            let (a, b) = ($a, $b);
            let diff = _mm256_sub_epi16(_mm256_max_epu16(a, b), _mm256_min_epu16(a, b));
            lo = _mm256_add_epi32(lo, _mm256_unpacklo_epi16(diff, zero));
            hi = _mm256_add_epi32(hi, _mm256_unpackhi_epi16(diff, zero));
        }};
    }

    match WIDTH {
        _ if WIDTH >= 16 => {
            for row in 0..HEIGHT {
                let s_row = src.add(row * src_stride);
                let r_row = ref_.add(row * ref_stride);
                for col in (0..WIDTH).step_by(16) {
                    accumulate!(
                        _mm256_loadu_si256(s_row.add(col).cast()),
                        _mm256_loadu_si256(r_row.add(col).cast())
                    );
                }
            }
        }
        8 => {
            // Two 8-pixel rows fill one 256-bit vector.
            let leftover = HEIGHT % 2;
            let paired = HEIGHT - leftover;

            let mut row = 0;
            while row < paired {
                let s = combine_m128i(
                    _mm_loadu_si128(src.add(row * src_stride).cast()),
                    _mm_loadu_si128(src.add((row + 1) * src_stride).cast()),
                );
                let r = combine_m128i(
                    _mm_loadu_si128(ref_.add(row * ref_stride).cast()),
                    _mm_loadu_si128(ref_.add((row + 1) * ref_stride).cast()),
                );
                accumulate!(s, r);
                row += 2;
            }

            if leftover != 0 {
                // Odd height: the last row is handled on its own with 128-bit operations.
                let zero128 = _mm_setzero_si128();
                let s = _mm_loadu_si128(src.add(paired * src_stride).cast());
                let r = _mm_loadu_si128(ref_.add(paired * ref_stride).cast());
                let diff = _mm_sub_epi16(_mm_max_epu16(s, r), _mm_min_epu16(s, r));
                tail = _mm_add_epi32(
                    _mm_unpacklo_epi16(diff, zero128),
                    _mm_unpackhi_epi16(diff, zero128),
                );
            }
        }
        4 | 2 => {
            // Pack `group` rows into each vector, then handle the leftover rows with a
            // zero-padded final vector.
            let group = 16 / WIDTH;
            let leftover = HEIGHT % group;
            let whole = HEIGHT - leftover;

            let mut row = 0;
            while row < whole {
                let s = pack_u16_rows::<WIDTH>(src.add(row * src_stride), src_stride, group);
                let r = pack_u16_rows::<WIDTH>(ref_.add(row * ref_stride), ref_stride, group);
                accumulate!(s, r);
                row += group;
            }

            if leftover != 0 {
                let s = pack_u16_rows::<WIDTH>(src.add(whole * src_stride), src_stride, leftover);
                let r = pack_u16_rows::<WIDTH>(ref_.add(whole * ref_stride), ref_stride, leftover);
                accumulate!(s, r);
            }
        }
        _ => unreachable!(),
    }

    // Fold 8x u32 -> 4x u32 (plus the tail term) -> 2x u32 -> scalar.
    let both = _mm256_add_epi32(lo, hi);
    let quad = _mm_add_epi32(
        _mm_add_epi32(
            _mm256_castsi256_si128(both),
            _mm256_extracti128_si256::<1>(both),
        ),
        tail,
    );
    let pair = _mm_add_epi32(quad, _mm_srli_si128::<8>(quad));
    let single = _mm_add_epi32(pair, _mm_srli_epi64::<32>(pair));
    // double cast avoids a `cdqe` instruction
    _mm_cvtsi128_si32(single) as u32 as u64
}