zoomvtools 2.0.0

#![allow(clippy::undocumented_unsafe_blocks)]

use std::{arch::x86_64::*, mem::size_of, num::NonZeroUsize};

use cpudetect::target_family;

use crate::util::Pixel;

#[target_family("x86_64_v3")]
pub(super) unsafe fn reduce_triangle<T: Pixel>(
    dest: &mut [T],
    src: &[T],
    dest_pitch: NonZeroUsize,
    src_pitch: NonZeroUsize,
    dest_width: NonZeroUsize,
    dest_height: NonZeroUsize,
) {
    // Check the array bounds once at the start of the loop.
    debug_assert!(src.len() >= src_pitch.get() * dest_height.get() * 2);
    debug_assert!(dest.len() >= dest_pitch.get() * dest_height.get());

    match size_of::<T>() {
        1 => unsafe {
            reduce_triangle_vertical_u8(
                dest.as_mut_ptr() as *mut u8,
                src.as_ptr() as *const u8,
                dest_pitch,
                src_pitch,
                // SAFETY: non-zero constant
                dest_width.saturating_mul(NonZeroUsize::new_unchecked(2)),
                dest_height,
            );
            reduce_triangle_horizontal_inplace_u8(
                dest.as_mut_ptr() as *mut u8,
                dest_pitch,
                dest_width,
                dest_height,
            );
        },
        2 => unsafe {
            reduce_triangle_vertical_u16(
                dest.as_mut_ptr() as *mut u16,
                src.as_ptr() as *const u16,
                dest_pitch,
                src_pitch,
                // SAFETY: non-zero constant
                dest_width.saturating_mul(NonZeroUsize::new_unchecked(2)),
                dest_height,
            );
            reduce_triangle_horizontal_inplace_u16(
                dest.as_mut_ptr() as *mut u16,
                dest_pitch,
                dest_width,
                dest_height,
            );
        },
        _ => unreachable!(),
    }
}

#[target_family("x86_64_v3")]
// NOTE: Custom implementation. No C implementation exists.
unsafe fn reduce_triangle_vertical_u8(
    dest: *mut u8,
    src: *const u8,
    dest_pitch: NonZeroUsize,
    src_pitch: NonZeroUsize,
    dest_width: NonZeroUsize,
    dest_height: NonZeroUsize,
) {
    let width_usize = dest_width.get();
    let height_usize = dest_height.get();
    let src_pitch_usize = src_pitch.get();
    let dest_pitch_usize = dest_pitch.get();

    // Process first output row: average of first two input rows
    let mut x = 0;
    while x + 32 <= width_usize {
        let a = _mm256_loadu_si256(src.add(x) as *const __m256i);
        let b = _mm256_loadu_si256(src.add(x + src_pitch_usize) as *const __m256i);

        // Convert to u16 for intermediate calculations
        let a_lo = _mm256_unpacklo_epi8(a, _mm256_setzero_si256());
        let a_hi = _mm256_unpackhi_epi8(a, _mm256_setzero_si256());
        let b_lo = _mm256_unpacklo_epi8(b, _mm256_setzero_si256());
        let b_hi = _mm256_unpackhi_epi8(b, _mm256_setzero_si256());

        // Calculate (a + b + 1) / 2
        let ones = _mm256_set1_epi16(1);
        let sum_lo = _mm256_add_epi16(_mm256_add_epi16(a_lo, b_lo), ones);
        let sum_hi = _mm256_add_epi16(_mm256_add_epi16(a_hi, b_hi), ones);
        let result_lo = _mm256_srli_epi16(sum_lo, 1);
        let result_hi = _mm256_srli_epi16(sum_hi, 1);

        // Pack back to u8
        let result = _mm256_packus_epi16(result_lo, result_hi);
        _mm256_storeu_si256(dest.add(x) as *mut __m256i, result);

        x += 32;
    }

    // Handle remaining pixels
    while x < width_usize {
        let a = *src.add(x) as u16;
        let b = *src.add(x + src_pitch_usize) as u16;
        *dest.add(x) = ((a + b + 1) / 2) as u8;
        x += 1;
    }

    // Process remaining output rows: 1/4, 1/2, 1/4 filter
    for y in 1..height_usize {
        let dest_offset = y * dest_pitch_usize;
        let src_offset = y * 2 * src_pitch_usize;

        let mut x = 0;
        while x + 32 <= width_usize {
            let a = _mm256_loadu_si256(src.add(src_offset + x - src_pitch_usize) as *const __m256i);
            let b = _mm256_loadu_si256(src.add(src_offset + x) as *const __m256i);
            let c = _mm256_loadu_si256(src.add(src_offset + x + src_pitch_usize) as *const __m256i);

            // Convert to u16 for intermediate calculations
            let a_lo = _mm256_unpacklo_epi8(a, _mm256_setzero_si256());
            let a_hi = _mm256_unpackhi_epi8(a, _mm256_setzero_si256());
            let b_lo = _mm256_unpacklo_epi8(b, _mm256_setzero_si256());
            let b_hi = _mm256_unpackhi_epi8(b, _mm256_setzero_si256());
            let c_lo = _mm256_unpacklo_epi8(c, _mm256_setzero_si256());
            let c_hi = _mm256_unpackhi_epi8(c, _mm256_setzero_si256());

            // Calculate (a + b * 2 + c + 2) / 4
            let twos = _mm256_set1_epi16(2);
            let b2_lo = _mm256_slli_epi16(b_lo, 1);
            let b2_hi = _mm256_slli_epi16(b_hi, 1);
            let sum_lo =
                _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(a_lo, b2_lo), c_lo), twos);
            let sum_hi =
                _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(a_hi, b2_hi), c_hi), twos);
            let result_lo = _mm256_srli_epi16(sum_lo, 2);
            let result_hi = _mm256_srli_epi16(sum_hi, 2);

            // Pack back to u8
            let result = _mm256_packus_epi16(result_lo, result_hi);
            _mm256_storeu_si256(dest.add(dest_offset + x) as *mut __m256i, result);

            x += 32;
        }

        // Handle remaining pixels
        while x < width_usize {
            let a = *src.add(src_offset + x - src_pitch_usize) as u16;
            let b = *src.add(src_offset + x) as u16;
            let c = *src.add(src_offset + x + src_pitch_usize) as u16;
            *dest.add(dest_offset + x) = ((a + b * 2 + c + 2) / 4) as u8;
            x += 1;
        }
    }
}

#[target_family("x86_64_v3")]
// NOTE: Custom implementation. No C implementation exists.
unsafe fn reduce_triangle_horizontal_inplace_u8(
    dest: *mut u8,
    dest_pitch: NonZeroUsize,
    dest_width: NonZeroUsize,
    dest_height: NonZeroUsize,
) {
    let width_usize = dest_width.get();
    let height_usize = dest_height.get();
    let dest_pitch_usize = dest_pitch.get();

    let w_ab = _mm256_set1_epi16(0x0201);
    let w_c = _mm256_set1_epi16(0x0001);
    let round16 = _mm256_set1_epi16(2);

    for y in 0..height_usize {
        let row_offset = y * dest_pitch_usize;

        // First pixel: simple average
        let b = *dest.add(row_offset) as u16;
        let c = *dest.add(row_offset + 1) as u16;
        let src0 = ((b + c + 1) / 2) as u8;

        let mut x = 1;

        while x + 16 < width_usize {
            let ab = _mm256_loadu_si256(dest.add(row_offset + x * 2 - 1) as *const __m256i);
            let c0 = _mm256_loadu_si256(dest.add(row_offset + x * 2 + 1) as *const __m256i);

            let term_ab = _mm256_maddubs_epi16(ab, w_ab);
            let term_c = _mm256_maddubs_epi16(c0, w_c);
            let sum = _mm256_add_epi16(_mm256_add_epi16(term_ab, term_c), round16);
            let reduced = _mm256_srli_epi16(sum, 2);

            let packed = _mm256_packus_epi16(reduced, reduced);
            let reordered = _mm256_permute4x64_epi64(packed, 0b1101_1000);
            _mm_storeu_si128(
                dest.add(row_offset + x) as *mut __m128i,
                _mm256_castsi256_si128(reordered),
            );

            x += 16;
        }

        while x < width_usize {
            let pixel_offset = row_offset + x * 2 - 1;
            let a = *dest.add(pixel_offset) as u16;
            let b = *dest.add(pixel_offset + 1) as u16;
            let c = *dest.add(pixel_offset + 2) as u16;
            *dest.add(row_offset + x) = ((a + b * 2 + c + 2) / 4) as u8;
            x += 1;
        }

        // Store the first pixel
        *dest.add(row_offset) = src0;
    }
}

#[target_family("x86_64_v3")]
// NOTE: Custom implementation. No C implementation exists.
unsafe fn reduce_triangle_vertical_u16(
    dest: *mut u16,
    src: *const u16,
    dest_pitch: NonZeroUsize,
    src_pitch: NonZeroUsize,
    dest_width: NonZeroUsize,
    dest_height: NonZeroUsize,
) {
    let width_usize = dest_width.get();
    let height_usize = dest_height.get();
    let src_pitch_usize = src_pitch.get();
    let dest_pitch_usize = dest_pitch.get();

    // Process first output row: average of first two input rows
    let mut x = 0;
    while x + 16 <= width_usize {
        let a = _mm256_loadu_si256(src.add(x) as *const __m256i);
        let b = _mm256_loadu_si256(src.add(x + src_pitch_usize) as *const __m256i);

        // Convert to u32 for intermediate calculations
        let a_lo = _mm256_unpacklo_epi16(a, _mm256_setzero_si256());
        let a_hi = _mm256_unpackhi_epi16(a, _mm256_setzero_si256());
        let b_lo = _mm256_unpacklo_epi16(b, _mm256_setzero_si256());
        let b_hi = _mm256_unpackhi_epi16(b, _mm256_setzero_si256());

        // Calculate (a + b + 1) / 2
        let ones = _mm256_set1_epi32(1);
        let sum_lo = _mm256_add_epi32(_mm256_add_epi32(a_lo, b_lo), ones);
        let sum_hi = _mm256_add_epi32(_mm256_add_epi32(a_hi, b_hi), ones);
        let result_lo = _mm256_srli_epi32(sum_lo, 1);
        let result_hi = _mm256_srli_epi32(sum_hi, 1);

        // Pack back to u16
        let result = _mm256_packus_epi32(result_lo, result_hi);
        _mm256_storeu_si256(dest.add(x) as *mut __m256i, result);

        x += 16;
    }

    // Handle remaining pixels
    while x < width_usize {
        let a = *src.add(x) as u32;
        let b = *src.add(x + src_pitch_usize) as u32;
        *dest.add(x) = ((a + b + 1) / 2) as u16;
        x += 1;
    }

    // Process remaining output rows: 1/4, 1/2, 1/4 filter
    for y in 1..height_usize {
        let dest_offset = y * dest_pitch_usize;
        let src_offset = y * 2 * src_pitch_usize;

        let mut x = 0;
        while x + 16 <= width_usize {
            let a = _mm256_loadu_si256(src.add(src_offset + x - src_pitch_usize) as *const __m256i);
            let b = _mm256_loadu_si256(src.add(src_offset + x) as *const __m256i);
            let c = _mm256_loadu_si256(src.add(src_offset + x + src_pitch_usize) as *const __m256i);

            // Convert to u32 for intermediate calculations
            let a_lo = _mm256_unpacklo_epi16(a, _mm256_setzero_si256());
            let a_hi = _mm256_unpackhi_epi16(a, _mm256_setzero_si256());
            let b_lo = _mm256_unpacklo_epi16(b, _mm256_setzero_si256());
            let b_hi = _mm256_unpackhi_epi16(b, _mm256_setzero_si256());
            let c_lo = _mm256_unpacklo_epi16(c, _mm256_setzero_si256());
            let c_hi = _mm256_unpackhi_epi16(c, _mm256_setzero_si256());

            // Calculate (a + b * 2 + c + 2) / 4
            let twos = _mm256_set1_epi32(2);
            let b2_lo = _mm256_slli_epi32(b_lo, 1);
            let b2_hi = _mm256_slli_epi32(b_hi, 1);
            let sum_lo =
                _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(a_lo, b2_lo), c_lo), twos);
            let sum_hi =
                _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(a_hi, b2_hi), c_hi), twos);
            let result_lo = _mm256_srli_epi32(sum_lo, 2);
            let result_hi = _mm256_srli_epi32(sum_hi, 2);

            // Pack back to u16
            let result = _mm256_packus_epi32(result_lo, result_hi);
            _mm256_storeu_si256(dest.add(dest_offset + x) as *mut __m256i, result);

            x += 16;
        }

        // Handle remaining pixels
        while x < width_usize {
            let a = *src.add(src_offset + x - src_pitch_usize) as u32;
            let b = *src.add(src_offset + x) as u32;
            let c = *src.add(src_offset + x + src_pitch_usize) as u32;
            *dest.add(dest_offset + x) = ((a + b * 2 + c + 2) / 4) as u16;
            x += 1;
        }
    }
}

#[target_family("x86_64_v3")]
// NOTE: Custom implementation. No C implementation exists.
unsafe fn reduce_triangle_horizontal_inplace_u16(
    dest: *mut u16,
    dest_pitch: NonZeroUsize,
    dest_width: NonZeroUsize,
    dest_height: NonZeroUsize,
) {
    let width_usize = dest_width.get();
    let height_usize = dest_height.get();
    let dest_pitch_usize = dest_pitch.get();

    let sign_bit = _mm256_set1_epi16(i16::MIN);
    let w_ab = _mm256_set1_epi32(0x0002_0001);
    let w_c = _mm256_set1_epi32(0x0000_0001);
    let round32 = _mm256_set1_epi32(131_074);

    for y in 0..height_usize {
        let row_offset = y * dest_pitch_usize;

        // First pixel: simple average
        let b = *dest.add(row_offset) as u32;
        let c = *dest.add(row_offset + 1) as u32;
        let src0 = ((b + c + 1) / 2) as u16;

        let mut x = 1;

        while x + 8 < width_usize {
            let ab = _mm256_loadu_si256(dest.add(row_offset + x * 2 - 1) as *const __m256i);
            let c0 = _mm256_loadu_si256(dest.add(row_offset + x * 2 + 1) as *const __m256i);

            let term_ab = _mm256_madd_epi16(_mm256_xor_si256(ab, sign_bit), w_ab);
            let term_c = _mm256_madd_epi16(_mm256_xor_si256(c0, sign_bit), w_c);
            let reduced = _mm256_srli_epi32(
                _mm256_add_epi32(_mm256_add_epi32(term_ab, term_c), round32),
                2,
            );

            let packed = _mm256_packus_epi32(reduced, reduced);
            let reordered = _mm256_permute4x64_epi64(packed, 0b1101_1000);
            _mm_storeu_si128(
                dest.add(row_offset + x) as *mut __m128i,
                _mm256_castsi256_si128(reordered),
            );

            x += 8;
        }

        while x < width_usize {
            let pixel_offset = row_offset + x * 2 - 1;
            let a = *dest.add(pixel_offset) as u32;
            let b = *dest.add(pixel_offset + 1) as u32;
            let c = *dest.add(pixel_offset + 2) as u32;
            *dest.add(row_offset + x) = ((a + b * 2 + c + 2) / 4) as u16;
            x += 1;
        }

        // Store the first pixel
        *dest.add(row_offset) = src0;
    }
}