// zoomvtools 1.1.0

#![allow(clippy::undocumented_unsafe_blocks)]

use std::arch::x86_64::*;
use std::num::{NonZeroU8, NonZeroUsize};

use crate::util::Pixel;

/// Applies a 4-tap weighted filter to 32 packed `u8` samples.
///
/// `a`/`b` and `c`/`d` are byte-interleaved and multiplied against the packed
/// signed byte weights in `w_ab` and `w_cd` (`_mm256_maddubs_epi16`), the two
/// partial sums are added, `eight` is added as the rounding term, the result is
/// shifted right by 4 and finally packed back to `u8` with unsigned saturation.
///
/// The callers in this file pass weights `(-1, 9)` / `(9, -1)` and `eight = 8`,
/// which yields `(-(a + d) + 9 * (b + c) + 8) >> 4` clamped to `0..=255`.
///
/// # Safety
///
/// The CPU executing this function must support AVX2.
#[target_feature(enable = "avx2")]
#[inline]
pub(super) unsafe fn apply_bicubic_kernel_u8_avx2(
    a: __m256i,
    b: __m256i,
    c: __m256i,
    d: __m256i,
    w_ab: __m256i,
    w_cd: __m256i,
    eight: __m256i,
) -> __m256i {
    // Weighted tap sums for the lower eight bytes of each 128-bit lane.
    let taps_lo = _mm256_add_epi16(
        _mm256_maddubs_epi16(_mm256_unpacklo_epi8(a, b), w_ab),
        _mm256_maddubs_epi16(_mm256_unpacklo_epi8(c, d), w_cd),
    );
    // Weighted tap sums for the upper eight bytes of each 128-bit lane.
    let taps_hi = _mm256_add_epi16(
        _mm256_maddubs_epi16(_mm256_unpackhi_epi8(a, b), w_ab),
        _mm256_maddubs_epi16(_mm256_unpackhi_epi8(c, d), w_cd),
    );

    // Round and scale back down.
    let rounded_lo = _mm256_srai_epi16::<4>(_mm256_add_epi16(taps_lo, eight));
    let rounded_hi = _mm256_srai_epi16::<4>(_mm256_add_epi16(taps_hi, eight));

    // Both the unpack and the pack work per 128-bit lane, so the original
    // sample order is restored here without an extra permute.
    _mm256_packus_epi16(rounded_lo, rounded_hi)
}

/// Applies a 4-tap weighted filter to 16 packed 16-bit samples using
/// `_mm256_madd_epi16`.
///
/// `a`/`b` and `c`/`d` are interleaved as 16-bit pairs and multiplied against
/// the packed weights in `w_ab` and `w_cd`; the 32-bit partial sums are added,
/// rounded with `eight`, shifted right by 4, clamped to `0..=pixel_max` and
/// packed back to 16 bits.
///
/// `_mm256_madd_epi16` interprets its inputs as signed 16-bit values, which is
/// why the callers in this file only take this path when `bits_per_sample` is
/// at most 15.
///
/// # Safety
///
/// The CPU executing this function must support AVX2.
#[target_feature(enable = "avx2")]
#[inline]
pub(super) unsafe fn apply_bicubic_kernel_u16_fast_avx2(
    a: __m256i,
    b: __m256i,
    c: __m256i,
    d: __m256i,
    w_ab: __m256i,
    w_cd: __m256i,
    eight: __m256i,
    pixel_max: __m256i,
) -> __m256i {
    let floor = _mm256_setzero_si256();

    // Weighted tap sums for the lower four samples of each 128-bit lane.
    let taps_lo = _mm256_add_epi32(
        _mm256_madd_epi16(_mm256_unpacklo_epi16(a, b), w_ab),
        _mm256_madd_epi16(_mm256_unpacklo_epi16(c, d), w_cd),
    );
    // Weighted tap sums for the upper four samples of each 128-bit lane.
    let taps_hi = _mm256_add_epi32(
        _mm256_madd_epi16(_mm256_unpackhi_epi16(a, b), w_ab),
        _mm256_madd_epi16(_mm256_unpackhi_epi16(c, d), w_cd),
    );

    // Round, scale down, then clamp into the valid sample range.
    let rounded_lo = _mm256_srai_epi32::<4>(_mm256_add_epi32(taps_lo, eight));
    let rounded_hi = _mm256_srai_epi32::<4>(_mm256_add_epi32(taps_hi, eight));
    let bounded_lo = _mm256_min_epi32(_mm256_max_epi32(rounded_lo, floor), pixel_max);
    let bounded_hi = _mm256_min_epi32(_mm256_max_epi32(rounded_hi, floor), pixel_max);

    // Unpack and pack are both lane-local, so sample order is already correct.
    _mm256_packus_epi32(bounded_lo, bounded_hi)
}

/// Applies the 4-tap filter to 16 samples that were already widened to 32 bits.
///
/// Each `*_lo` / `*_hi` pair holds eight 32-bit values. Per element this
/// computes `((b + c) * nine - (a + d) + eight) >> 4`, clamps the result to
/// `0..=pixel_max` and packs the sixteen results into one vector of 16-bit
/// values, with the `*_lo` results first and the `*_hi` results second.
///
/// The callers in this file use this path for sample depths above 15 bits,
/// where the signed 16-bit multiply of the fast kernel is not usable.
///
/// # Safety
///
/// The CPU executing this function must support AVX2.
#[target_feature(enable = "avx2")]
#[inline]
pub(super) unsafe fn apply_bicubic_kernel_u16_exact_avx2(
    a_lo: __m256i,
    a_hi: __m256i,
    b_lo: __m256i,
    b_hi: __m256i,
    c_lo: __m256i,
    c_hi: __m256i,
    d_lo: __m256i,
    d_hi: __m256i,
    nine: __m256i,
    eight: __m256i,
    pixel_max: __m256i,
) -> __m256i {
    let floor = _mm256_setzero_si256();

    // Inner taps are weighted, outer taps are subtracted unweighted.
    let inner_lo = _mm256_mullo_epi32(_mm256_add_epi32(b_lo, c_lo), nine);
    let inner_hi = _mm256_mullo_epi32(_mm256_add_epi32(b_hi, c_hi), nine);
    let outer_lo = _mm256_add_epi32(a_lo, d_lo);
    let outer_hi = _mm256_add_epi32(a_hi, d_hi);

    let rounded_lo =
        _mm256_srai_epi32::<4>(_mm256_add_epi32(_mm256_sub_epi32(inner_lo, outer_lo), eight));
    let rounded_hi =
        _mm256_srai_epi32::<4>(_mm256_add_epi32(_mm256_sub_epi32(inner_hi, outer_hi), eight));

    let bounded_lo = _mm256_min_epi32(_mm256_max_epi32(rounded_lo, floor), pixel_max);
    let bounded_hi = _mm256_min_epi32(_mm256_max_epi32(rounded_hi, floor), pixel_max);

    // The pack is lane-local and therefore leaves the 64-bit quarters ordered
    // as [lo 0-3, hi 0-3, lo 4-7, hi 4-7]; swapping the two middle quarters
    // puts the samples back into linear order.
    let packed = _mm256_packus_epi32(bounded_lo, bounded_hi);
    _mm256_permute4x64_epi64::<0b11_01_10_00>(packed)
}

/// Runs the horizontal bicubic refinement, selecting the implementation from
/// the byte size of `T`.
///
/// One-byte samples are forwarded to `refine_horizontal_bicubic_u8` and
/// two-byte samples to `refine_horizontal_bicubic_u16`; the slices are handed
/// over as raw pointers reinterpreted to the matching integer type. Any other
/// sample size hits `unreachable!()`.
///
/// # Safety
///
/// The CPU must support AVX2. The callee indexes through raw pointers without
/// bounds checks, so `src` and `dest` must be large enough for `height` rows of
/// `width` samples spaced `pitch` samples apart.
#[target_feature(enable = "avx2")]
pub(super) unsafe fn refine_horizontal_bicubic<T: Pixel>(
    dest: &mut [T],
    src: &[T],
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    bits_per_sample: NonZeroU8,
) {
    let src_ptr = src.as_ptr();
    let dest_ptr = dest.as_mut_ptr();
    let sample_bytes = size_of::<T>();

    if sample_bytes == 1 {
        unsafe {
            refine_horizontal_bicubic_u8(
                src_ptr.cast::<u8>(),
                dest_ptr.cast::<u8>(),
                pitch,
                width,
                height,
                bits_per_sample,
            );
        }
    } else if sample_bytes == 2 {
        unsafe {
            refine_horizontal_bicubic_u16(
                src_ptr.cast::<u16>(),
                dest_ptr.cast::<u16>(),
                pitch,
                width,
                height,
                bits_per_sample,
            );
        }
    } else {
        unreachable!()
    }
}

/// Runs the vertical bicubic refinement, selecting the implementation from the
/// byte size of `T`.
///
/// One-byte samples are forwarded to `refine_vertical_bicubic_u8` and two-byte
/// samples to `refine_vertical_bicubic_u16`; the slices are handed over as raw
/// pointers reinterpreted to the matching integer type. Any other sample size
/// hits `unreachable!()`.
///
/// # Safety
///
/// The CPU must support AVX2. The callee indexes through raw pointers without
/// bounds checks, so `src` and `dest` must be large enough for `height` rows of
/// `width` samples spaced `pitch` samples apart.
#[target_feature(enable = "avx2")]
pub(super) unsafe fn refine_vertical_bicubic<T: Pixel>(
    dest: &mut [T],
    src: &[T],
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    bits_per_sample: NonZeroU8,
) {
    let src_ptr = src.as_ptr();
    let dest_ptr = dest.as_mut_ptr();
    let sample_bytes = size_of::<T>();

    if sample_bytes == 1 {
        unsafe {
            refine_vertical_bicubic_u8(
                src_ptr.cast::<u8>(),
                dest_ptr.cast::<u8>(),
                pitch,
                width,
                height,
                bits_per_sample,
            );
        }
    } else if sample_bytes == 2 {
        unsafe {
            refine_vertical_bicubic_u16(
                src_ptr.cast::<u16>(),
                dest_ptr.cast::<u16>(),
                pitch,
                width,
                height,
                bits_per_sample,
            );
        }
    } else {
        unreachable!()
    }
}

/// Horizontal bicubic refinement for 8-bit samples.
///
/// For every row, output sample `i` is produced as follows:
/// * `i == 0`: rounded average of `src[0]` and `src[1]`;
/// * `1 <= i < width - 3`: `(-(s[i-1] + s[i+2]) + 9 * (s[i] + s[i+1]) + 8) >> 4`,
///   clamped to the valid sample range (32 samples at a time with AVX2, then a
///   scalar tail);
/// * `width - 3 <= i < width - 1`: rounded average of `src[i]` and `src[i+1]`;
/// * `i == width - 1`: copy of `src[i]`.
///
/// Rows narrower than four samples have no bicubic region; the region bounds
/// are computed with saturating arithmetic so such rows fall through to the
/// averaging/copy steps instead of underflowing.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * `src` must be readable and `dest` writable for `height` rows of `width`
///   samples, with consecutive rows `pitch` samples apart.
#[target_feature(enable = "avx2")]
// NOTE: Custom implementation. No C implementation exists.
unsafe fn refine_horizontal_bicubic_u8(
    src: *const u8,
    dest: *mut u8,
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    bits_per_sample: NonZeroU8,
) {
    let pixel_max = (1u16 << bits_per_sample.get()) - 1;
    let width_val = width.get();
    let pitch_val = pitch.get();
    // Packed signed byte weights for `_mm256_maddubs_epi16`: (-1, 9) for the
    // interleaved (a, b) pairs and (9, -1) for the interleaved (c, d) pairs.
    let w_ab = _mm256_set1_epi16(((9u16 << 8) | ((-1i8 as u8) as u16)) as i16);
    let w_cd = _mm256_set1_epi16(((((-1i8 as u8) as u16) << 8) | 9u16) as i16);
    let eight = _mm256_set1_epi16(8);

    // Saturating: a plain `width - 3` / `width - 4` underflows for narrow rows
    // (panic in debug builds, out-of-bounds SIMD loop in release builds).
    let bicubic_end = width_val.saturating_sub(3);
    let simd_end = 1 + (width_val.saturating_sub(4) / 32) * 32;

    for j in 0..height.get() {
        let row_offset = j * pitch_val;
        let src_row = src.add(row_offset);
        let dest_row = dest.add(row_offset);

        // First pixel: linear interpolation (needs a right-hand neighbour).
        if width_val > 1 {
            let a = *src_row.add(0) as u16;
            let b = *src_row.add(1) as u16;
            *dest_row.add(0) = ((a + b + 1) / 2) as u8;
        }

        // The last vector iteration reads up to index `i + 33`, which stays
        // at or below `width - 2` by construction of `simd_end`.
        let mut i = 1;
        while i < simd_end {
            let a = _mm256_loadu_si256(src_row.add(i - 1).cast::<__m256i>());
            let b = _mm256_loadu_si256(src_row.add(i).cast::<__m256i>());
            let c = _mm256_loadu_si256(src_row.add(i + 1).cast::<__m256i>());
            let d = _mm256_loadu_si256(src_row.add(i + 2).cast::<__m256i>());
            let result = apply_bicubic_kernel_u8_avx2(a, b, c, d, w_ab, w_cd, eight);
            _mm256_storeu_si256(dest_row.add(i).cast::<__m256i>(), result);
            i += 32;
        }

        // Scalar tail of the bicubic region.
        for i in i..bicubic_end {
            let a = *src_row.add(i - 1) as i16;
            let b = *src_row.add(i) as i16;
            let c = *src_row.add(i + 1) as i16;
            let d = *src_row.add(i + 2) as i16;
            let result = (-(a + d) + (b + c) * 9 + 8) >> 4;
            *dest_row.add(i) = std::cmp::min(pixel_max, std::cmp::max(0, result) as u16) as u8;
        }

        // Second-to-last pixels: linear interpolation
        for i in bicubic_end..(width_val - 1) {
            let a = *src_row.add(i) as u16;
            let b = *src_row.add(i + 1) as u16;
            *dest_row.add(i) = ((a + b + 1) / 2) as u8;
        }

        // Last pixel: copy
        *dest_row.add(width_val - 1) = *src_row.add(width_val - 1);
    }
}

/// Horizontal bicubic refinement for 16-bit samples.
///
/// For every row, output sample `i` is produced as follows:
/// * `i == 0`: rounded average of `src[0]` and `src[1]`;
/// * `1 <= i < width - 3`: `(-(s[i-1] + s[i+2]) + 9 * (s[i] + s[i+1]) + 8) >> 4`,
///   clamped to `0..=(1 << bits_per_sample) - 1` (16 samples at a time with
///   AVX2, then a scalar tail);
/// * `width - 3 <= i < width - 1`: rounded average of `src[i]` and `src[i+1]`;
/// * `i == width - 1`: copy of `src[i]`.
///
/// Rows narrower than four samples have no bicubic region; the region bounds
/// are computed with saturating arithmetic so such rows fall through to the
/// averaging/copy steps instead of underflowing.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * `src` must be readable and `dest` writable for `height` rows of `width`
///   samples, with consecutive rows `pitch` samples apart.
#[target_feature(enable = "avx2")]
// NOTE: Custom implementation. No C implementation exists.
unsafe fn refine_horizontal_bicubic_u16(
    src: *const u16,
    dest: *mut u16,
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    bits_per_sample: NonZeroU8,
) {
    let pixel_max = (1u32 << bits_per_sample.get()) - 1;
    let width_val = width.get();
    let pitch_val = pitch.get();
    // The fast kernel multiplies samples as signed 16-bit values, so it is
    // only used when every sample fits in 15 bits.
    let use_fast = bits_per_sample.get() <= 15;
    // Packed 16-bit weights for `_mm256_madd_epi16`: (-1, 9) for the
    // interleaved (a, b) pairs and (9, -1) for the interleaved (c, d) pairs.
    let w_ab = _mm256_set1_epi32(((9i32) << 16) | (((-1i32) as u32) & 0xFFFF) as i32);
    let w_cd = _mm256_set1_epi32((((((-1i32) as u32) & 0xFFFF) << 16) | 9u32) as i32);
    let eight = _mm256_set1_epi32(8);
    let nine = _mm256_set1_epi32(9);
    let pixel_max_vec = _mm256_set1_epi32(pixel_max as i32);

    // Saturating: a plain `width - 3` / `width - 4` underflows for narrow rows
    // (panic in debug builds, out-of-bounds SIMD loop in release builds).
    let bicubic_end = width_val.saturating_sub(3);
    let simd_end = 1 + (width_val.saturating_sub(4) / 16) * 16;

    for j in 0..height.get() {
        let row_offset = j * pitch_val;
        let src_row = src.add(row_offset);
        let dest_row = dest.add(row_offset);

        // First pixel: linear interpolation (needs a right-hand neighbour).
        if width_val > 1 {
            let a = *src_row.add(0) as u32;
            let b = *src_row.add(1) as u32;
            *dest_row.add(0) = ((a + b + 1) / 2) as u16;
        }

        // The last vector iteration reads up to index `i + 17`, which stays
        // at or below `width - 2` by construction of `simd_end`.
        let mut i = 1;
        if use_fast {
            while i < simd_end {
                let a = _mm256_loadu_si256(src_row.add(i - 1).cast::<__m256i>());
                let b = _mm256_loadu_si256(src_row.add(i).cast::<__m256i>());
                let c = _mm256_loadu_si256(src_row.add(i + 1).cast::<__m256i>());
                let d = _mm256_loadu_si256(src_row.add(i + 2).cast::<__m256i>());
                let result = apply_bicubic_kernel_u16_fast_avx2(
                    a,
                    b,
                    c,
                    d,
                    w_ab,
                    w_cd,
                    eight,
                    pixel_max_vec,
                );
                _mm256_storeu_si256(dest_row.add(i).cast::<__m256i>(), result);
                i += 16;
            }
        } else {
            while i < simd_end {
                let a = _mm256_loadu_si256(src_row.add(i - 1).cast::<__m256i>());
                let b = _mm256_loadu_si256(src_row.add(i).cast::<__m256i>());
                let c = _mm256_loadu_si256(src_row.add(i + 1).cast::<__m256i>());
                let d = _mm256_loadu_si256(src_row.add(i + 2).cast::<__m256i>());
                // Zero-extend each half to 32 bits so the arithmetic is exact
                // for full 16-bit samples.
                let a_lo = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(a));
                let a_hi = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(a, 1));
                let b_lo = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b));
                let b_hi = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(b, 1));
                let c_lo = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(c));
                let c_hi = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(c, 1));
                let d_lo = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(d));
                let d_hi = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(d, 1));
                let result = apply_bicubic_kernel_u16_exact_avx2(
                    a_lo,
                    a_hi,
                    b_lo,
                    b_hi,
                    c_lo,
                    c_hi,
                    d_lo,
                    d_hi,
                    nine,
                    eight,
                    pixel_max_vec,
                );
                _mm256_storeu_si256(dest_row.add(i).cast::<__m256i>(), result);
                i += 16;
            }
        }

        // Scalar tail of the bicubic region.
        for i in i..bicubic_end {
            let a = *src_row.add(i - 1) as i32;
            let b = *src_row.add(i) as i32;
            let c = *src_row.add(i + 1) as i32;
            let d = *src_row.add(i + 2) as i32;
            let result = (-(a + d) + (b + c) * 9 + 8) >> 4;
            *dest_row.add(i) = std::cmp::min(pixel_max, std::cmp::max(0, result) as u32) as u16;
        }

        // Second-to-last pixels: linear interpolation
        for i in bicubic_end..(width_val - 1) {
            let a = *src_row.add(i) as u32;
            let b = *src_row.add(i + 1) as u32;
            *dest_row.add(i) = ((a + b + 1) / 2) as u16;
        }

        // Last pixel: copy
        *dest_row.add(width_val - 1) = *src_row.add(width_val - 1);
    }
}

/// Vertical bicubic refinement for 8-bit samples.
///
/// Output row `j` is produced as follows:
/// * `j == 0`: rounded average of rows 0 and 1;
/// * `1 <= j < height - 3`:
///   `(-(r[j-1] + r[j+2]) + 9 * (r[j] + r[j+1]) + 8) >> 4` per column, clamped
///   to the valid sample range (32 columns at a time with AVX2, then a scalar
///   tail);
/// * `height - 3 <= j < height - 1`: rounded average of rows `j` and `j + 1`;
/// * `j == height - 1`: copy of the source row.
///
/// Planes shorter than four rows have no bicubic region; the region bound is
/// computed with saturating arithmetic so such planes fall through to the
/// averaging/copy steps instead of underflowing.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * `src` must be readable and `dest` writable for `height` rows of `width`
///   samples, with consecutive rows `pitch` samples apart.
/// * `src` and `dest` must not overlap (the last row is copied with
///   `copy_nonoverlapping`).
#[target_feature(enable = "avx2")]
// NOTE: Custom implementation. No C implementation exists.
unsafe fn refine_vertical_bicubic_u8(
    src: *const u8,
    dest: *mut u8,
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    bits_per_sample: NonZeroU8,
) {
    let pixel_max = (1u16 << bits_per_sample.get()) - 1;
    let width_val = width.get();
    let pitch_val = pitch.get();
    let height_val = height.get();
    // Packed signed byte weights for `_mm256_maddubs_epi16`: (-1, 9) for the
    // interleaved (a, b) pairs and (9, -1) for the interleaved (c, d) pairs.
    let w_ab = _mm256_set1_epi16(((9u16 << 8) | ((-1i8 as u8) as u16)) as i16);
    let w_cd = _mm256_set1_epi16(((((-1i8 as u8) as u16) << 8) | 9u16) as i16);
    let eight = _mm256_set1_epi16(8);

    // Saturating: a plain `height - 3` underflows for planes with fewer than
    // three rows (panic in debug builds, out-of-bounds row loop in release).
    let bicubic_end = height_val.saturating_sub(3);
    let simd_end = (width_val / 32) * 32;

    // First row: linear interpolation (needs a row below it).
    if height_val > 1 {
        for i in 0..width_val {
            let a = *src.add(i) as u16;
            let b = *src.add(i + pitch_val) as u16;
            *dest.add(i) = ((a + b + 1) / 2) as u8;
        }
    }

    // Middle rows: bicubic interpolation
    for j in 1..bicubic_end {
        let offset = j * pitch_val;
        let src_prev = src.add(offset - pitch_val);
        let src_curr = src.add(offset);
        let src_next = src.add(offset + pitch_val);
        let src_next2 = src.add(offset + pitch_val * 2);
        let dest_row = dest.add(offset);

        let mut i = 0;
        while i < simd_end {
            let a = _mm256_loadu_si256(src_prev.add(i).cast::<__m256i>());
            let b = _mm256_loadu_si256(src_curr.add(i).cast::<__m256i>());
            let c = _mm256_loadu_si256(src_next.add(i).cast::<__m256i>());
            let d = _mm256_loadu_si256(src_next2.add(i).cast::<__m256i>());
            let result = apply_bicubic_kernel_u8_avx2(a, b, c, d, w_ab, w_cd, eight);
            _mm256_storeu_si256(dest_row.add(i).cast::<__m256i>(), result);
            i += 32;
        }

        // Scalar tail for the columns that do not fill a whole vector.
        for i in i..width_val {
            let a = *src_prev.add(i) as i16;
            let b = *src_curr.add(i) as i16;
            let c = *src_next.add(i) as i16;
            let d = *src_next2.add(i) as i16;
            let result = (-(a + d) + (b + c) * 9 + 8) >> 4;
            *dest_row.add(i) = std::cmp::min(pixel_max, std::cmp::max(0, result) as u16) as u8;
        }
    }

    // Second-to-last rows: linear interpolation
    for j in bicubic_end..(height_val - 1) {
        let offset = j * pitch_val;

        for i in 0..width_val {
            let a = *src.add(offset + i) as u16;
            let b = *src.add(offset + i + pitch_val) as u16;
            *dest.add(offset + i) = ((a + b + 1) / 2) as u8;
        }
    }

    // Last row: copy
    let last_offset = (height_val - 1) * pitch_val;
    std::ptr::copy_nonoverlapping(src.add(last_offset), dest.add(last_offset), width_val);
}

/// Vertical bicubic refinement for 16-bit samples.
///
/// Output row `j` is produced as follows:
/// * `j == 0`: rounded average of rows 0 and 1;
/// * `1 <= j < height - 3`:
///   `(-(r[j-1] + r[j+2]) + 9 * (r[j] + r[j+1]) + 8) >> 4` per column, clamped
///   to `0..=(1 << bits_per_sample) - 1` (16 columns at a time with AVX2, then
///   a scalar tail);
/// * `height - 3 <= j < height - 1`: rounded average of rows `j` and `j + 1`;
/// * `j == height - 1`: copy of the source row.
///
/// Planes shorter than four rows have no bicubic region; the region bound is
/// computed with saturating arithmetic so such planes fall through to the
/// averaging/copy steps instead of underflowing.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * `src` must be readable and `dest` writable for `height` rows of `width`
///   samples, with consecutive rows `pitch` samples apart.
/// * `src` and `dest` must not overlap (the last row is copied with
///   `copy_nonoverlapping`).
#[target_feature(enable = "avx2")]
// NOTE: Custom implementation. No C implementation exists.
unsafe fn refine_vertical_bicubic_u16(
    src: *const u16,
    dest: *mut u16,
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    bits_per_sample: NonZeroU8,
) {
    let pixel_max = (1u32 << bits_per_sample.get()) - 1;
    let width_val = width.get();
    let pitch_val = pitch.get();
    let height_val = height.get();
    // The fast kernel multiplies samples as signed 16-bit values, so it is
    // only used when every sample fits in 15 bits.
    let use_fast = bits_per_sample.get() <= 15;
    // Packed 16-bit weights for `_mm256_madd_epi16`: (-1, 9) for the
    // interleaved (a, b) pairs and (9, -1) for the interleaved (c, d) pairs.
    let w_ab = _mm256_set1_epi32(((9i32) << 16) | (((-1i32) as u32) & 0xFFFF) as i32);
    let w_cd = _mm256_set1_epi32((((((-1i32) as u32) & 0xFFFF) << 16) | 9u32) as i32);
    let eight = _mm256_set1_epi32(8);
    let nine = _mm256_set1_epi32(9);
    let pixel_max_vec = _mm256_set1_epi32(pixel_max as i32);

    // Saturating: a plain `height - 3` underflows for planes with fewer than
    // three rows (panic in debug builds, out-of-bounds row loop in release).
    let bicubic_end = height_val.saturating_sub(3);
    let simd_end = (width_val / 16) * 16;

    // First row: linear interpolation (needs a row below it).
    if height_val > 1 {
        for i in 0..width_val {
            let a = *src.add(i) as u32;
            let b = *src.add(i + pitch_val) as u32;
            *dest.add(i) = ((a + b + 1) / 2) as u16;
        }
    }

    // Middle rows: bicubic interpolation
    for j in 1..bicubic_end {
        let offset = j * pitch_val;
        let src_prev = src.add(offset - pitch_val);
        let src_curr = src.add(offset);
        let src_next = src.add(offset + pitch_val);
        let src_next2 = src.add(offset + pitch_val * 2);
        let dest_row = dest.add(offset);

        let mut i = 0;
        if use_fast {
            while i < simd_end {
                let a = _mm256_loadu_si256(src_prev.add(i).cast::<__m256i>());
                let b = _mm256_loadu_si256(src_curr.add(i).cast::<__m256i>());
                let c = _mm256_loadu_si256(src_next.add(i).cast::<__m256i>());
                let d = _mm256_loadu_si256(src_next2.add(i).cast::<__m256i>());
                let result = apply_bicubic_kernel_u16_fast_avx2(
                    a,
                    b,
                    c,
                    d,
                    w_ab,
                    w_cd,
                    eight,
                    pixel_max_vec,
                );
                _mm256_storeu_si256(dest_row.add(i).cast::<__m256i>(), result);
                i += 16;
            }
        } else {
            while i < simd_end {
                let a = _mm256_loadu_si256(src_prev.add(i).cast::<__m256i>());
                let b = _mm256_loadu_si256(src_curr.add(i).cast::<__m256i>());
                let c = _mm256_loadu_si256(src_next.add(i).cast::<__m256i>());
                let d = _mm256_loadu_si256(src_next2.add(i).cast::<__m256i>());
                // Zero-extend each half to 32 bits so the arithmetic is exact
                // for full 16-bit samples.
                let a_lo = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(a));
                let a_hi = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(a, 1));
                let b_lo = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b));
                let b_hi = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(b, 1));
                let c_lo = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(c));
                let c_hi = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(c, 1));
                let d_lo = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(d));
                let d_hi = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(d, 1));
                let result = apply_bicubic_kernel_u16_exact_avx2(
                    a_lo,
                    a_hi,
                    b_lo,
                    b_hi,
                    c_lo,
                    c_hi,
                    d_lo,
                    d_hi,
                    nine,
                    eight,
                    pixel_max_vec,
                );
                _mm256_storeu_si256(dest_row.add(i).cast::<__m256i>(), result);
                i += 16;
            }
        }

        // Scalar tail for the columns that do not fill a whole vector.
        for i in i..width_val {
            let a = *src_prev.add(i) as i32;
            let b = *src_curr.add(i) as i32;
            let c = *src_next.add(i) as i32;
            let d = *src_next2.add(i) as i32;
            let result = (-(a + d) + (b + c) * 9 + 8) >> 4;
            *dest_row.add(i) = std::cmp::min(pixel_max, std::cmp::max(0, result) as u32) as u16;
        }
    }

    // Second-to-last rows: linear interpolation
    for j in bicubic_end..(height_val - 1) {
        let offset = j * pitch_val;

        for i in 0..width_val {
            let a = *src.add(offset + i) as u32;
            let b = *src.add(offset + i + pitch_val) as u32;
            *dest.add(offset + i) = ((a + b + 1) / 2) as u16;
        }
    }

    // Last row: copy
    let last_offset = (height_val - 1) * pitch_val;
    std::ptr::copy_nonoverlapping(src.add(last_offset), dest.add(last_offset), width_val);
}