zoomvtools 1.1.1

#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
mod avx2;
#[cfg(all(target_arch = "x86_64", feature = "avx512"))]
mod avx512;
mod rust;

#[cfg(test)]
mod tests;

use std::num::{NonZeroU8, NonZeroUsize};

use crate::util::Pixel;

/// Horizontally interpolates sub-pixel samples with a bicubic kernel, as used when
/// refining motion estimation to sub-pixel accuracy.
///
/// Every output sample sits between existing pixels of a row and is derived from a
/// 4-tap bicubic filter over 4 horizontal neighbors. Where the full kernel does not
/// fit (the edge pixels), simple averaging is used instead.
///
/// This function only selects a backend; candidates are tried in this order:
/// 1. `avx512` — compiled on x86_64 with the `avx512` feature, taken when
///    `crate::util::has_avx512_skylake()` reports support.
/// 2. `avx2` — compiled on x86_64 with the `avx2` feature, taken when
///    `crate::util::has_avx2()` reports support.
/// 3. `rust` — the portable fallback, always available.
///
/// In debug builds the bit depth is asserted to be greater than
/// `(size_of::<T>() - 1) * 8` and no larger than `size_of::<T>() * 8`.
///
/// # Parameters
/// - `dest`: Destination buffer for interpolated results
/// - `src`: Source image buffer
/// - `pitch`: Number of pixels per row in both buffers
/// - `width`: Width of the image in pixels
/// - `height`: Height of the image in pixels
/// - `bits_per_sample`: Bit depth of the pixel format for clamping
#[inline]
pub fn refine_horizontal_bicubic<T: Pixel>(
    dest: &mut [T],
    src: &[T],
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    bits_per_sample: NonZeroU8,
) {
    // The bit depth must fit `T` and must need every byte of `T`.
    debug_assert!(
        bits_per_sample.get() as usize > (size_of::<T>() - 1) * 8
            && (bits_per_sample.get() as usize <= size_of::<T>() * 8)
    );

    // First choice: AVX-512.
    //
    // PERF(znver4): 3-5% faster than AVX2 for 8-bit
    // PERF(znver4): 5-10% faster than AVX2 for 9-15-bit
    // PERF(znver4): Equivalent to AVX2 for 16-bit
    // PERF(znver5): 3-5% faster than AVX2 for 8-bit
    // PERF(znver5): 5-10% faster than AVX2 for 9-15-bit
    // PERF(znver5): 15% faster than AVX2 for 16-bit
    #[cfg(all(target_arch = "x86_64", feature = "avx512"))]
    {
        if crate::util::has_avx512_skylake() {
            // SAFETY: AVX-512 availability was confirmed by the runtime check above.
            unsafe {
                avx512::refine_horizontal_bicubic(dest, src, pitch, width, height, bits_per_sample);
            }
            return;
        }
    }

    // Second choice: AVX2.
    //
    // PERF: 85% faster than scalar for 8-bit
    // PERF: 70% faster than scalar for 9-15-bit
    // PERF: 60% faster than scalar for 16-bit
    #[cfg(all(target_arch = "x86_64", feature = "avx2"))]
    {
        if crate::util::has_avx2() {
            // SAFETY: AVX2 availability was confirmed by the runtime check above.
            unsafe {
                avx2::refine_horizontal_bicubic(dest, src, pitch, width, height, bits_per_sample);
            }
            return;
        }
    }

    // No SIMD backend was compiled in or supported at runtime: use the portable path.
    rust::refine_horizontal_bicubic(dest, src, pitch, width, height, bits_per_sample);
}

/// Vertically interpolates sub-pixel samples with a bicubic kernel, as used when
/// refining motion estimation to sub-pixel accuracy.
///
/// Every output sample sits between existing pixels of a column and is derived from a
/// 4-tap bicubic filter over 4 vertical neighbors. Where the full kernel does not fit
/// (the edge rows), simple averaging is used instead, and the last row is copied
/// directly from the source.
///
/// This function only selects a backend; candidates are tried in this order:
/// 1. `avx512` — compiled on x86_64 with the `avx512` feature. Taken when
///    `crate::util::has_avx512_znver5()` reports support, or when
///    `crate::util::has_avx512_skylake()` reports support and either the
///    `experimental` feature is enabled, `T` is one byte wide, or the bit depth is
///    below 16.
/// 2. `avx2` — compiled on x86_64 with the `avx2` feature, taken when
///    `crate::util::has_avx2()` reports support.
/// 3. `rust` — the portable fallback, always available.
///
/// In debug builds the bit depth is asserted to be greater than
/// `(size_of::<T>() - 1) * 8` and no larger than `size_of::<T>() * 8`.
///
/// # Parameters
/// - `dest`: Destination buffer for interpolated results
/// - `src`: Source image buffer
/// - `pitch`: Number of pixels per row in both buffers
/// - `width`: Width of the image in pixels
/// - `height`: Height of the image in pixels
/// - `bits_per_sample`: Bit depth of the pixel format for clamping
#[inline]
pub fn refine_vertical_bicubic<T: Pixel>(
    dest: &mut [T],
    src: &[T],
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    bits_per_sample: NonZeroU8,
) {
    // The bit depth must fit `T` and must need every byte of `T`.
    debug_assert!(
        bits_per_sample.get() as usize > (size_of::<T>() - 1) * 8
            && (bits_per_sample.get() as usize <= size_of::<T>() * 8)
    );

    // First choice: AVX-512, subject to per-microarchitecture profitability.
    #[cfg(all(target_arch = "x86_64", feature = "avx512"))]
    {
        // Skylake-level AVX-512 hardware is only allowed to use the kernel when it is
        // known to pay off, unless the `experimental` feature opts in unconditionally.
        //
        // PERF(znver4): 3-5% faster than AVX2 for 8-bit
        // PERF(znver4): 5-10% faster than AVX2 for 9-15-bit
        // TODO(znver4): 10% slower than AVX2 for 16-bit
        let allowed_on_skylake =
            cfg!(feature = "experimental") || size_of::<T>() == 1 || bits_per_sample.get() < 16;

        // PERF(znver5): 15% faster than AVX2 for 8-bit
        // PERF(znver5): 15% faster than AVX2 for 9-15-bit
        // PERF(znver5): 35% faster than AVX2 for 16-bit
        //
        // The znver5 probe runs first; the skylake probe runs only when it fails.
        let use_avx512 = crate::util::has_avx512_znver5()
            || (crate::util::has_avx512_skylake() && allowed_on_skylake);

        if use_avx512 {
            // SAFETY: one of the AVX-512 runtime checks above succeeded.
            unsafe {
                avx512::refine_vertical_bicubic(dest, src, pitch, width, height, bits_per_sample);
            }
            return;
        }
    }

    // Second choice: AVX2.
    //
    // PERF: 80% faster than scalar for 8-bit
    // PERF: 45% faster than scalar for 9-15-bit
    // PERF: 20% faster than scalar for 16-bit
    #[cfg(all(target_arch = "x86_64", feature = "avx2"))]
    {
        if crate::util::has_avx2() {
            // SAFETY: AVX2 availability was confirmed by the runtime check above.
            unsafe {
                avx2::refine_vertical_bicubic(dest, src, pitch, width, height, bits_per_sample);
            }
            return;
        }
    }

    // No SIMD backend was compiled in, supported, or profitable: use the portable path.
    rust::refine_vertical_bicubic(dest, src, pitch, width, height, bits_per_sample);
}