// fast_image_resize 6.0.0

use core::arch::x86_64::*;

use super::native;
use crate::convolution::{Coefficients, CoefficientsChunk};
use crate::pixels::InnerPixel;
use crate::{simd_utils, ImageView, ImageViewMut};

/// Vertically convolves `src_view` into `dst_view` using the SSE4.1 code path.
///
/// `offset` is multiplied by `T::count_of_components()` to obtain the index of
/// the first source component to read in every source row. The rows yielded by
/// `dst_view.iter_rows_mut(0)` are paired with the chunks returned by
/// `coeffs.get_chunks()`; processing stops as soon as either sequence ends.
pub(crate) fn vert_convolution<T>(
    src_view: &impl ImageView<Pixel = T>,
    dst_view: &mut impl ImageViewMut<Pixel = T>,
    offset: u32,
    coeffs: &Coefficients,
) where
    T: InnerPixel<Component = f32>,
{
    let chunks = coeffs.get_chunks();
    let first_component = offset as usize * T::count_of_components();

    dst_view
        .iter_rows_mut(0)
        .zip(chunks)
        .for_each(|(dst_row, chunk)| {
            // SAFETY: the callee is compiled with `target_feature = "sse4.1"`.
            // NOTE(review): nothing here checks for CPU support, so the caller
            // is presumably expected to have verified it — confirm at call sites.
            unsafe {
                vert_convolution_into_one_row_f32(src_view, dst_row, first_component, chunk);
            }
        });
}

/// Fills one destination row with the vertical convolution of `src_view`.
///
/// The destination row is viewed as a flat slice of `f32` components and is
/// consumed in tiers: blocks of 16 components, then blocks of 8, then at most
/// one block of 4, each handled by [`multiply_components_of_rows`]. Whatever is
/// left (fewer than 4 components) is delegated to `native::convolution_by_f32`.
/// `src_x` is the index of the first source component and advances in lock-step
/// with the destination position.
///
/// # Safety
///
/// The function is compiled with `target_feature = "sse4.1"`; the caller must
/// ensure the CPU supports it. The SIMD helper reads source rows without bounds
/// checks starting at `src_x`.
/// NOTE(review): this presumably requires every source row to hold at least as
/// many components past `src_x` as `dst_row` has — confirm against callers.
#[target_feature(enable = "sse4.1")]
unsafe fn vert_convolution_into_one_row_f32<T: InnerPixel<Component = f32>>(
    src_view: &impl ImageView<Pixel = T>,
    dst_row: &mut [T],
    mut src_x: usize,
    coeffs_chunk: CoefficientsChunk,
) {
    // Scratch space shared by all SIMD calls for moving f64 lanes out of a register.
    let mut c_buf = [0f64; 2];
    let components = T::components_mut(dst_row);

    // Tier 1: 16 components per step (eight f64x2 accumulators).
    let mut blocks_of_16 = components.chunks_exact_mut(16);
    for block in blocks_of_16.by_ref() {
        multiply_components_of_rows::<_, 8>(src_view, src_x, coeffs_chunk, block, &mut c_buf);
        src_x += 16;
    }

    // Tier 2: 8 components per step (four accumulators).
    let mut blocks_of_8 = blocks_of_16.into_remainder().chunks_exact_mut(8);
    for block in blocks_of_8.by_ref() {
        multiply_components_of_rows::<_, 4>(src_view, src_x, coeffs_chunk, block, &mut c_buf);
        src_x += 8;
    }

    // Tier 3: fewer than 8 components remain, so there is at most one block of 4.
    let mut blocks_of_4 = blocks_of_8.into_remainder().chunks_exact_mut(4);
    if let Some(block) = blocks_of_4.next() {
        multiply_components_of_rows::<_, 2>(src_view, src_x, coeffs_chunk, block, &mut c_buf);
        src_x += 4;
    }

    // Tail: the last 0..=3 components go through the non-SIMD implementation.
    let tail = blocks_of_4.into_remainder();
    if tail.is_empty() {
        return;
    }
    native::convolution_by_f32(
        src_view,
        tail,
        src_x,
        coeffs_chunk.start,
        coeffs_chunk.values,
    );
}

/// Computes the weighted sum over source rows for `SUMS_COUNT * 2` components
/// and writes the results, narrowed to `f32`, into `dst_chunk`.
///
/// Source rows are taken from `src_view` starting at row `coeffs_chunk.start`,
/// one row per value in `coeffs_chunk.values`. Rows are consumed two at a time;
/// if the number of coefficients is odd, the last one is applied to the next
/// single row. Every row is read from component index `src_x` onwards.
/// Accumulation happens in `f64`, two lanes per `__m128d` accumulator.
/// `c_buf` is scratch storage used to move the two lanes of each accumulator
/// out of the register; on return it holds the lanes of the last accumulator.
///
/// # Safety
///
/// * The CPU must support SSE4.1 (`target_feature = "sse4.1"`).
/// * `src_x` must not exceed the component length of any visited source row:
///   rows are sliced with `get_unchecked`.
/// * `dst_chunk` must have room for `SUMS_COUNT * 2` values: results are
///   written through a raw pointer without bounds checks.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn multiply_components_of_rows<
    T: InnerPixel<Component = f32>,
    const SUMS_COUNT: usize,
>(
    src_view: &impl ImageView<Pixel = T>,
    src_x: usize,
    coeffs_chunk: CoefficientsChunk,
    dst_chunk: &mut [f32],
    c_buf: &mut [f64; 2],
) {
    let first_row = coeffs_chunk.start;
    let weights = coeffs_chunk.values;
    let rows_limit = weights.len() as u32;
    let mut acc = [_mm_set1_pd(0.); SUMS_COUNT];

    // Split the coefficients into pairs plus an optional single leftover.
    let weight_pairs = weights.chunks_exact(2);
    let odd_weight = weight_pairs.remainder();

    // Counts rows actually consumed; the source may yield fewer row pairs
    // than there are coefficient pairs.
    let mut rows_done: u32 = 0;
    let row_pairs = src_view.iter_2_rows(first_row, rows_limit);
    for (row_pair, weight_pair) in row_pairs.zip(weight_pairs) {
        for (&weight, row) in weight_pair.iter().zip(row_pair) {
            let components = T::components(row).get_unchecked(src_x..);
            multiply_components_of_row(&mut acc, weight, components);
        }
        rows_done += 2;
    }

    // Odd number of coefficients: apply the last one to the next single row.
    if let Some(&weight) = odd_weight.first() {
        let mut tail_rows = src_view.iter_rows(first_row + rows_done);
        if let Some(row) = tail_rows.next() {
            let components = T::components(row).get_unchecked(src_x..);
            multiply_components_of_row(&mut acc, weight, components);
        }
    }

    // Spill each accumulator into `c_buf` and narrow both lanes to f32.
    let out = dst_chunk.as_mut_ptr();
    for (idx, lanes) in acc.iter().enumerate() {
        _mm_storeu_pd(c_buf.as_mut_ptr(), *lanes);
        out.add(idx * 2).write(c_buf[0] as f32);
        out.add(idx * 2 + 1).write(c_buf[1] as f32);
    }
}

/// Adds `coeff * src_row[..]` to the running sums.
///
/// Accumulators are processed two at a time. For accumulators `k` and `k + 1`
/// (with `k` even) a four-float vector is obtained from
/// `simd_utils::loadu_ps(src_row, k * 2)`; its lower two floats are widened to
/// `f64` and accumulated into `sums[k]`, its upper two into `sums[k + 1]`.
///
/// `SUMS_COUNT` is expected to be even; with an odd value the access to
/// `sums[k + 1]` in the last step indexes past the end of the array.
///
/// # Safety
///
/// The CPU must support SSE4.1 (`target_feature = "sse4.1"`).
/// NOTE(review): `loadu_ps` presumably reads four consecutive floats starting
/// at the given index without bounds checks, which would require `src_row` to
/// hold at least `SUMS_COUNT * 2` values — confirm in `simd_utils`.
#[inline]
#[target_feature(enable = "sse4.1")]
unsafe fn multiply_components_of_row<const SUMS_COUNT: usize>(
    sums: &mut [__m128d; SUMS_COUNT],
    coeff: f64,
    src_row: &[f32],
) {
    let weight = _mm_set1_pd(coeff);

    let mut lo = 0;
    while lo < SUMS_COUNT {
        let hi = lo + 1;
        let quad = simd_utils::loadu_ps(src_row, lo * 2);

        // Lower half of the vector: floats 0 and 1.
        let lower = _mm_cvtps_pd(quad);
        sums[lo] = _mm_add_pd(sums[lo], _mm_mul_pd(lower, weight));

        // Upper half: move floats 2 and 3 down before widening.
        let upper = _mm_cvtps_pd(_mm_movehl_ps(quad, quad));
        sums[hi] = _mm_add_pd(sums[hi], _mm_mul_pd(upper, weight));

        lo += 2;
    }
}