use crate::color_group::{ColorGroup, ld_g, st_g_mixed};
use crate::filter_weights::FilterWeights;
use crate::mixed_storage::MixedStorage;
use num_traits::{AsPrimitive, Float, MulAdd};
use std::ops::{Add, Mul};
/// Convolves one row of interleaved `CN`-channel pixels with per-pixel filter weights.
///
/// `dst` is viewed as consecutive `CN`-element pixels. Each destination pixel is paired,
/// in order, with one entry of `filter_weights.bounds` and one row of
/// `filter_weights.aligned_size` weights. For that pixel, the first `bounds.size` weights
/// are multiplied with the `bounds.size` source pixels beginning at pixel index
/// `bounds.start` and accumulated in a `ColorGroup<CN, J>`; the accumulator is then
/// written to the destination pixel through `st_g_mixed!`.
///
/// Iteration ends as soon as destination pixels, bounds, or weight rows run out.
/// Trailing elements of `dst` that do not form a whole pixel are left untouched.
///
/// # Type parameters
/// * `T` - element type of `src` and `dst`.
/// * `J` - accumulator type; samples and weights are converted to it before accumulating.
/// * `F` - element type of the stored weights.
/// * `CN` - number of interleaved channels per pixel.
///
/// # Parameters
/// * `bit_depth` - forwarded unchanged to `st_g_mixed!`; its effect on the stored value is
///   defined by that macro, which is not visible from here.
///
/// # Panics
/// Panics if a source window `[start * CN, (start + size) * CN)` does not fit in `src`,
/// if `bounds.size` exceeds `filter_weights.aligned_size`, or if
/// `filter_weights.aligned_size` is zero.
pub(crate) fn convolve_row_handler_floating_point<
    T: Copy + 'static + AsPrimitive<J> + Default,
    J: Copy
        + 'static
        + AsPrimitive<T>
        + MulAdd<J, Output = J>
        + Mul<J, Output = J>
        + Add<J, Output = J>
        + Default
        + MixedStorage<T>,
    F: Copy + 'static + Float + AsPrimitive<J>,
    const CN: usize,
>(
    src: &[T],
    dst: &mut [T],
    filter_weights: &FilterWeights<F>,
    bit_depth: u32,
) where
    i32: AsPrimitive<J>,
{
    // Only whole pixels are written; any remainder shorter than `CN` is ignored.
    let (dst_pixels, _) = dst.as_chunks_mut::<CN>();
    let kernel_rows = filter_weights
        .weights
        .chunks_exact(filter_weights.aligned_size);
    let per_pixel = dst_pixels
        .iter_mut()
        .zip(filter_weights.bounds.iter())
        .zip(kernel_rows);

    for ((out_px, &span), kernel) in per_pixel {
        let mut acc = ColorGroup::<CN, J>::dup(0.as_());

        // Source window contributing to this output pixel, in element units.
        let first = span.start * CN;
        let window = &src[first..(first + span.size * CN)];
        let (taps, _) = window.as_chunks::<CN>();

        // Each weight row is padded to `aligned_size`; only the first `span.size` are used.
        for (&coeff, tap) in kernel[..span.size].iter().zip(taps.iter()) {
            let w: J = coeff.as_();
            let sample = ld_g!(tap, CN, J);
            acc = acc.mul_add(sample, w);
        }

        st_g_mixed!(acc, out_px, CN, bit_depth);
    }
}
/// Convolves four rows of interleaved `CN`-channel pixels at once with shared weights.
///
/// The four source rows start at element offsets `0`, `src_stride`, `2 * src_stride` and
/// `3 * src_stride` of `src`. `dst` is split into three rows of `dst_stride` elements,
/// with everything after them treated as the fourth row. For every output column, the
/// matching entry of `filter_weights.bounds` and row of `filter_weights.aligned_size`
/// weights are applied to all four source rows: each of the first `bounds.size` weights
/// is converted to `J` once and multiplied with the corresponding pixel of every row.
/// The four accumulators are then written through `st_g_mixed!`.
///
/// Iteration ends as soon as any destination row, the bounds, or the weight rows run out.
/// Trailing elements of a destination row that do not form a whole pixel are left
/// untouched.
///
/// # Type parameters
/// * `T` - element type of `src` and `dst`.
/// * `J` - accumulator type; samples and weights are converted to it before accumulating.
/// * `F` - element type of the stored weights.
/// * `CN` - number of interleaved channels per pixel.
///
/// # Parameters
/// * `src_stride` / `dst_stride` - distance, in elements, between consecutive rows.
/// * `bit_depth` - forwarded unchanged to `st_g_mixed!`; its effect on the stored value is
///   defined by that macro, which is not visible from here.
///
/// # Panics
/// Panics if `dst` holds fewer than `3 * dst_stride` elements, if any of the four source
/// windows does not fit in `src`, if `bounds.size` exceeds `filter_weights.aligned_size`,
/// or if `filter_weights.aligned_size` is zero.
pub(crate) fn convolve_row_handler_floating_point_4<
    T: Copy + 'static + AsPrimitive<J> + Default,
    J: Copy
        + 'static
        + AsPrimitive<T>
        + MulAdd<J, Output = J>
        + Mul<J, Output = J>
        + Add<J, Output = J>
        + Default
        + MixedStorage<T>,
    F: Copy + 'static + Float + AsPrimitive<J>,
    const CN: usize,
>(
    src: &[T],
    src_stride: usize,
    dst: &mut [T],
    dst_stride: usize,
    filter_weights: &FilterWeights<F>,
    bit_depth: u32,
) where
    i32: AsPrimitive<J>,
{
    // Carve the destination into four disjoint mutable rows.
    let (out_row0, tail) = dst.split_at_mut(dst_stride);
    let (out_row1, tail) = tail.split_at_mut(dst_stride);
    let (out_row2, out_row3) = tail.split_at_mut(dst_stride);

    // Only whole pixels are written; any remainder shorter than `CN` is ignored.
    let (px_row0, _) = out_row0.as_chunks_mut::<CN>();
    let (px_row1, _) = out_row1.as_chunks_mut::<CN>();
    let (px_row2, _) = out_row2.as_chunks_mut::<CN>();
    let (px_row3, _) = out_row3.as_chunks_mut::<CN>();

    let kernel_rows = filter_weights
        .weights
        .chunks_exact(filter_weights.aligned_size);
    let columns = px_row0
        .iter_mut()
        .zip(px_row1.iter_mut())
        .zip(px_row2.iter_mut())
        .zip(px_row3.iter_mut())
        .zip(filter_weights.bounds.iter())
        .zip(kernel_rows);

    for (((((out0, out1), out2), out3), &span), kernel) in columns {
        let mut acc0 = ColorGroup::<CN, J>::dup(0.as_());
        let mut acc1 = ColorGroup::<CN, J>::dup(0.as_());
        let mut acc2 = ColorGroup::<CN, J>::dup(0.as_());
        let mut acc3 = ColorGroup::<CN, J>::dup(0.as_());

        // Same horizontal window in each of the four source rows, in element units.
        let base = span.start * CN;
        let window0 = &src[base..(base + span.size * CN)];
        let off1 = base + src_stride;
        let window1 = &src[off1..(off1 + span.size * CN)];
        let off2 = base + src_stride * 2;
        let window2 = &src[off2..(off2 + span.size * CN)];
        let off3 = base + src_stride * 3;
        let window3 = &src[off3..(off3 + span.size * CN)];

        let (taps0, _) = window0.as_chunks::<CN>();
        let (taps1, _) = window1.as_chunks::<CN>();
        let (taps2, _) = window2.as_chunks::<CN>();
        let (taps3, _) = window3.as_chunks::<CN>();

        // Each weight row is padded to `aligned_size`; only the first `span.size` are used.
        let taps = kernel[..span.size]
            .iter()
            .zip(taps0.iter())
            .zip(taps1.iter())
            .zip(taps2.iter())
            .zip(taps3.iter());

        for ((((&coeff, tap0), tap1), tap2), tap3) in taps {
            // Convert the weight once and reuse it for all four rows.
            let w: J = coeff.as_();
            let sample0 = ld_g!(tap0, CN, J);
            let sample1 = ld_g!(tap1, CN, J);
            let sample2 = ld_g!(tap2, CN, J);
            let sample3 = ld_g!(tap3, CN, J);
            acc0 = acc0.mul_add(sample0, w);
            acc1 = acc1.mul_add(sample1, w);
            acc2 = acc2.mul_add(sample2, w);
            acc3 = acc3.mul_add(sample3, w);
        }

        st_g_mixed!(acc0, out0, CN, bit_depth);
        st_g_mixed!(acc1, out1, CN, bit_depth);
        st_g_mixed!(acc2, out2, CN, bit_depth);
        st_g_mixed!(acc3, out3, CN, bit_depth);
    }
}