use core::convert::TryInto;
use core::simd::prelude::*;
use core::simd::Select;
#[cfg(not(target_arch = "x86_64"))]
use super::paeth::filter_paeth as filter_paeth_chosen;
#[cfg(target_arch = "x86_64")]
use super::paeth::filter_paeth_stbi as filter_paeth_chosen;
#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn paeth_predictor_simd<const SIZE: usize>(
a_i16: Simd<i16, SIZE>,
b_i16: Simd<i16, SIZE>,
c_i16: Simd<i16, SIZE>,
) -> Simd<u8, SIZE> {
let pa = (b_i16 - c_i16).abs();
let pb = (a_i16 - c_i16).abs();
let pc = (a_i16 + b_i16 - c_i16 - c_i16).abs();
let mut nearest = a_i16;
let mut min_dist = pa;
let pb_lt_min = pb.simd_lt(min_dist);
nearest = pb_lt_min.select(b_i16, nearest);
min_dist = pb_lt_min.select(pb, min_dist);
let pc_lt_min = pc.simd_lt(min_dist);
nearest = pc_lt_min.select(c_i16, nearest);
nearest.cast::<u8>()
}
#[cfg(not(target_arch = "aarch64"))]
#[inline(always)]
fn paeth_predictor_simd<const SIZE: usize>(
a_i16: Simd<i16, SIZE>,
b_i16: Simd<i16, SIZE>,
c_i16: Simd<i16, SIZE>,
) -> Simd<u8, SIZE> {
let thresh = c_i16 * Simd::splat(3) - (a_i16 + b_i16);
let lo = a_i16.simd_min(b_i16);
let hi = a_i16.simd_max(b_i16);
let t0 = hi.simd_le(thresh).select(lo, c_i16);
thresh.simd_le(lo).select(hi, t0).cast::<u8>()
}
#[inline(always)]
fn process_paeth_chunk_bpp3_s48(
mut current_a: Simd<u8, 3>,
b_vec: &Simd<u8, 48>,
c_vec_initial: Simd<u8, 3>,
x_out: &mut Simd<u8, 48>,
) -> Simd<u8, 3> {
let x_in = *x_out;
let mut preds = [0u8; 48];
let mut c_vec = b_vec.shift_elements_right::<3>(0u8);
c_vec.as_mut_array()[0..3].copy_from_slice(c_vec_initial.as_array());
macro_rules! process_pixel {
($shift:expr) => {
let a_i16 = current_a.cast::<i16>();
let b_i16 = b_vec.extract::<$shift, 3>().cast::<i16>();
let c_i16 = c_vec.extract::<$shift, 3>().cast::<i16>();
let pred = paeth_predictor_simd(a_i16, b_i16, c_i16);
current_a = x_in.extract::<$shift, 3>() + pred;
preds[$shift..$shift + 3].copy_from_slice(pred.as_array());
};
}
process_pixel!(0);
process_pixel!(3);
process_pixel!(6);
process_pixel!(9);
process_pixel!(12);
process_pixel!(15);
process_pixel!(18);
process_pixel!(21);
process_pixel!(24);
process_pixel!(27);
process_pixel!(30);
process_pixel!(33);
process_pixel!(36);
process_pixel!(39);
process_pixel!(42);
process_pixel!(45);
*x_out += Simd::from_array(preds);
current_a
}
pub fn paeth_unfilter_3bpp(current: &mut [u8], prev: &[u8]) {
const BPP: usize = 3;
const STRIDE_BYTES: usize = 48;
let mut a: Simd<u8, BPP> = Default::default(); let mut c: Simd<u8, BPP> = Default::default();
let mut current_iter = current.chunks_exact_mut(STRIDE_BYTES);
let mut previous_iter = prev.chunks_exact(STRIDE_BYTES);
let combined_iter = (&mut current_iter).zip(&mut previous_iter);
for (chunk, prev_chunk) in combined_iter {
let mut x: Simd<u8, STRIDE_BYTES> = Simd::<u8, STRIDE_BYTES>::from_slice(chunk);
let b: Simd<u8, STRIDE_BYTES> = Simd::<u8, STRIDE_BYTES>::from_slice(prev_chunk);
a = process_paeth_chunk_bpp3_s48(a, &b, c, &mut x);
c = b.extract::<{ STRIDE_BYTES - BPP }, BPP>();
x.copy_to_slice(chunk);
}
let mut a_bpp = a.to_array();
let mut c_bpp = c.to_array();
for (chunk, b_bpp) in current_iter
.into_remainder()
.chunks_exact_mut(BPP)
.zip(previous_iter.remainder().chunks_exact(BPP))
{
let new_chunk = [
chunk[0].wrapping_add(filter_paeth_chosen(
a_bpp[0].into(),
b_bpp[0].into(),
c_bpp[0].into(),
)),
chunk[1].wrapping_add(filter_paeth_chosen(
a_bpp[1].into(),
b_bpp[1].into(),
c_bpp[1].into(),
)),
chunk[2].wrapping_add(filter_paeth_chosen(
a_bpp[2].into(),
b_bpp[2].into(),
c_bpp[2].into(),
)),
];
*TryInto::<&mut [u8; BPP]>::try_into(chunk).unwrap() = new_chunk;
a_bpp = new_chunk;
c_bpp = b_bpp.try_into().unwrap();
}
}
#[inline(always)]
fn process_paeth_chunk_bpp4_s64(
mut current_a: Simd<u8, 4>,
b_vec: &Simd<u8, 64>,
c_vec_initial: Simd<u8, 4>,
x_out: &mut Simd<u8, 64>,
) -> Simd<u8, 4> {
let x_in = *x_out;
let mut preds = [0u8; 64];
let mut c_vec = b_vec.shift_elements_right::<4>(0u8);
c_vec.as_mut_array()[0..4].copy_from_slice(c_vec_initial.as_array());
macro_rules! process_pixel {
($shift:expr) => {
let a_i16 = current_a.cast::<i16>();
let b_i16 = b_vec.extract::<$shift, 4>().cast::<i16>();
let c_i16 = c_vec.extract::<$shift, 4>().cast::<i16>();
let pred = paeth_predictor_simd(a_i16, b_i16, c_i16);
current_a = x_in.extract::<$shift, 4>() + pred;
preds[$shift..$shift + 4].copy_from_slice(pred.as_array());
};
}
process_pixel!(0);
process_pixel!(4);
process_pixel!(8);
process_pixel!(12);
process_pixel!(16);
process_pixel!(20);
process_pixel!(24);
process_pixel!(28);
process_pixel!(32);
process_pixel!(36);
process_pixel!(40);
process_pixel!(44);
process_pixel!(48);
process_pixel!(52);
process_pixel!(56);
process_pixel!(60);
*x_out += Simd::from_array(preds);
current_a
}
pub fn paeth_unfilter_4bpp(row: &mut [u8], prev_row: &[u8]) {
const BPP: usize = 4;
const STRIDE_BYTES: usize = 64;
let mut a: Simd<u8, BPP> = Default::default(); let mut c: Simd<u8, BPP> = Default::default();
let chunks = row.len() / STRIDE_BYTES;
let (simd_row, remainder_row) = row.split_at_mut(chunks * STRIDE_BYTES);
let (simd_prev_row, remainder_prev_row) = prev_row.split_at(chunks * STRIDE_BYTES);
let row_iter = simd_row.chunks_exact_mut(STRIDE_BYTES);
let prev_iter = simd_prev_row.chunks_exact(STRIDE_BYTES);
let combined_iter = row_iter.zip(prev_iter);
for (chunk, prev_chunk) in combined_iter {
let mut x: Simd<u8, STRIDE_BYTES> = Simd::<u8, STRIDE_BYTES>::from_slice(chunk);
let b: Simd<u8, STRIDE_BYTES> = Simd::<u8, STRIDE_BYTES>::from_slice(prev_chunk);
a = process_paeth_chunk_bpp4_s64(a, &b, c, &mut x);
c = b.extract::<{ STRIDE_BYTES - BPP }, BPP>();
x.copy_to_slice(chunk);
}
let mut a_bpp = a.to_array();
let mut c_bpp = c.to_array();
for (chunk, b_bpp) in remainder_row
.chunks_exact_mut(BPP)
.zip(remainder_prev_row.chunks_exact(BPP))
{
let new_chunk = [
chunk[0].wrapping_add(filter_paeth_chosen(
a_bpp[0].into(),
b_bpp[0].into(),
c_bpp[0].into(),
)),
chunk[1].wrapping_add(filter_paeth_chosen(
a_bpp[1].into(),
b_bpp[1].into(),
c_bpp[1].into(),
)),
chunk[2].wrapping_add(filter_paeth_chosen(
a_bpp[2].into(),
b_bpp[2].into(),
c_bpp[2].into(),
)),
chunk[3].wrapping_add(filter_paeth_chosen(
a_bpp[3].into(),
b_bpp[3].into(),
c_bpp[3].into(),
)),
];
*TryInto::<&mut [u8; BPP]>::try_into(chunk).unwrap() = new_chunk;
a_bpp = new_chunk;
c_bpp = b_bpp.try_into().unwrap();
}
}