#[cfg(all(target_arch = "aarch64", feature = "neon"))]
use crate::stackblur::neon::{
HorizontalNeonStackBlurPassFloat16, VerticalNeonStackBlurPassFloat16,
};
#[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
use crate::stackblur::sse::{HorizontalSseStackBlurPassFloat16, VerticalSseStackBlurPassFloat16};
use crate::stackblur::{HorizontalStackBlurPass, StackBlurWorkingPass, VerticalStackBlurPass};
use crate::unsafe_slice::UnsafeSlice;
use crate::{AnisotropicRadius, BlurError, BlurImageMut, FastBlurChannels, ThreadingPolicy};
use core::f16;
/// Runs the horizontal stack blur pass for one worker over `slice`.
///
/// An executor is chosen per channel layout and then invoked exactly once:
///
/// * the default is the generic `HorizontalStackBlurPass::<f16, f32, f32, N>`;
/// * on x86/x86_64 builds with the `sse` feature it is replaced by
///   `HorizontalSseStackBlurPassFloat16` when the running CPU reports both
///   `sse4.1` and `f16c`;
/// * on aarch64 builds with the `neon` feature it is always replaced by
///   `HorizontalNeonStackBlurPassFloat16` (no runtime check is performed).
///
/// `stride`, `width`, `height`, `radius`, `thread` and `thread_count` are
/// forwarded unchanged to the executor's `pass` method.
fn stack_blur_worker_horizontal(
    slice: &UnsafeSlice<f16>,
    stride: u32,
    width: u32,
    height: u32,
    radius: u32,
    channels: FastBlurChannels,
    thread: usize,
    thread_count: usize,
) {
    // Probe the CPU once, and only in builds that can act on the answer.
    // This replaces an unconditional `f16c` probe whose result was never read.
    #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
    let use_sse = std::arch::is_x86_feature_detected!("sse4.1")
        && std::arch::is_x86_feature_detected!("f16c");

    match channels {
        FastBlurChannels::Plane => {
            // The leading underscore keeps `mut` quiet in builds where no
            // SIMD override is compiled in.
            let mut _executor: Box<dyn StackBlurWorkingPass<f16, 1>> =
                Box::new(HorizontalStackBlurPass::<f16, f32, f32, 1>::default());
            #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
            if use_sse {
                _executor = Box::new(HorizontalSseStackBlurPassFloat16::<1>::default());
            }
            #[cfg(all(target_arch = "aarch64", feature = "neon"))]
            {
                _executor = Box::new(HorizontalNeonStackBlurPassFloat16::<1>::default());
            }
            _executor.pass(slice, stride, width, height, radius, thread, thread_count);
        }
        FastBlurChannels::Channels3 => {
            let mut _executor: Box<dyn StackBlurWorkingPass<f16, 3>> =
                Box::new(HorizontalStackBlurPass::<f16, f32, f32, 3>::default());
            #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
            if use_sse {
                _executor = Box::new(HorizontalSseStackBlurPassFloat16::<3>::default());
            }
            #[cfg(all(target_arch = "aarch64", feature = "neon"))]
            {
                _executor = Box::new(HorizontalNeonStackBlurPassFloat16::<3>::default());
            }
            _executor.pass(slice, stride, width, height, radius, thread, thread_count);
        }
        FastBlurChannels::Channels4 => {
            let mut _executor: Box<dyn StackBlurWorkingPass<f16, 4>> =
                Box::new(HorizontalStackBlurPass::<f16, f32, f32, 4>::default());
            #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
            if use_sse {
                _executor = Box::new(HorizontalSseStackBlurPassFloat16::<4>::default());
            }
            #[cfg(all(target_arch = "aarch64", feature = "neon"))]
            {
                _executor = Box::new(HorizontalNeonStackBlurPassFloat16::<4>::default());
            }
            _executor.pass(slice, stride, width, height, radius, thread, thread_count);
        }
    }
}
/// Runs the vertical stack blur pass for one worker over `slice`.
///
/// For each channel layout the generic
/// `VerticalStackBlurPass::<f16, f32, f32, N>` is the starting choice. It is
/// swapped for `VerticalSseStackBlurPassFloat16` on x86/x86_64 builds with the
/// `sse` feature when the CPU reports `sse4.1` and `f16c`, and for
/// `VerticalNeonStackBlurPassFloat16` on aarch64 builds with the `neon`
/// feature (unconditionally). The selected implementation's `pass` is then
/// called once with the remaining arguments forwarded as-is.
fn stack_blur_worker_vertical(
    slice: &UnsafeSlice<f16>,
    stride: u32,
    width: u32,
    height: u32,
    radius: u32,
    channels: FastBlurChannels,
    thread: usize,
    thread_count: usize,
) {
    // Single runtime capability check shared by every channel layout below.
    #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
    let sse_f16_ready = std::arch::is_x86_feature_detected!("sse4.1")
        && std::arch::is_x86_feature_detected!("f16c");

    match channels {
        FastBlurChannels::Plane => {
            let mut _runner: Box<dyn StackBlurWorkingPass<f16, 1>> =
                Box::new(VerticalStackBlurPass::<f16, f32, f32, 1>::default());
            #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
            if sse_f16_ready {
                _runner = Box::new(VerticalSseStackBlurPassFloat16::<1>::default());
            }
            #[cfg(all(target_arch = "aarch64", feature = "neon"))]
            {
                _runner = Box::new(VerticalNeonStackBlurPassFloat16::<1>::default());
            }
            _runner.pass(slice, stride, width, height, radius, thread, thread_count);
        }
        FastBlurChannels::Channels3 => {
            let mut _runner: Box<dyn StackBlurWorkingPass<f16, 3>> =
                Box::new(VerticalStackBlurPass::<f16, f32, f32, 3>::default());
            #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
            if sse_f16_ready {
                _runner = Box::new(VerticalSseStackBlurPassFloat16::<3>::default());
            }
            #[cfg(all(target_arch = "aarch64", feature = "neon"))]
            {
                _runner = Box::new(VerticalNeonStackBlurPassFloat16::<3>::default());
            }
            _runner.pass(slice, stride, width, height, radius, thread, thread_count);
        }
        FastBlurChannels::Channels4 => {
            let mut _runner: Box<dyn StackBlurWorkingPass<f16, 4>> =
                Box::new(VerticalStackBlurPass::<f16, f32, f32, 4>::default());
            #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
            if sse_f16_ready {
                _runner = Box::new(VerticalSseStackBlurPassFloat16::<4>::default());
            }
            #[cfg(all(target_arch = "aarch64", feature = "neon"))]
            {
                _runner = Box::new(VerticalNeonStackBlurPassFloat16::<4>::default());
            }
            _runner.pass(slice, stride, width, height, radius, thread, thread_count);
        }
    }
}
/// Applies a stack blur to an `f16` image in place.
///
/// The image layout is validated first, then a horizontal pass using
/// `radius.x_axis` is run over the whole buffer, followed by a vertical pass
/// using `radius.y_axis`. The radius is first passed through
/// `radius.clamp(1, 1449)`.
///
/// The number of workers comes from `threading_policy.thread_count(width, height)`.
/// With exactly one worker both passes run directly on the calling thread;
/// otherwise each pass is dispatched through a `novtb::ThreadPool` of that size.
///
/// # Errors
///
/// Returns whatever error `image.check_layout(None)` reports.
#[cfg(feature = "nightly_f16")]
#[cfg_attr(docsrs, doc(cfg(feature = "nightly_f16")))]
pub fn stack_blur_f16(
    image: &mut BlurImageMut<f16>,
    radius: AnisotropicRadius,
    threading_policy: ThreadingPolicy,
) -> Result<(), BlurError> {
    image.check_layout(None)?;

    let radius = radius.clamp(1, 1449);
    let stride = image.row_stride();
    let (width, height) = (image.width, image.height);
    let channels = image.channels;
    // Same cast chain as before: narrow to u32 first, then widen for use as a count.
    let workers = threading_policy.thread_count(width, height) as u32 as usize;

    // Both passes operate on the same backing buffer.
    let slice = UnsafeSlice::new(image.data.borrow_mut());

    if workers == 1 {
        // Single worker: no pool, run both passes inline.
        stack_blur_worker_horizontal(&slice, stride, width, height, radius.x_axis, channels, 0, 1);
        stack_blur_worker_vertical(&slice, stride, width, height, radius.y_axis, channels, 0, 1);
    } else {
        let pool = novtb::ThreadPool::new(workers);
        // The horizontal dispatch is issued before the vertical one.
        pool.parallel_for(|thread_index| {
            stack_blur_worker_horizontal(
                &slice,
                stride,
                width,
                height,
                radius.x_axis,
                channels,
                thread_index,
                workers,
            );
        });
        pool.parallel_for(|thread_index| {
            stack_blur_worker_vertical(
                &slice,
                stride,
                width,
                height,
                radius.y_axis,
                channels,
                thread_index,
                workers,
            );
        });
    }

    Ok(())
}