#[cfg(feature = "nightly_f16")]
use core::f16;
use num_traits::Float;
use crate::channels_configuration::FastBlurChannels;
use crate::edge_mode::clamp_edge;
#[cfg(all(target_arch = "aarch64", feature = "neon", feature = "nightly_f16"))]
use crate::neon::{fg_horizontal_pass_neon_f16, fg_vertical_pass_neon_f16};
#[cfg(all(target_arch = "aarch64", feature = "neon"))]
use crate::neon::{
fg_horizontal_pass_neon_f32, fg_horizontal_pass_neon_u8, fg_vertical_pass_neon_f32,
fg_vertical_pass_neon_u8,
};
use crate::primitives::PrimitiveCast;
#[cfg(all(
any(target_arch = "x86_64", target_arch = "x86"),
feature = "sse",
feature = "nightly_f16"
))]
use crate::sse::{fg_horizontal_pass_sse_f16, fg_vertical_pass_sse_f16};
#[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
use crate::sse::{
fg_horizontal_pass_sse_f32, fg_horizontal_pass_sse_u8, fg_vertical_pass_sse_f32,
fg_vertical_pass_sse_u8,
};
use crate::threading_policy::ThreadingPolicy;
use crate::to_storage::ToStorage;
use crate::unsafe_slice::UnsafeSlice;
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
use crate::wasm32::{fg_horizontal_pass_wasm_u8, fg_vertical_pass_wasm_u8};
use crate::{AnisotropicRadius, BlurError, BlurImageMut, EdgeMode, EdgeMode2D};
/// Radius threshold used by the dispatchers in this file: the scalar passes are
/// instantiated with the narrower intermediate types (`i32`/`f32` or `f32`/`f32`)
/// when the radius is below this value and with `i64`/`f64` or `f64`/`f64`
/// otherwise. Several SIMD paths are also only selected below it.
const BASE_RADIUS_I64_CUTOFF: u32 = 180;
/// Adjusts a running difference for an output position that lies inside the image.
///
/// Expands to `dif += buffer[arr_index] - 2 * buffer[d_arr_index]`. The constant `2`
/// is converted to the accumulator type `J`, which must be in scope where the macro
/// is expanded.
macro_rules! update_differences_inside {
    ($dif_r:expr, $buffer_r:expr, $arr_index:expr, $d_arr_index:expr) => {{
        let doubling: J = 2i32.cast_();
        // SAFETY (upheld by the caller): the call sites visible in this file mask both
        // indices with `& 1023` and pass 1024-element buffers.
        let trailing = unsafe { *$buffer_r.get_unchecked($arr_index) };
        let centre = unsafe { *$buffer_r.get_unchecked($d_arr_index) };
        $dif_r += trailing - doubling * centre;
    }};
}
/// Adjusts a running difference while the output position is still before the
/// first pixel: expands to `dif -= 2 * buffer[arr_index]`.
///
/// The accumulator type `J` must be in scope where the macro is expanded.
macro_rules! update_differences_out {
    ($dif:expr, $buffer:expr, $arr_index:expr) => {{
        let doubling: J = 2i32.cast_();
        // SAFETY (upheld by the caller): the call sites visible in this file mask the
        // index with `& 1023` and pass 1024-element buffers.
        let remembered = unsafe { *$buffer.get_unchecked($arr_index) };
        $dif -= doubling * remembered;
    }};
}
/// Feeds one freshly read sample into the running state.
///
/// Reads `bytes[bytes_offset]`, converts it to the accumulator type `J` (which must
/// be in scope at the expansion site), adds it to the running difference, adds the
/// updated difference to the running sum, and stores the sample at
/// `buffer[arr_index]`.
macro_rules! update_sum_in {
    ($bytes:expr, $bytes_offset:expr, $dif:expr, $sum:expr, $buffer:expr, $arr_index:expr) => {{
        let incoming: J = $bytes[$bytes_offset].cast_();
        $dif += incoming;
        $sum += $dif;
        // SAFETY (upheld by the caller): the call sites visible in this file mask the
        // index with `& 1023` and pass 1024-element buffers.
        let slot = unsafe { $buffer.get_unchecked_mut($arr_index) };
        *slot = incoming;
    }};
}
/// Scales a running sum and stores it as an output sample.
///
/// The sum is converted to the float type `M`, multiplied by `weight`, converted to
/// the storage type `T` through `to_()`, and written to `bytes` at `bytes_offset`.
/// `M` and `T` must be in scope where the macro is expanded.
macro_rules! write_out_blurred {
    ($sum:expr, $weight:expr, $bytes:expr, $bytes_offset:expr) => {{
        let scaled: M = {
            let widened: M = $sum.cast_();
            widened * $weight
        };
        let stored: T = scaled.to_();
        // The `write` call is `unsafe`; the macro does not itself check the offset.
        unsafe {
            $bytes.write($bytes_offset, stored);
        }
    }};
}
/// Picks the `fast_gaussian_impl` instantiation that matches the channel layout
/// (1, 3 or 4 interleaved channels) for `$store_type` and calls it.
///
/// Note the argument order: the macro takes the edge mode third, but hands it to
/// `fast_gaussian_impl` as the last argument.
macro_rules! impl_generic_call {
    ($store_type:ty, $channels_type:expr, $edge_mode:expr,
     $bytes:expr, $stride:expr, $width:expr, $height:expr,
     $radius:expr, $threading_policy:expr) => {
        let blur_entry = match $channels_type {
            FastBlurChannels::Plane => fast_gaussian_impl::<$store_type, 1>,
            FastBlurChannels::Channels3 => fast_gaussian_impl::<$store_type, 3>,
            FastBlurChannels::Channels4 => fast_gaussian_impl::<$store_type, 4>,
        };
        blur_entry(
            $bytes,
            $stride,
            $width,
            $height,
            $radius,
            $threading_policy,
            $edge_mode,
        );
    };
}
/// Entry macro used by the public `fast_gaussian*` functions.
///
/// It takes the same nine arguments as `impl_generic_call!` and forwards each of
/// them unchanged, in the same order; it adds no behavior of its own.
macro_rules! impl_margin_call {
($store_type:ty, $channels_type:expr, $edge_mode:expr,
$bytes:expr, $stride:expr, $width:expr, $height:expr,
$radius:expr, $threading_policy:expr) => {
impl_generic_call!(
$store_type,
$channels_type,
$edge_mode,
$bytes,
$stride,
$width,
$height,
$radius,
$threading_policy
);
};
}
/// Starting value of the running sums in the blur passes, chosen per storage type.
trait InitialValue {
    /// Returns the value every running sum starts from for the given `radius`.
    fn get_initial(radius: usize) -> i64;
}

/// Floating-point storage starts its sums at zero.
impl InitialValue for f32 {
    fn get_initial(_radius: usize) -> i64 {
        0
    }
}

/// Floating-point storage starts its sums at zero.
#[cfg(feature = "nightly_f16")]
impl InitialValue for f16 {
    fn get_initial(_radius: usize) -> i64 {
        0
    }
}

/// Floating-point storage starts its sums at zero.
impl InitialValue for f64 {
    fn get_initial(_radius: usize) -> i64 {
        0
    }
}

/// Integer storage starts its sums at `radius² / 2` (integer division).
// NOTE(review): the passes scale sums by `1 / radius²`, so this is presumably a
// round-to-nearest bias — confirm against the `ToStorage` conversion.
impl InitialValue for u8 {
    fn get_initial(radius: usize) -> i64 {
        let squared = radius * radius;
        (squared / 2) as i64
    }
}

/// Integer storage starts its sums at `radius² / 2` (integer division).
impl InitialValue for u16 {
    fn get_initial(radius: usize) -> i64 {
        let squared = radius * radius;
        (squared / 2) as i64
    }
}
/// Scalar (non-SIMD) vertical blur pass over columns `start..min(end, width)`,
/// reading from and writing to `bytes` in place.
///
/// Type parameters: `T` is the stored sample type, `J` the accumulator type the
/// samples are cast to, `M` the float type used to scale a sum back to `T`, and
/// `CN` the number of interleaved channels per pixel (up to four are handled).
///
/// For every column the loop walks `y` from `-2 * radius` to `height - 1`. Each
/// step reads the sample `radius` rows ahead (row index resolved by `clamp_edge!`
/// with `edge_mode`), adds it to a running difference, adds that difference to a
/// running sum and remembers the sample in a 1024-entry ring buffer. Once `y >= 0`
/// the current sum multiplied by `1 / radius²` is written to row `y`, and the
/// difference is corrected with the remembered samples for rows `y - radius` and `y`.
///
/// `stride` is the distance between consecutive rows, measured in elements of `T`.
///
/// NOTE(review): the ring buffers are indexed modulo 1024 and the live window spans
/// `y - radius ..= y + radius`, so a radius above 512 would make entries alias —
/// confirm that every caller bounds the radius accordingly.
fn fg_vertical_pass<T, J, M, const CN: usize>(
bytes: &UnsafeSlice<T>,
stride: u32,
width: u32,
height: u32,
radius: u32,
start: u32,
end: u32,
edge_mode: EdgeMode,
) where
T: std::ops::AddAssign
+ 'static
+ std::ops::SubAssign
+ Copy
+ Default
+ PrimitiveCast<J>
+ InitialValue,
J: Copy
+ Default
+ std::ops::Mul<Output = J>
+ std::ops::Sub<Output = J>
+ std::ops::Add<Output = J>
+ std::ops::AddAssign
+ std::ops::SubAssign
+ PrimitiveCast<M>,
M: Copy + std::ops::Mul<Output = M> + PrimitiveCast<T> + Float + ToStorage<T>,
i32: PrimitiveCast<J>,
i64: PrimitiveCast<J>,
f64: PrimitiveCast<M>,
{
// One ring buffer of remembered samples per channel, reused for every column.
let zero_j: J = PrimitiveCast::cast_(0i32);
let mut buffer_r = [zero_j; 1024];
let mut buffer_g = [zero_j; 1024];
let mut buffer_b = [zero_j; 1024];
let mut buffer_a = [zero_j; 1024];
let radius_64 = radius as i64;
let height_wide = height as i64;
// Per-type starting value of the sums and the factor applied to every written sum.
let initial: J = PrimitiveCast::cast_(T::get_initial(radius as usize));
let weight = PrimitiveCast::cast_(1f64 / (radius as f64 * radius as f64));
for x in start..width.min(end) {
// Running state is reset for every column.
let mut dif_r: J = 0i32.cast_();
let mut sum_r: J = initial;
let mut dif_g: J = 0i32.cast_();
let mut sum_g: J = initial;
let mut dif_b: J = 0i32.cast_();
let mut sum_b: J = initial;
let mut dif_a: J = 0i32.cast_();
let mut sum_a: J = initial;
let current_px = (x * CN as u32) as usize;
// Start 2 * radius rows above the image so the sums are primed before row 0.
let start_y = 0 - 2 * radius as i64;
for y in start_y..height_wide {
if y >= 0 {
// Inside the image: emit the blurred sample for row `y` first...
let current_y = (y * (stride as i64)) as usize;
let bytes_offset = current_y + current_px;
write_out_blurred!(sum_r, weight, bytes, bytes_offset);
if CN > 1 {
write_out_blurred!(sum_g, weight, bytes, bytes_offset + 1);
}
if CN > 2 {
write_out_blurred!(sum_b, weight, bytes, bytes_offset + 2);
}
if CN == 4 {
write_out_blurred!(sum_a, weight, bytes, bytes_offset + 3);
}
// ...then correct the difference with the samples remembered for rows
// `y - radius` and `y`.
let arr_index = ((y - radius_64) & 1023) as usize;
let d_arr_index = (y & 1023) as usize;
update_differences_inside!(dif_r, buffer_r, arr_index, d_arr_index);
if CN > 1 {
update_differences_inside!(dif_g, buffer_g, arr_index, d_arr_index);
}
if CN > 2 {
update_differences_inside!(dif_b, buffer_b, arr_index, d_arr_index);
}
if CN == 4 {
update_differences_inside!(dif_a, buffer_a, arr_index, d_arr_index);
}
} else if y + radius_64 >= 0 {
// Within `radius` rows above the image: only the doubled sample remembered
// for `y` is subtracted.
let arr_index = (y & 1023) as usize;
update_differences_out!(dif_r, buffer_r, arr_index);
if CN > 1 {
update_differences_out!(dif_g, buffer_g, arr_index);
}
if CN > 2 {
update_differences_out!(dif_b, buffer_b, arr_index);
}
if CN == 4 {
update_differences_out!(dif_a, buffer_a, arr_index);
}
}
// Always pull in the sample `radius` rows ahead; `clamp_edge!` maps rows outside
// the image according to `edge_mode`.
let next_row_y =
clamp_edge!(edge_mode, y + radius_64, 0i64, height_wide) * (stride as usize);
let next_row_x = (x * CN as u32) as usize;
let px_idx = next_row_y + next_row_x;
let arr_index = ((y + radius_64) & 1023) as usize;
update_sum_in!(bytes, px_idx, dif_r, sum_r, buffer_r, arr_index);
if CN > 1 {
update_sum_in!(bytes, px_idx + 1, dif_g, sum_g, buffer_g, arr_index);
}
if CN > 2 {
update_sum_in!(bytes, px_idx + 2, dif_b, sum_b, buffer_b, arr_index);
}
if CN == 4 {
update_sum_in!(bytes, px_idx + 3, dif_a, sum_a, buffer_a, arr_index);
}
}
}
}
/// Scalar (non-SIMD) horizontal blur pass over rows `start..min(end, height)`,
/// reading from and writing to `bytes` in place.
///
/// Type parameters: `T` is the stored sample type, `J` the accumulator type the
/// samples are cast to, `M` the float type used to scale a sum back to `T`, and
/// `CN` the number of interleaved channels per pixel (up to four are handled).
///
/// For every row the loop walks `x` from `-2 * radius` to `width - 1`. Each step
/// reads the pixel `radius` columns ahead (column index resolved by `clamp_edge!`
/// with `edge_mode`), adds it to a running difference, adds that difference to a
/// running sum and remembers the sample in a 1024-entry ring buffer. Once `x >= 0`
/// the current sum multiplied by `1 / radius²` is written to pixel `x`, and the
/// difference is corrected with the remembered samples for `x - radius` and `x`.
///
/// `stride` is the distance between consecutive rows, measured in elements of `T`.
///
/// NOTE(review): the ring buffers are indexed modulo 1024 and the live window spans
/// `x - radius ..= x + radius`, so a radius above 512 would make entries alias —
/// confirm that every caller bounds the radius accordingly.
fn fg_horizontal_pass<T, J, M, const CN: usize>(
bytes: &UnsafeSlice<T>,
stride: u32,
width: u32,
height: u32,
radius: u32,
start: u32,
end: u32,
edge_mode: EdgeMode,
) where
T: std::ops::AddAssign
+ 'static
+ std::ops::SubAssign
+ Copy
+ Default
+ PrimitiveCast<J>
+ InitialValue,
J: Copy
+ Default
+ std::ops::Mul<Output = J>
+ std::ops::Sub<Output = J>
+ std::ops::Add<Output = J>
+ std::ops::AddAssign
+ std::ops::SubAssign
+ PrimitiveCast<M>,
M: Copy + std::ops::Mul<Output = M> + PrimitiveCast<T> + Float + ToStorage<T>,
i32: PrimitiveCast<J>,
f64: PrimitiveCast<M>,
i64: PrimitiveCast<J>,
{
// One ring buffer of remembered samples per channel, reused for every row.
let mut buffer_r = [0i32.cast_(); 1024];
let mut buffer_g = [0i32.cast_(); 1024];
let mut buffer_b = [0i32.cast_(); 1024];
let mut buffer_a = [0i32.cast_(); 1024];
let radius_64 = radius as i64;
let width_wide = width as i64;
// Factor applied to every written sum and the per-type starting value of the sums.
let weight: M = (1f64 / (radius as f64 * radius as f64)).cast_();
let initial: J = T::get_initial(radius as usize).cast_();
for y in start..height.min(end) {
// Running state is reset for every row.
let mut dif_r: J = 0i32.cast_();
let mut sum_r: J = initial;
let mut dif_g: J = 0i32.cast_();
let mut sum_g: J = initial;
let mut dif_b: J = 0i32.cast_();
let mut sum_b: J = initial;
let mut dif_a: J = 0i32.cast_();
let mut sum_a: J = initial;
let current_y = ((y as i64) * (stride as i64)) as usize;
// Start 2 * radius pixels left of the image so the sums are primed before x = 0.
let start_x = 0 - 2 * radius_64;
for x in start_x..(width as i64) {
if x >= 0 {
// Inside the image: emit the blurred pixel at `x` first...
let current_px = (x * CN as i64) as usize;
let bytes_offset = current_y + current_px;
write_out_blurred!(sum_r, weight, bytes, bytes_offset);
if CN > 1 {
write_out_blurred!(sum_g, weight, bytes, bytes_offset + 1);
}
if CN > 2 {
write_out_blurred!(sum_b, weight, bytes, bytes_offset + 2);
}
if CN == 4 {
write_out_blurred!(sum_a, weight, bytes, bytes_offset + 3);
}
// ...then correct the difference with the samples remembered for
// `x - radius` and `x`.
let arr_index = ((x - radius_64) & 1023) as usize;
let d_arr_index = (x & 1023) as usize;
update_differences_inside!(dif_r, buffer_r, arr_index, d_arr_index);
if CN > 1 {
update_differences_inside!(dif_g, buffer_g, arr_index, d_arr_index);
}
if CN > 2 {
update_differences_inside!(dif_b, buffer_b, arr_index, d_arr_index);
}
if CN == 4 {
update_differences_inside!(dif_a, buffer_a, arr_index, d_arr_index);
}
} else if x + radius_64 >= 0 {
// Within `radius` pixels left of the image: only the doubled sample remembered
// for `x` is subtracted.
let arr_index = (x & 1023) as usize;
update_differences_out!(dif_r, buffer_r, arr_index);
if CN > 1 {
update_differences_out!(dif_g, buffer_g, arr_index);
}
if CN > 2 {
update_differences_out!(dif_b, buffer_b, arr_index);
}
if CN == 4 {
update_differences_out!(dif_a, buffer_a, arr_index);
}
}
// Always pull in the pixel `radius` columns ahead; `clamp_edge!` maps columns
// outside the image according to `edge_mode`.
let next_row_y = (y as usize) * (stride as usize);
let next_row_x = clamp_edge!(edge_mode, x + radius_64, 0, width_wide) * CN;
let bytes_offset = next_row_y + next_row_x;
let arr_index = ((x + radius_64) & 1023) as usize;
update_sum_in!(bytes, bytes_offset, dif_r, sum_r, buffer_r, arr_index);
if CN > 1 {
update_sum_in!(bytes, bytes_offset + 1, dif_g, sum_g, buffer_g, arr_index);
}
if CN > 2 {
update_sum_in!(bytes, bytes_offset + 2, dif_b, sum_b, buffer_b, arr_index);
}
if CN == 4 {
update_sum_in!(bytes, bytes_offset + 3, dif_a, sum_a, buffer_a, arr_index);
}
}
}
}
/// Selects the vertical and horizontal pass implementations for a storage type `T`.
///
/// Both selectors return a plain function pointer whose arguments are, in order:
/// the shared image buffer, stride, width, height, radius, the start and end of the
/// range the call should process, and the edge mode.
trait FastGaussianDispatchProvider<T> {
    /// Returns the vertical pass to use for `radius`, with `CN` channels per pixel.
    fn get_vertical<const CN: usize>(
        radius: u32,
    ) -> fn(&UnsafeSlice<T>, u32, u32, u32, u32, u32, u32, EdgeMode);

    /// Returns the horizontal pass to use for `radius`, with `CN` channels per pixel.
    fn get_horizontal<const CN: usize>(
        radius: u32,
    ) -> fn(
        bytes: &UnsafeSlice<T>,
        stride: u32,
        width: u32,
        height: u32,
        radius: u32,
        start: u32,
        end: u32,
        EdgeMode,
    );
}
/// Pass selection for `u16` images.
///
/// SIMD variants (NEON, then AVX2, then SSE4.1, depending on target and enabled
/// features) are only chosen while the radius is below `BASE_RADIUS_I64_CUTOFF`.
/// Otherwise the scalar pass is used, with `i32`/`f32` intermediates below the
/// cutoff and `i64`/`f64` at or above it.
impl FastGaussianDispatchProvider<u16> for u16 {
    fn get_vertical<const CN: usize>(
        radius: u32,
    ) -> fn(&UnsafeSlice<u16>, u32, u32, u32, u32, u32, u32, EdgeMode) {
        let below_cutoff = radius < BASE_RADIUS_I64_CUTOFF;
        #[cfg(all(target_arch = "aarch64", feature = "neon"))]
        {
            use crate::neon::fg_vertical_pass_neon_u16;
            if below_cutoff {
                return fg_vertical_pass_neon_u16::<CN>;
            }
        }
        #[cfg(all(target_arch = "x86_64", feature = "avx"))]
        {
            use crate::avx::fg_vertical_pass_avx_u16;
            if below_cutoff && std::arch::is_x86_feature_detected!("avx2") {
                return fg_vertical_pass_avx_u16::<CN>;
            }
        }
        #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
        {
            use crate::sse::fg_vertical_pass_sse_u16;
            if below_cutoff && std::arch::is_x86_feature_detected!("sse4.1") {
                return fg_vertical_pass_sse_u16::<CN>;
            }
        }
        // Scalar fallback.
        if below_cutoff {
            fg_vertical_pass::<u16, i32, f32, CN>
        } else {
            fg_vertical_pass::<u16, i64, f64, CN>
        }
    }

    fn get_horizontal<const CN: usize>(
        radius: u32,
    ) -> fn(&UnsafeSlice<u16>, u32, u32, u32, u32, u32, u32, EdgeMode) {
        let below_cutoff = radius < BASE_RADIUS_I64_CUTOFF;
        #[cfg(all(target_arch = "aarch64", feature = "neon"))]
        {
            use crate::neon::fg_horizontal_pass_neon_u16;
            if below_cutoff {
                return fg_horizontal_pass_neon_u16::<CN>;
            }
        }
        #[cfg(all(target_arch = "x86_64", feature = "avx"))]
        {
            use crate::avx::fg_horizontal_pass_avx_u16;
            if below_cutoff && std::arch::is_x86_feature_detected!("avx2") {
                return fg_horizontal_pass_avx_u16::<CN>;
            }
        }
        #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
        {
            use crate::sse::fg_horizontal_pass_sse_u16;
            if below_cutoff && std::arch::is_x86_feature_detected!("sse4.1") {
                return fg_horizontal_pass_sse_u16::<CN>;
            }
        }
        // Scalar fallback.
        if below_cutoff {
            fg_horizontal_pass::<u16, i32, f32, CN>
        } else {
            fg_horizontal_pass::<u16, i64, f64, CN>
        }
    }
}
/// Pass selection for `u8` images.
///
/// Each selector starts from the scalar pass (`i32`/`f32` intermediates below
/// `BASE_RADIUS_I64_CUTOFF`, `i64`/`f64` at or above it) and then lets the
/// cfg-gated blocks below either override that choice or return early. The order
/// of the blocks therefore decides which implementation wins when several apply.
impl FastGaussianDispatchProvider<u8> for u8 {
fn get_horizontal<const CN: usize>(
radius: u32,
) -> fn(&UnsafeSlice<u8>, u32, u32, u32, u32, u32, u32, EdgeMode) {
// Scalar default.
let mut _dispatcher_horizontal: fn(
&UnsafeSlice<u8>,
u32,
u32,
u32,
u32,
u32,
u32,
EdgeMode,
) = if BASE_RADIUS_I64_CUTOFF > radius {
fg_horizontal_pass::<u8, i32, f32, CN>
} else {
fg_horizontal_pass::<u8, i64, f64, CN>
};
// NOTE(review): unlike every other SIMD branch here, the SVE2 path is returned
// without comparing the radius against the cutoff — confirm that is intended.
#[cfg(all(target_arch = "aarch64", feature = "sve"))]
{
if std::arch::is_aarch64_feature_detected!("sve2") {
use crate::sve::fg_horizontal_pass_neon_u8_sve;
return fg_horizontal_pass_neon_u8_sve::<CN>;
}
}
#[cfg(all(target_arch = "aarch64", feature = "neon"))]
{
if BASE_RADIUS_I64_CUTOFF > radius {
_dispatcher_horizontal = fg_horizontal_pass_neon_u8::<CN>;
// The RDM variant replaces plain NEON when the CPU reports the feature.
#[cfg(feature = "rdm")]
{
if std::arch::is_aarch64_feature_detected!("rdm") {
use crate::neon::fg_horizontal_pass_neon_u8_rdm;
_dispatcher_horizontal = fg_horizontal_pass_neon_u8_rdm::<CN>;
}
}
}
}
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
{
if BASE_RADIUS_I64_CUTOFF > radius {
_dispatcher_horizontal = fg_horizontal_pass_wasm_u8::<CN>;
}
}
// NOTE(review): this AVX2 branch imports a function named `..._sse_u8` from
// `crate::avx`, whereas the vertical selector uses `fg_vertical_pass_avx_u8` —
// verify the name against the `avx` module.
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
{
let has_avx = std::arch::is_x86_feature_detected!("avx2");
if has_avx && BASE_RADIUS_I64_CUTOFF > radius {
use crate::avx::fg_horizontal_pass_sse_u8;
return fg_horizontal_pass_sse_u8::<CN>;
}
}
#[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
{
let is_sse_available = std::arch::is_x86_feature_detected!("sse4.1");
if is_sse_available && BASE_RADIUS_I64_CUTOFF > radius {
_dispatcher_horizontal = fg_horizontal_pass_sse_u8::<CN>;
}
}
_dispatcher_horizontal
}
fn get_vertical<const CN: usize>(
radius: u32,
) -> fn(&UnsafeSlice<u8>, u32, u32, u32, u32, u32, u32, EdgeMode) {
// Scalar default.
let mut _dispatcher_vertical: fn(
bytes: &UnsafeSlice<u8>,
stride: u32,
width: u32,
height: u32,
radius: u32,
start: u32,
end: u32,
EdgeMode,
) = if BASE_RADIUS_I64_CUTOFF > radius {
fg_vertical_pass::<u8, i32, f32, CN>
} else {
fg_vertical_pass::<u8, i64, f64, CN>
};
// NOTE(review): the SVE2 path is returned without a radius cutoff check — confirm
// that is intended.
#[cfg(all(target_arch = "aarch64", feature = "sve"))]
{
if std::arch::is_aarch64_feature_detected!("sve2") {
use crate::sve::fg_vertical_pass_neon_u8_sve;
return fg_vertical_pass_neon_u8_sve::<CN>;
}
}
#[cfg(all(target_arch = "aarch64", feature = "neon"))]
{
if BASE_RADIUS_I64_CUTOFF > radius {
_dispatcher_vertical = fg_vertical_pass_neon_u8::<CN>;
// The RDM variant replaces plain NEON when the CPU reports the feature.
#[cfg(feature = "rdm")]
{
if std::arch::is_aarch64_feature_detected!("rdm") {
use crate::neon::fg_vertical_pass_neon_u8_rdm;
_dispatcher_vertical = fg_vertical_pass_neon_u8_rdm::<CN>;
}
}
}
}
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
{
if BASE_RADIUS_I64_CUTOFF > radius {
_dispatcher_vertical = fg_vertical_pass_wasm_u8::<CN>;
}
}
// AVX2 returns early, so it takes precedence over the SSE4.1 override below.
#[cfg(all(target_arch = "x86_64", feature = "avx"))]
{
let has_avx = std::arch::is_x86_feature_detected!("avx2");
if has_avx && BASE_RADIUS_I64_CUTOFF > radius {
use crate::avx::fg_vertical_pass_avx_u8;
return fg_vertical_pass_avx_u8::<CN>;
}
}
#[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
{
let is_sse_available = std::arch::is_x86_feature_detected!("sse4.1");
if is_sse_available && BASE_RADIUS_I64_CUTOFF > radius {
_dispatcher_vertical = fg_vertical_pass_sse_u8::<CN>;
}
}
_dispatcher_vertical
}
}
/// Pass selection for `f32` images.
///
/// AVX2 (when compiled in and detected) is returned directly. Otherwise the scalar
/// pass — `f32` accumulators below `BASE_RADIUS_I64_CUTOFF`, `f64` at or above it —
/// is the default, replaced by the SSE4.1 or NEON variant when available. None of
/// the SIMD variants is gated on the radius here.
impl FastGaussianDispatchProvider<f32> for f32 {
    fn get_vertical<const CN: usize>(
        radius: u32,
    ) -> fn(&UnsafeSlice<f32>, u32, u32, u32, u32, u32, u32, EdgeMode) {
        #[cfg(all(target_arch = "x86_64", feature = "avx"))]
        {
            use crate::avx::fg_vertical_pass_avx_f32;
            if std::arch::is_x86_feature_detected!("avx2") {
                return fg_vertical_pass_avx_f32::<CN>;
            }
        }
        // The leading underscore keeps the binding warning-free on targets where no
        // override below is compiled in.
        let mut _selected: fn(&UnsafeSlice<f32>, u32, u32, u32, u32, u32, u32, EdgeMode) =
            if radius < BASE_RADIUS_I64_CUTOFF {
                fg_vertical_pass::<f32, f32, f32, CN>
            } else {
                fg_vertical_pass::<f32, f64, f64, CN>
            };
        #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
        {
            if std::arch::is_x86_feature_detected!("sse4.1") {
                _selected = fg_vertical_pass_sse_f32::<CN>;
            }
        }
        #[cfg(all(target_arch = "aarch64", feature = "neon"))]
        {
            _selected = fg_vertical_pass_neon_f32::<CN>;
        }
        _selected
    }

    fn get_horizontal<const CN: usize>(
        radius: u32,
    ) -> fn(&UnsafeSlice<f32>, u32, u32, u32, u32, u32, u32, EdgeMode) {
        #[cfg(all(target_arch = "x86_64", feature = "avx"))]
        {
            use crate::avx::fg_horizontal_pass_avx_f32;
            if std::arch::is_x86_feature_detected!("avx2") {
                return fg_horizontal_pass_avx_f32::<CN>;
            }
        }
        let mut _selected: fn(&UnsafeSlice<f32>, u32, u32, u32, u32, u32, u32, EdgeMode) =
            if radius < BASE_RADIUS_I64_CUTOFF {
                fg_horizontal_pass::<f32, f32, f32, CN>
            } else {
                fg_horizontal_pass::<f32, f64, f64, CN>
            };
        #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
        {
            if std::arch::is_x86_feature_detected!("sse4.1") {
                _selected = fg_horizontal_pass_sse_f32::<CN>;
            }
        }
        #[cfg(all(target_arch = "aarch64", feature = "neon"))]
        {
            _selected = fg_horizontal_pass_neon_f32::<CN>;
        }
        _selected
    }
}
/// Pass selection for `f16` images (only with the `nightly_f16` feature).
///
/// The scalar pass — `f32` accumulators below `BASE_RADIUS_I64_CUTOFF`, `f64` at or
/// above it — is the default. It is replaced by the NEON variant on aarch64, or by
/// the SSE variant on x86 when both `sse4.1` and `f16c` are detected.
#[cfg(feature = "nightly_f16")]
impl FastGaussianDispatchProvider<f16> for f16 {
    fn get_vertical<const CN: usize>(
        radius: u32,
    ) -> fn(&UnsafeSlice<f16>, u32, u32, u32, u32, u32, u32, EdgeMode) {
        // The leading underscore keeps the binding warning-free on targets where no
        // override below is compiled in.
        let mut _selected: fn(&UnsafeSlice<f16>, u32, u32, u32, u32, u32, u32, EdgeMode) =
            if radius < BASE_RADIUS_I64_CUTOFF {
                fg_vertical_pass::<f16, f32, f32, CN>
            } else {
                fg_vertical_pass::<f16, f64, f64, CN>
            };
        #[cfg(all(target_arch = "aarch64", feature = "neon"))]
        {
            _selected = fg_vertical_pass_neon_f16::<CN>;
        }
        #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
        {
            let sse_f16_ready = std::arch::is_x86_feature_detected!("sse4.1")
                && std::arch::is_x86_feature_detected!("f16c");
            if sse_f16_ready {
                // NOTE(review): the vertical SSE pass takes `<f16, CN>` while the
                // horizontal one takes only `<CN>` — confirm against `crate::sse`.
                _selected = fg_vertical_pass_sse_f16::<f16, CN>;
            }
        }
        _selected
    }

    fn get_horizontal<const CN: usize>(
        radius: u32,
    ) -> fn(&UnsafeSlice<f16>, u32, u32, u32, u32, u32, u32, EdgeMode) {
        let mut _selected: fn(&UnsafeSlice<f16>, u32, u32, u32, u32, u32, u32, EdgeMode) =
            if radius < BASE_RADIUS_I64_CUTOFF {
                fg_horizontal_pass::<f16, f32, f32, CN>
            } else {
                fg_horizontal_pass::<f16, f64, f64, CN>
            };
        #[cfg(all(target_arch = "aarch64", feature = "neon"))]
        {
            _selected = fg_horizontal_pass_neon_f16::<CN>;
        }
        #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
        {
            let sse_f16_ready = std::arch::is_x86_feature_detected!("sse4.1")
                && std::arch::is_x86_feature_detected!("f16c");
            if sse_f16_ready {
                _selected = fg_horizontal_pass_sse_f16::<CN>;
            }
        }
        _selected
    }
}
/// Runs the two-pass blur on `bytes` in place: first the vertical pass over all
/// columns, then the horizontal pass over all rows.
///
/// The pass implementations come from `T`'s `FastGaussianDispatchProvider`, chosen
/// with `radius.y_axis` for the vertical pass and `radius.x_axis` for the
/// horizontal one. The work is split over the number of threads reported by
/// `threading_policy`: columns (vertical pass) or rows (horizontal pass) are
/// divided into equal segments, and the last worker also takes the remainder.
fn fast_gaussian_impl<T, const CN: usize>(
    bytes: &mut [T],
    stride: u32,
    width: u32,
    height: u32,
    radius: AnisotropicRadius,
    threading_policy: ThreadingPolicy,
    edge_modes: EdgeMode2D,
) where
    T: Default
        + Send
        + Sync
        + std::ops::AddAssign
        + std::ops::SubAssign
        + Copy
        + PrimitiveCast<i32>
        + PrimitiveCast<i64>
        + PrimitiveCast<f32>
        + PrimitiveCast<f64>
        + InitialValue
        + FastGaussianDispatchProvider<T>,
    f32: PrimitiveCast<T> + ToStorage<T>,
    f64: PrimitiveCast<T> + ToStorage<T>,
{
    let shared_image = UnsafeSlice::new(bytes);
    let workers = threading_policy.thread_count(width, height) as u32;
    let vertical_pass = T::get_vertical::<CN>(radius.y_axis);
    let horizontal_pass = T::get_horizontal::<CN>(radius.x_axis);
    let pool = novtb::ThreadPool::new(workers as usize);

    // Vertical pass: each worker owns a contiguous range of columns.
    pool.parallel_for(|thread_index| {
        let columns_per_worker = width as usize / workers as usize;
        let first_column = thread_index * columns_per_worker;
        let is_last_worker = thread_index == workers as usize - 1;
        let column_limit = if is_last_worker {
            width as usize
        } else {
            (thread_index + 1) * columns_per_worker
        };
        vertical_pass(
            &shared_image,
            stride,
            width,
            height,
            radius.y_axis,
            first_column as u32,
            column_limit as u32,
            edge_modes.vertical,
        );
    });

    // Horizontal pass: each worker owns a contiguous range of rows.
    pool.parallel_for(|thread_index| {
        let rows_per_worker = height / workers;
        let first_row = thread_index as u32 * rows_per_worker;
        let is_last_worker = thread_index as u32 == workers - 1;
        let row_limit = if is_last_worker {
            height
        } else {
            (thread_index as u32 + 1) * rows_per_worker
        };
        horizontal_pass(
            &shared_image,
            stride,
            width,
            height,
            radius.x_axis,
            first_row,
            row_limit,
            edge_modes.horizontal,
        );
    });
}
/// Blurs a `u8` image in place with the fast gaussian approximation.
///
/// # Arguments
///
/// * `image` - image to blur; its layout is validated with `check_layout` first.
/// * `radius` - per-axis radius, passed through `AnisotropicRadius::clamp(1, 319)`
///   before use.
/// * `threading_policy` - decides how many threads the passes are split over.
/// * `edge_modes` - edge handling for the horizontal and vertical passes.
///
/// # Errors
///
/// Returns the error reported by `check_layout` when the image layout is invalid.
pub fn fast_gaussian(
    image: &mut BlurImageMut<u8>,
    radius: AnisotropicRadius,
    threading_policy: ThreadingPolicy,
    edge_modes: EdgeMode2D,
) -> Result<(), BlurError> {
    image.check_layout(None)?;
    let bounded_radius = radius.clamp(1, 319);
    let (width, height) = (image.width, image.height);
    let stride = image.row_stride();
    let channels = image.channels;
    let pixels = image.data.borrow_mut();
    impl_margin_call!(
        u8,
        channels,
        edge_modes,
        pixels,
        stride,
        width,
        height,
        bounded_radius,
        threading_policy
    );
    Ok(())
}
/// Blurs a `u16` image in place with the fast gaussian approximation.
///
/// # Arguments
///
/// * `image` - image to blur; its layout is validated with `check_layout` first.
/// * `radius` - per-axis radius, passed through `AnisotropicRadius::clamp(1, 255)`
///   before use.
/// * `threading_policy` - decides how many threads the passes are split over.
/// * `edge_modes` - edge handling for the horizontal and vertical passes.
///
/// # Errors
///
/// Returns the error reported by `check_layout` when the image layout is invalid.
pub fn fast_gaussian_u16(
    image: &mut BlurImageMut<u16>,
    radius: AnisotropicRadius,
    threading_policy: ThreadingPolicy,
    edge_modes: EdgeMode2D,
) -> Result<(), BlurError> {
    image.check_layout(None)?;
    let bounded_radius = radius.clamp(1, 255);
    let (width, height) = (image.width, image.height);
    let stride = image.row_stride();
    let channels = image.channels;
    let pixels = image.data.borrow_mut();
    impl_margin_call!(
        u16,
        channels,
        edge_modes,
        pixels,
        stride,
        width,
        height,
        bounded_radius,
        threading_policy
    );
    Ok(())
}
/// Blurs an `f32` image in place with the fast gaussian approximation.
///
/// # Arguments
///
/// * `image` - image to blur; its layout is validated with `check_layout` first.
/// * `radius` - per-axis radius; each axis is raised to at least 1.
/// * `threading_policy` - decides how many threads the passes are split over.
/// * `edge_modes` - edge handling for the horizontal and vertical passes.
///
/// # Errors
///
/// Returns the error reported by `check_layout` when the image layout is invalid.
// NOTE(review): no upper bound is applied to the radius here, while the scalar
// passes in this file keep samples in 1024-entry ring buffers — confirm whether
// radii above 512 are meant to be supported or should be clamped.
pub fn fast_gaussian_f32(
    image: &mut BlurImageMut<f32>,
    radius: AnisotropicRadius,
    threading_policy: ThreadingPolicy,
    edge_modes: EdgeMode2D,
) -> Result<(), BlurError> {
    image.check_layout(None)?;
    let (width, height) = (image.width, image.height);
    let stride = image.row_stride();
    let channels = image.channels;
    let pixels = image.data.borrow_mut();
    let radius_x = radius.x_axis.max(1);
    let radius_y = radius.y_axis.max(1);
    let effective_radius = AnisotropicRadius::create(radius_x, radius_y);
    impl_margin_call!(
        f32,
        channels,
        edge_modes,
        pixels,
        stride,
        width,
        height,
        effective_radius,
        threading_policy
    );
    Ok(())
}
/// Blurs an `f16` image in place with the fast gaussian approximation.
///
/// Only available with the `nightly_f16` feature.
///
/// # Arguments
///
/// * `image` - image to blur; its layout is validated with `check_layout` first.
/// * `radius` - per-axis radius; each axis is raised to at least 1.
/// * `threading_policy` - decides how many threads the passes are split over.
/// * `edge_modes` - edge handling for the horizontal and vertical passes.
///
/// # Errors
///
/// Returns the error reported by `check_layout` when the image layout is invalid.
// NOTE(review): no upper bound is applied to the radius here, while the scalar
// passes in this file keep samples in 1024-entry ring buffers — confirm whether
// radii above 512 are meant to be supported or should be clamped.
#[cfg(feature = "nightly_f16")]
#[cfg_attr(docsrs, doc(cfg(feature = "nightly_f16")))]
pub fn fast_gaussian_f16(
    image: &mut BlurImageMut<f16>,
    radius: AnisotropicRadius,
    threading_policy: ThreadingPolicy,
    edge_modes: EdgeMode2D,
) -> Result<(), BlurError> {
    image.check_layout(None)?;
    let (width, height) = (image.width, image.height);
    let stride = image.row_stride();
    let channels = image.channels;
    let pixels = image.data.borrow_mut();
    let radius_x = radius.x_axis.max(1);
    let radius_y = radius.y_axis.max(1);
    let effective_radius = AnisotropicRadius::create(radius_x, radius_y);
    impl_margin_call!(
        f16,
        channels,
        edge_modes,
        pixels,
        stride,
        width,
        height,
        effective_radius,
        threading_policy
    );
    Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Advances a 64-bit linear congruential generator and returns the new state.
    fn lcg_rand(state: &mut u64) -> u64 {
        *state = state
            .wrapping_mul(6364136223846793005)
            .wrapping_add(1442695040888963407);
        *state
    }

    /// Draws a value in `min..=max` from the generator.
    fn rand_dimension(state: &mut u64, min: usize, max: usize) -> usize {
        min + (lcg_rand(state) as usize % (max - min + 1))
    }

    /// Picks a random `(width, height)` with both sides in `16..=512`.
    ///
    /// The seed is derived from the wall clock, so the dimensions differ between
    /// runs. Every assertion below therefore reports the dimensions it ran with,
    /// which makes a failing combination reproducible.
    fn random_dims() -> (usize, usize) {
        let mut rng = 0xdeadbeef_u64
            ^ std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                .unwrap()
                .subsec_nanos() as u64;
        let width = rand_dimension(&mut rng, 16, 512);
        let height = rand_dimension(&mut rng, 16, 512);
        (width, height)
    }

    /// Blurring a constant `u8` image must leave every sample within 3 of the input.
    #[test]
    fn test_fast_gaussian_u8_q_k5() {
        let (width, height) = random_dims();
        let mut dst = vec![126; width * height * 3];
        let mut dst_image = BlurImageMut::borrow(
            &mut dst,
            width as u32,
            height as u32,
            FastBlurChannels::Channels3,
        );
        fast_gaussian(
            &mut dst_image,
            AnisotropicRadius::new(5),
            ThreadingPolicy::Single,
            EdgeMode2D::new(EdgeMode::Clamp),
        )
        .unwrap();
        for (i, &cn) in dst.iter().enumerate() {
            let diff = (cn as i32 - 126).abs();
            assert!(
                diff <= 3,
                "Diff expected to be at most 3, but it was {diff} at {i} (image {width}x{height})"
            );
        }
    }

    /// Blurring a constant `u16` image must leave every sample within 14 of the input.
    #[test]
    fn test_fast_gaussian_u16_fp_k25() {
        let (width, height) = random_dims();
        let mut dst = vec![17234u16; width * height * 3];
        let mut dst_image = BlurImageMut::borrow(
            &mut dst,
            width as u32,
            height as u32,
            FastBlurChannels::Channels3,
        );
        fast_gaussian_u16(
            &mut dst_image,
            AnisotropicRadius::new(5),
            ThreadingPolicy::Single,
            EdgeMode2D::new(EdgeMode::Clamp),
        )
        .unwrap();
        for (i, &cn) in dst.iter().enumerate() {
            let diff = (cn as i32 - 17234i32).abs();
            assert!(
                diff <= 14,
                "Diff expected to be at most 14, but it was {diff} at {i} (image {width}x{height})"
            );
        }
    }

    /// Blurring a constant `f32` image must leave every sample within 1e-4 of the input.
    #[test]
    fn test_fast_gaussian_f32_k25() {
        let (width, height) = random_dims();
        let mut dst = vec![0.432; width * height * 3];
        let mut dst_image = BlurImageMut::borrow(
            &mut dst,
            width as u32,
            height as u32,
            FastBlurChannels::Channels3,
        );
        fast_gaussian_f32(
            &mut dst_image,
            AnisotropicRadius::new(25),
            ThreadingPolicy::Single,
            EdgeMode2D::new(EdgeMode::Clamp),
        )
        .unwrap();
        for (i, &cn) in dst.iter().enumerate() {
            let diff = (cn - 0.432).abs();
            assert!(
                diff <= 1e-4,
                "Diff expected to be at most 1e-4, but it was {diff} at {i} (image {width}x{height})"
            );
        }
    }
}