#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
use crate::neon::{
fast_gaussian_next_horizontal_pass_neon_f32, fast_gaussian_next_horizontal_pass_neon_u8,
fast_gaussian_next_vertical_pass_neon_f32, fast_gaussian_next_vertical_pass_neon_u8,
};
use crate::reflect_index;
#[cfg(all(
any(target_arch = "x86_64", target_arch = "x86"),
target_feature = "sse4.1"
))]
use crate::sse::{
fast_gaussian_next_horizontal_pass_sse_u8, fast_gaussian_next_vertical_pass_sse_u8,
};
use crate::to_storage::ToStorage;
use crate::unsafe_slice::UnsafeSlice;
use crate::{clamp_edge, reflect_101, EdgeMode, FastBlurChannels, ThreadingPolicy};
use colorutils_rs::linear_to_planar::linear_to_plane;
use colorutils_rs::planar_to_linear::plane_to_linear;
use colorutils_rs::{
linear_to_rgb, linear_to_rgba, rgb_to_linear, rgba_to_linear, TransferFunction,
};
use num_traits::{AsPrimitive, Float, FromPrimitive};
use std::mem::size_of;
/// Radius threshold used by `fast_gaussian_next_impl` when picking accumulator types:
/// radii below this value run with the narrow accumulators (`i32` / `f32`), radii equal
/// to or above it run with the wide ones (`i64` / `f64`). The SIMD passes are likewise
/// only selected for radii below this value.
const BASE_RADIUS_I64_CUTOFF: u32 = 125;
// Forwards a blur request to `fast_gaussian_next_impl`, translating the runtime
// `FastBlurChannels` value into the compile-time channel count (1, 3 or 4).
// `$edge` is spliced in as the `EDGE_MODE` const generic argument, so it has to be
// something usable in that position (the callers pass `{ EdgeMode::X as usize }`).
macro_rules! impl_generic_call {
    ($pixel:ty, $layout:expr, $edge:expr, $data:expr, $row_stride:expr, $cols:expr, $rows:expr, $blur_radius:expr, $policy:expr) => {
        match $layout {
            FastBlurChannels::Plane => fast_gaussian_next_impl::<$pixel, 1, $edge>(
                $data,
                $row_stride,
                $cols,
                $rows,
                $blur_radius,
                $policy,
            ),
            FastBlurChannels::Channels3 => fast_gaussian_next_impl::<$pixel, 3, $edge>(
                $data,
                $row_stride,
                $cols,
                $rows,
                $blur_radius,
                $policy,
            ),
            FastBlurChannels::Channels4 => fast_gaussian_next_impl::<$pixel, 4, $edge>(
                $data,
                $row_stride,
                $cols,
                $rows,
                $blur_radius,
                $policy,
            ),
        }
    };
}
// Turns the runtime `EdgeMode` into the `EDGE_MODE` const generic expected by the
// blur implementation and then delegates to `impl_generic_call!`.
// `EdgeMode::KernelClip` is not handled by this blur and panics.
macro_rules! impl_margin_call {
    ($pixel:ty, $layout:expr, $border:expr,
     $data:expr, $row_stride:expr, $cols:expr, $rows:expr,
     $blur_radius:expr, $policy:expr) => {
        match $border {
            EdgeMode::Clamp => {
                impl_generic_call!(
                    $pixel,
                    $layout,
                    { EdgeMode::Clamp as usize },
                    $data,
                    $row_stride,
                    $cols,
                    $rows,
                    $blur_radius,
                    $policy
                );
            }
            EdgeMode::Wrap => {
                impl_generic_call!(
                    $pixel,
                    $layout,
                    { EdgeMode::Wrap as usize },
                    $data,
                    $row_stride,
                    $cols,
                    $rows,
                    $blur_radius,
                    $policy
                );
            }
            EdgeMode::Reflect => {
                impl_generic_call!(
                    $pixel,
                    $layout,
                    { EdgeMode::Reflect as usize },
                    $data,
                    $row_stride,
                    $cols,
                    $rows,
                    $blur_radius,
                    $policy
                );
            }
            EdgeMode::Reflect101 => {
                impl_generic_call!(
                    $pixel,
                    $layout,
                    { EdgeMode::Reflect101 as usize },
                    $data,
                    $row_stride,
                    $cols,
                    $rows,
                    $blur_radius,
                    $policy
                );
            }
            // This blur has no kernel to clip, so the mode is rejected outright.
            EdgeMode::KernelClip => {
                panic!("Kernel clip is supported only in gaussian")
            }
        }
    };
}
// Converts the accumulator `$acc` to the float type `M`, multiplies it by `$scale`,
// converts the product to the storage type `T` and writes it to `$target` at `$index`.
// The types `M` and `T` are taken from the call site.
macro_rules! write_out_blurred {
    ($acc:expr, $scale:expr, $target:expr, $index:expr) => {{
        let as_float: M = $acc.as_();
        let stored: T = (as_float * $scale).to_();
        // NOTE(review): `write` is an unsafe call; the call site is responsible for
        // `$index` being a valid position in `$target` — confirm against its contract.
        unsafe {
            $target.write($index, stored);
        }
    }};
}
// Adds `3 * (ring[$at] - ring[$at_next]) - ring[$at_prev]` to `$acc`.
// The accumulator type `J` is taken from the call site.
macro_rules! update_differences_inside {
    ($acc:expr, $ring:expr, $at:expr, $at_next:expr, $at_prev:expr) => {{
        let three = J::from_i32(3i32).unwrap();
        // The reads are unchecked: every index handed in must already be within
        // the bounds of `$ring`.
        let (centre, next, prev) = unsafe {
            (
                *$ring.get_unchecked($at),
                *$ring.get_unchecked($at_next),
                *$ring.get_unchecked($at_prev),
            )
        };
        $acc += three * (centre - next) - prev;
    }};
}
// Adds `3 * (ring[$at] - ring[$at_next])` to `$acc`; the variant of the difference
// update that leaves out the trailing subtraction.
// The accumulator type `J` is taken from the call site.
macro_rules! update_differences_one_rad {
    ($acc:expr, $ring:expr, $at:expr, $at_next:expr) => {{
        let three = J::from_i32(3i32).unwrap();
        // The reads are unchecked: both indices must already be within the bounds
        // of `$ring`.
        let (centre, next) = unsafe { (*$ring.get_unchecked($at), *$ring.get_unchecked($at_next)) };
        $acc += three * (centre - next);
    }};
}
// Subtracts `3 * ring[$at]` from `$acc`.
// The accumulator type `J` is taken from the call site.
macro_rules! update_differences_two_rad {
    ($acc:expr, $ring:expr, $at:expr) => {{
        let three = J::from_i32(3i32).unwrap();
        // The read is unchecked: `$at` must already be within the bounds of `$ring`.
        let sample = unsafe { *$ring.get_unchecked($at) };
        $acc -= three * sample;
    }};
}
// Loads `$src[$src_index]`, converts it to the accumulator type `J`, records it in
// `$ring` at `$slot` and feeds it through the accumulator chain:
// `$dif += value`, `$der += $dif`, `$sum += $der`.
macro_rules! update_sum_in {
    ($src:expr, $src_index:expr, $dif:expr, $der:expr, $sum:expr, $ring:expr, $slot:expr) => {{
        let incoming: J = $src[$src_index].as_();
        // The store is unchecked: `$slot` must already be within the bounds of `$ring`.
        unsafe {
            *$ring.get_unchecked_mut($slot) = incoming;
        }
        $dif += incoming;
        $der += $dif;
        $sum += $der;
    }};
}
/// Vertical (column-wise) pass of the blur, executed in place for the columns
/// `start..min(width, end)`.
///
/// Each column is swept from row `-3 * radius` to `height - 1`. Every step fetches
/// the sample `3 * radius / 2` rows ahead of the current one (the row index goes
/// through `clamp_edge!` for the compile-time `EDGE_MODE`), stores it in a 1024-slot
/// ring buffer and pushes it through three chained running accumulators
/// (`dif` -> `der` -> `sum`). From row 0 onwards, `sum / radius^3` is written to the
/// current row before the accumulators advance.
///
/// * `T` is the stored component type, `J` the accumulator type and `M` the float
///   type used for the final scaling.
/// * `CHANNEL_CONFIGURATION` is the number of interleaved components per pixel.
/// * `stride` is the distance between two rows, counted in elements of `T`.
fn fast_gaussian_next_vertical_pass<
    T: FromPrimitive + Default,
    J,
    M,
    const CHANNEL_CONFIGURATION: usize,
    const EDGE_MODE: usize,
>(
    bytes: &UnsafeSlice<T>,
    stride: u32,
    width: u32,
    height: u32,
    radius: u32,
    start: u32,
    end: u32,
) where
    T: std::ops::AddAssign
        + 'static
        + std::ops::SubAssign
        + Copy
        + FromPrimitive
        + Default
        + AsPrimitive<J>,
    J: Copy
        + FromPrimitive
        + Default
        + std::ops::Mul<Output = J>
        + std::ops::Sub<Output = J>
        + std::ops::Add<Output = J>
        + std::ops::AddAssign
        + std::ops::SubAssign
        + AsPrimitive<M>,
    M: Copy + FromPrimitive + std::ops::Mul<Output = M> + AsPrimitive<T> + Float + ToStorage<T>,
    i32: AsPrimitive<J>,
{
    // Number of components handled per pixel: the first one is always processed,
    // the second needs more than one channel, the third more than two and the
    // fourth exactly four.
    let lanes: usize = match CHANNEL_CONFIGURATION {
        0 | 1 => 1,
        2 => 2,
        4 => 4,
        _ => 3,
    };
    let edge_mode: EdgeMode = EDGE_MODE.into();
    let zero: J = 0i32.as_();
    // One ring buffer per component; slots are always addressed through `& 1023`,
    // which keeps every index inside the 1024 entries.
    let mut ring = [[zero; 1024]; 4];
    let radius_64 = radius as i64;
    let height_wide = height as i64;
    let weight =
        M::from_f64(1.0f64 / ((radius as f64) * (radius as f64) * (radius as f64))).unwrap();
    let first_row = -3 * radius_64;

    for x in start..std::cmp::min(width, end) {
        let mut dif = [zero; 4];
        let mut der = [zero; 4];
        let mut sum = [zero; 4];
        let column = (x * CHANNEL_CONFIGURATION as u32) as usize;

        for y in first_row..height_wide {
            if y >= 0 {
                // Emit the blurred value for this row, then retire old samples.
                let out_base = (y * (stride as i64)) as usize + column;
                for c in 0..lanes {
                    write_out_blurred!(sum[c], weight, bytes, out_base + c);
                }
                let here = (y & 1023) as usize;
                let ahead = ((y + radius_64) & 1023) as usize;
                let behind = ((y - radius_64) & 1023) as usize;
                for c in 0..lanes {
                    update_differences_inside!(dif[c], ring[c], here, ahead, behind);
                }
            } else if y + radius_64 >= 0 {
                let here = (y & 1023) as usize;
                let ahead = ((y + radius_64) & 1023) as usize;
                for c in 0..lanes {
                    update_differences_one_rad!(dif[c], ring[c], here, ahead);
                }
            } else if y + 2 * radius_64 >= 0 {
                let ahead = ((y + radius_64) & 1023) as usize;
                for c in 0..lanes {
                    update_differences_two_rad!(dif[c], ring[c], ahead);
                }
            }

            // Pull in the look-ahead sample and advance the accumulators.
            let src_row = clamp_edge!(edge_mode, y + ((3 * radius_64) >> 1), 0, height_wide - 1)
                * (stride as usize);
            let src = src_row + column;
            let slot = ((y + 2 * radius_64) & 1023) as usize;
            for c in 0..lanes {
                update_sum_in!(bytes, src + c, dif[c], der[c], sum[c], ring[c], slot);
            }
        }
    }
}
/// Horizontal (row-wise) pass of the blur, executed in place for the rows
/// `start..min(height, end)`.
///
/// Each row is swept from column `-3 * radius` to `width - 1`. Every step fetches
/// the pixel `3 * radius / 2` columns ahead of the current one (the column index
/// goes through `clamp_edge!` for the compile-time `EDGE_MODE`), stores it in a
/// 1024-slot ring buffer and pushes it through three chained running accumulators
/// (`dif` -> `der` -> `sum`). From column 0 onwards, `sum / radius^3` is written to
/// the current pixel before the accumulators advance.
///
/// * `T` is the stored component type, `J` the accumulator type and `M` the float
///   type used for the final scaling.
/// * `CHANNEL_CONFIGURATION` is the number of interleaved components per pixel.
/// * `stride` is the distance between two rows, counted in elements of `T`.
fn fast_gaussian_next_horizontal_pass<
    T: FromPrimitive + Default + Send + Sync,
    J,
    M,
    const CHANNEL_CONFIGURATION: usize,
    const EDGE_MODE: usize,
>(
    bytes: &UnsafeSlice<T>,
    stride: u32,
    width: u32,
    height: u32,
    radius: u32,
    start: u32,
    end: u32,
) where
    T: std::ops::AddAssign
        + 'static
        + std::ops::SubAssign
        + Copy
        + FromPrimitive
        + Default
        + AsPrimitive<J>,
    J: Copy
        + FromPrimitive
        + Default
        + std::ops::Mul<Output = J>
        + std::ops::Sub<Output = J>
        + std::ops::Add<Output = J>
        + std::ops::AddAssign
        + std::ops::SubAssign
        + AsPrimitive<M>,
    M: Copy + FromPrimitive + std::ops::Mul<Output = M> + AsPrimitive<T> + Float + ToStorage<T>,
    f32: AsPrimitive<T>,
    i32: AsPrimitive<J>,
{
    // Number of components handled per pixel: the first one is always processed,
    // the second needs more than one channel, the third more than two and the
    // fourth exactly four.
    let lanes: usize = match CHANNEL_CONFIGURATION {
        0 | 1 => 1,
        2 => 2,
        4 => 4,
        _ => 3,
    };
    let edge_mode: EdgeMode = EDGE_MODE.into();
    let zero: J = 0i32.as_();
    // One ring buffer per component; slots are always addressed through `& 1023`,
    // which keeps every index inside the 1024 entries.
    let mut ring = [[zero; 1024]; 4];
    let radius_64 = radius as i64;
    let width_wide = width as i64;
    let weight =
        M::from_f64(1.0f64 / ((radius as f64) * (radius as f64) * (radius as f64))).unwrap();

    for y in start..std::cmp::min(height, end) {
        let mut dif = [zero; 4];
        let mut der = [zero; 4];
        let mut sum = [zero; 4];
        let row_base = ((y as i64) * (stride as i64)) as usize;

        for x in (0 - 3 * radius_64)..(width as i64) {
            if x >= 0 {
                // Emit the blurred value for this pixel, then retire old samples.
                let out_base = row_base + x as usize * CHANNEL_CONFIGURATION;
                for c in 0..lanes {
                    write_out_blurred!(sum[c], weight, bytes, out_base + c);
                }
                let here = (x & 1023) as usize;
                let ahead = ((x + radius_64) & 1023) as usize;
                let behind = ((x - radius_64) & 1023) as usize;
                for c in 0..lanes {
                    update_differences_inside!(dif[c], ring[c], here, ahead, behind);
                }
            } else if x + radius_64 >= 0 {
                let here = (x & 1023) as usize;
                let ahead = ((x + radius_64) & 1023) as usize;
                for c in 0..lanes {
                    update_differences_one_rad!(dif[c], ring[c], here, ahead);
                }
            } else if x + 2 * radius_64 >= 0 {
                let ahead = ((x + radius_64) & 1023) as usize;
                for c in 0..lanes {
                    update_differences_two_rad!(dif[c], ring[c], ahead);
                }
            }

            // Pull in the look-ahead pixel and advance the accumulators.
            let src_row = (y as usize) * (stride as usize);
            let src_col = clamp_edge!(edge_mode, x + 3 * radius_64 / 2, 0, width_wide - 1)
                * CHANNEL_CONFIGURATION;
            let src = src_row + src_col;
            let slot = ((x + 2 * radius_64) & 1023) as usize;
            for c in 0..lanes {
                update_sum_in!(bytes, src + c, dif[c], der[c], sum[c], ring[c], slot);
            }
        }
    }
}
fn fast_gaussian_next_impl<
T: FromPrimitive + Default + Send + Sync,
const CHANNEL_CONFIGURATION: usize,
const EDGE_MODE: usize,
>(
bytes: &mut [T],
stride: u32,
width: u32,
height: u32,
radius: u32,
threading_policy: ThreadingPolicy,
) where
T: std::ops::AddAssign
+ std::ops::SubAssign
+ Copy
+ AsPrimitive<f32>
+ AsPrimitive<f64>
+ AsPrimitive<i64>
+ AsPrimitive<i32>,
i64: AsPrimitive<T>,
f32: AsPrimitive<T> + ToStorage<T>,
f64: AsPrimitive<T> + ToStorage<T>,
{
let mut _dispatcher_vertical: fn(
bytes: &UnsafeSlice<T>,
stride: u32,
width: u32,
height: u32,
radius: u32,
start: u32,
end: u32,
) = if BASE_RADIUS_I64_CUTOFF > radius {
fast_gaussian_next_vertical_pass::<T, i32, f32, CHANNEL_CONFIGURATION, EDGE_MODE>
} else {
fast_gaussian_next_vertical_pass::<T, i64, f64, CHANNEL_CONFIGURATION, EDGE_MODE>
};
let mut _dispatcher_horizontal: fn(
bytes: &UnsafeSlice<T>,
stride: u32,
width: u32,
height: u32,
radius: u32,
start: u32,
end: u32,
) = if BASE_RADIUS_I64_CUTOFF > radius {
fast_gaussian_next_horizontal_pass::<T, i32, f32, CHANNEL_CONFIGURATION, EDGE_MODE>
} else {
fast_gaussian_next_horizontal_pass::<T, i64, f64, CHANNEL_CONFIGURATION, EDGE_MODE>
};
if std::any::type_name::<T>() == "f32"
|| std::any::type_name::<T>() == "f16"
|| std::any::type_name::<T>() == "half::f16"
{
_dispatcher_vertical = if BASE_RADIUS_I64_CUTOFF > radius {
fast_gaussian_next_vertical_pass::<T, f32, f32, CHANNEL_CONFIGURATION, EDGE_MODE>
} else {
fast_gaussian_next_vertical_pass::<T, f64, f64, CHANNEL_CONFIGURATION, EDGE_MODE>
};
_dispatcher_horizontal = if BASE_RADIUS_I64_CUTOFF > radius {
fast_gaussian_next_horizontal_pass::<T, f32, f32, CHANNEL_CONFIGURATION, EDGE_MODE>
} else {
fast_gaussian_next_horizontal_pass::<T, f64, f64, CHANNEL_CONFIGURATION, EDGE_MODE>
};
}
if CHANNEL_CONFIGURATION >= 3 {
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
{
if BASE_RADIUS_I64_CUTOFF > radius {
if std::any::type_name::<T>() == "u8" {
_dispatcher_vertical = fast_gaussian_next_vertical_pass_neon_u8::<
T,
CHANNEL_CONFIGURATION,
EDGE_MODE,
>;
_dispatcher_horizontal = fast_gaussian_next_horizontal_pass_neon_u8::<
T,
CHANNEL_CONFIGURATION,
EDGE_MODE,
>;
} else if std::any::type_name::<T>() == "f32" {
_dispatcher_horizontal = fast_gaussian_next_horizontal_pass_neon_f32::<
T,
CHANNEL_CONFIGURATION,
EDGE_MODE,
>;
_dispatcher_vertical = fast_gaussian_next_vertical_pass_neon_f32::<
T,
CHANNEL_CONFIGURATION,
EDGE_MODE,
>;
}
}
}
#[cfg(all(
any(target_arch = "x86_64", target_arch = "x86"),
target_feature = "sse4.1"
))]
{
if BASE_RADIUS_I64_CUTOFF > radius {
if std::any::type_name::<T>() == "u8" {
_dispatcher_vertical = fast_gaussian_next_vertical_pass_sse_u8::<
T,
CHANNEL_CONFIGURATION,
EDGE_MODE,
>;
_dispatcher_horizontal = fast_gaussian_next_horizontal_pass_sse_u8::<
T,
CHANNEL_CONFIGURATION,
EDGE_MODE,
>;
}
}
}
}
let thread_count = threading_policy.get_threads_count(width, height) as u32;
let pool = rayon::ThreadPoolBuilder::new()
.num_threads(thread_count as usize)
.build()
.unwrap();
let unsafe_image = UnsafeSlice::new(bytes);
pool.scope(|scope| {
let segment_size = width / thread_count;
for i in 0..thread_count {
let start_x = i * segment_size;
let mut end_x = (i + 1) * segment_size;
if i == thread_count - 1 {
end_x = width;
}
scope.spawn(move |_| {
_dispatcher_vertical(&unsafe_image, stride, width, height, radius, start_x, end_x);
});
}
});
pool.scope(|scope| {
let segment_size = height / thread_count;
for i in 0..thread_count {
let start_y = i * segment_size;
let mut end_y = (i + 1) * segment_size;
if i == thread_count - 1 {
end_y = height;
}
scope.spawn(move |_| {
_dispatcher_horizontal(
&unsafe_image,
stride,
width,
height,
radius,
start_y,
end_y,
);
});
}
});
}
/// Blurs an 8-bit image in place.
///
/// # Arguments
///
/// * `bytes` - interleaved image data, modified in place.
/// * `stride` - distance between two rows, counted in elements of `bytes`.
/// * `width`, `height` - image size in pixels.
/// * `radius` - blur radius, limited to the range `1..=280`. A radius of 0 is treated
///   as 1: the blur scales by `1 / radius^3`, so 0 would produce an infinite weight
///   and corrupt the image.
/// * `channels` - pixel layout (1, 3 or 4 interleaved channels).
/// * `threading_policy` - decides how many worker threads are used.
/// * `edge_mode` - how samples outside the image are resolved.
///
/// # Panics
///
/// Panics when `edge_mode` is `EdgeMode::KernelClip`, which this blur does not support.
pub fn fast_gaussian_next(
    bytes: &mut [u8],
    stride: u32,
    width: u32,
    height: u32,
    radius: u32,
    channels: FastBlurChannels,
    threading_policy: ThreadingPolicy,
    edge_mode: EdgeMode,
) {
    let radius = radius.clamp(1, 280);
    impl_margin_call!(
        u8,
        channels,
        edge_mode,
        bytes,
        stride,
        width,
        height,
        radius,
        threading_policy
    );
}
/// Blurs a 16-bit image in place.
///
/// # Arguments
///
/// * `bytes` - interleaved image data, modified in place.
/// * `stride` - distance between two rows, counted in elements of `bytes`.
/// * `width`, `height` - image size in pixels.
/// * `radius` - blur radius, limited to the range `1..=152`. A radius of 0 is treated
///   as 1: the blur scales by `1 / radius^3`, so 0 would produce an infinite weight
///   and corrupt the image.
/// * `channels` - pixel layout (1, 3 or 4 interleaved channels).
/// * `threading_policy` - decides how many worker threads are used.
/// * `edge_mode` - how samples outside the image are resolved.
///
/// # Panics
///
/// Panics when `edge_mode` is `EdgeMode::KernelClip`, which this blur does not support.
pub fn fast_gaussian_next_u16(
    bytes: &mut [u16],
    stride: u32,
    width: u32,
    height: u32,
    radius: u32,
    channels: FastBlurChannels,
    threading_policy: ThreadingPolicy,
    edge_mode: EdgeMode,
) {
    let acq_radius = radius.clamp(1, 152);
    impl_margin_call!(
        u16,
        channels,
        edge_mode,
        bytes,
        stride,
        width,
        height,
        acq_radius,
        threading_policy
    );
}
/// Blurs an `f32` image in place. Rows are taken to be tightly packed: the row
/// stride is computed as `width * channels`.
///
/// # Arguments
///
/// * `bytes` - interleaved image data, modified in place.
/// * `width`, `height` - image size in pixels.
/// * `radius` - blur radius, limited to the range `1..=341`.
///   A radius of 0 is treated as 1, because the blur scales by `1 / radius^3`.
///   The upper limit comes from the 1024-slot ring buffer of the generic passes,
///   which has to hold a span of `3 * radius` samples.
/// * `channels` - pixel layout (1, 3 or 4 interleaved channels).
/// * `threading_policy` - decides how many worker threads are used.
/// * `edge_mode` - how samples outside the image are resolved.
///
/// # Panics
///
/// Panics when `edge_mode` is `EdgeMode::KernelClip`, which this blur does not support.
pub fn fast_gaussian_next_f32(
    bytes: &mut [f32],
    width: u32,
    height: u32,
    radius: u32,
    channels: FastBlurChannels,
    threading_policy: ThreadingPolicy,
    edge_mode: EdgeMode,
) {
    // Largest radius whose 3 * radius span still fits the 1024-slot ring buffer.
    const MAX_RADIUS: u32 = 341;
    let radius = radius.clamp(1, MAX_RADIUS);
    impl_margin_call!(
        f32,
        channels,
        edge_mode,
        bytes,
        width * channels.get_channels() as u32,
        width,
        height,
        radius,
        threading_policy
    );
}
pub fn fast_gaussian_next_f16(
bytes: &mut [u16],
width: u32,
height: u32,
radius: u32,
channels: FastBlurChannels,
threading_policy: ThreadingPolicy,
edge_mode: EdgeMode,
) {
impl_margin_call!(
half::f16,
channels,
edge_mode,
unsafe { std::mem::transmute(bytes) },
width * channels.get_channels() as u32,
width,
height,
radius,
threading_policy
);
}
pub fn fast_gaussian_next_in_linear(
in_place: &mut [u8],
stride: u32,
width: u32,
height: u32,
radius: u32,
channels: FastBlurChannels,
threading_policy: ThreadingPolicy,
transfer_function: TransferFunction,
edge_mode: EdgeMode,
) {
let mut linear_data: Vec<f32> =
vec![0f32; width as usize * height as usize * channels.get_channels()];
let forward_transformer = match channels {
FastBlurChannels::Plane => plane_to_linear,
FastBlurChannels::Channels3 => rgb_to_linear,
FastBlurChannels::Channels4 => rgba_to_linear,
};
let inverse_transformer = match channels {
FastBlurChannels::Plane => linear_to_plane,
FastBlurChannels::Channels3 => linear_to_rgb,
FastBlurChannels::Channels4 => linear_to_rgba,
};
forward_transformer(
&in_place,
stride,
&mut linear_data,
width * size_of::<f32>() as u32 * channels.get_channels() as u32,
width,
height,
transfer_function,
);
fast_gaussian_next_f32(
&mut linear_data,
width,
height,
radius,
channels,
threading_policy,
edge_mode,
);
inverse_transformer(
&linear_data,
width * size_of::<f32>() as u32 * channels.get_channels() as u32,
in_place,
stride,
width,
height,
transfer_function,
);
}