1#[cfg(all(target_arch = "aarch64", feature = "neon"))]
29use crate::stackblur::neon::{
30 HorizontalNeonStackBlurPassFloat32, VerticalNeonStackBlurPassFloat32,
31};
32#[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
33use crate::stackblur::sse::{HorizontalSseStackBlurPassFloat32, VerticalSseStackBlurPassFloat32};
34use crate::stackblur::*;
35use crate::unsafe_slice::UnsafeSlice;
36use crate::{AnisotropicRadius, BlurError, BlurImageMut, FastBlurChannels, ThreadingPolicy};
37
38fn stack_blur_worker_horizontal(
39 slice: &UnsafeSlice<f32>,
40 stride: u32,
41 width: u32,
42 height: u32,
43 radius: u32,
44 channels: FastBlurChannels,
45 thread: usize,
46 thread_count: usize,
47) {
48 fn pass<const N: usize>(
49 slice: &UnsafeSlice<f32>,
50 stride: u32,
51 width: u32,
52 height: u32,
53 radius: u32,
54 thread: usize,
55 thread_count: usize,
56 ) {
57 #[cfg(not(any(
58 all(target_arch = "aarch64", feature = "neon"),
59 any(target_arch = "x86_64", target_arch = "x86"),
60 )))]
61 fn select_blur_pass<const N: usize>() -> impl StackBlurWorkingPass<f32, N> {
62 HorizontalStackBlurPass::<f32, f32, f32, N>::default()
63 }
64 #[cfg(all(target_arch = "aarch64", feature = "neon"))]
65 fn select_blur_pass<const N: usize>() -> impl StackBlurWorkingPass<f32, N> {
66 HorizontalNeonStackBlurPassFloat32::<N>::default()
67 }
68 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
69 fn select_blur_pass<const N: usize>() -> Box<dyn StackBlurWorkingPass<f32, N>> {
70 #[cfg(all(target_arch = "x86_64", feature = "avx"))]
71 {
72 if std::arch::is_x86_feature_detected!("avx2")
73 && std::arch::is_x86_feature_detected!("fma")
74 {
75 use avx::HorizontalAvxStackBlurPassFloat32;
76 return Box::new(HorizontalAvxStackBlurPassFloat32::<N>::default());
77 }
78 }
79 #[cfg(feature = "sse")]
80 if std::arch::is_x86_feature_detected!("sse4.1") {
81 Box::new(HorizontalSseStackBlurPassFloat32::<N>::default())
82 } else {
83 Box::new(HorizontalStackBlurPass::<f32, f32, f32, N>::default())
84 }
85 #[cfg(not(feature = "sse"))]
86 Box::new(HorizontalStackBlurPass::<f32, f32, f32, N>::default())
87 }
88 let executor = select_blur_pass::<N>();
89 executor.pass(slice, stride, width, height, radius, thread, thread_count);
90 }
91 match channels {
92 FastBlurChannels::Plane => {
93 pass::<1>(slice, stride, width, height, radius, thread, thread_count);
94 }
95 FastBlurChannels::Channels3 => {
96 pass::<3>(slice, stride, width, height, radius, thread, thread_count);
97 }
98 FastBlurChannels::Channels4 => {
99 pass::<4>(slice, stride, width, height, radius, thread, thread_count);
100 }
101 }
102}
103
104#[allow(clippy::too_many_arguments)]
105fn stack_blur_worker_vertical(
106 slice: &UnsafeSlice<f32>,
107 stride: u32,
108 width: u32,
109 height: u32,
110 radius: u32,
111 channels: FastBlurChannels,
112 thread: usize,
113 thread_count: usize,
114) {
115 fn pass<const N: usize>(
116 slice: &UnsafeSlice<f32>,
117 stride: u32,
118 width: u32,
119 height: u32,
120 radius: u32,
121 thread: usize,
122 thread_count: usize,
123 ) {
124 #[cfg(not(any(
125 all(target_arch = "aarch64", feature = "neon"),
126 any(target_arch = "x86_64", target_arch = "x86"),
127 )))]
128 fn select_blur_pass<const N: usize>() -> impl StackBlurWorkingPass<f32, N> {
129 VerticalStackBlurPass::<f32, f32, f32, N>::default()
130 }
131 #[cfg(all(target_arch = "aarch64", feature = "neon"))]
132 fn select_blur_pass<const N: usize>() -> impl StackBlurWorkingPass<f32, N> {
133 VerticalNeonStackBlurPassFloat32::<N>::default()
134 }
135 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
136 fn select_blur_pass<const N: usize>() -> Box<dyn StackBlurWorkingPass<f32, N>> {
137 #[cfg(all(target_arch = "x86_64", feature = "avx"))]
138 {
139 if std::arch::is_x86_feature_detected!("avx2")
140 && std::arch::is_x86_feature_detected!("fma")
141 {
142 use avx::VerticalAvxStackBlurPassFloat32;
143 return Box::new(VerticalAvxStackBlurPassFloat32::<N>::default());
144 }
145 }
146 #[cfg(feature = "sse")]
147 if std::arch::is_x86_feature_detected!("sse4.1") {
148 Box::new(VerticalSseStackBlurPassFloat32::<N>::default())
149 } else {
150 Box::new(VerticalStackBlurPass::<f32, f32, f32, N>::default())
151 }
152 #[cfg(not(feature = "sse"))]
153 Box::new(VerticalStackBlurPass::<f32, f32, f32, N>::default())
154 }
155 let executor = select_blur_pass::<N>();
156 executor.pass(slice, stride, width, height, radius, thread, thread_count);
157 }
158 match channels {
159 FastBlurChannels::Plane => {
160 pass::<1>(slice, stride, width, height, radius, thread, thread_count);
161 }
162 FastBlurChannels::Channels3 => {
163 pass::<3>(slice, stride, width, height, radius, thread, thread_count);
164 }
165 FastBlurChannels::Channels4 => {
166 pass::<4>(slice, stride, width, height, radius, thread, thread_count);
167 }
168 }
169}
170
171pub fn stack_blur_f32(
183 image: &mut BlurImageMut<f32>,
184 radius: AnisotropicRadius,
185 threading_policy: ThreadingPolicy,
186) -> Result<(), BlurError> {
187 image.check_layout(None)?;
188 let radius = radius.max(1);
189 let thread_count = threading_policy.thread_count(image.width, image.height) as u32;
190 let stride = image.row_stride();
191 let width = image.width;
192 let height = image.height;
193 let channels = image.channels;
194 if thread_count == 1 {
195 let slice = UnsafeSlice::new(image.data.borrow_mut());
196 stack_blur_worker_horizontal(&slice, stride, width, height, radius.x_axis, channels, 0, 1);
197 stack_blur_worker_vertical(&slice, stride, width, height, radius.y_axis, channels, 0, 1);
198 return Ok(());
199 }
200 let pool = novtb::ThreadPool::new(thread_count as usize);
201 let slice = UnsafeSlice::new(image.data.borrow_mut());
202 pool.parallel_for(|thread_index| {
203 stack_blur_worker_horizontal(
204 &slice,
205 stride,
206 width,
207 height,
208 radius.x_axis,
209 channels,
210 thread_index,
211 thread_count as usize,
212 );
213 });
214 pool.parallel_for(|thread_index| {
215 stack_blur_worker_vertical(
216 &slice,
217 stride,
218 width,
219 height,
220 radius.y_axis,
221 channels,
222 thread_index,
223 thread_count as usize,
224 );
225 });
226 Ok(())
227}
228
#[cfg(test)]
mod tests {
    use super::*;

    /// Blurring an image filled with a single value must leave every sample
    /// within 1e-4 of that value.
    #[test]
    fn test_stack_blur_f32_q_k5() {
        const SIDE: usize = 148;
        const FILL: f32 = 0.32423;

        let mut dst = vec![FILL; SIDE * SIDE * 3];
        let mut dst_image = BlurImageMut::borrow(
            &mut dst,
            SIDE as u32,
            SIDE as u32,
            FastBlurChannels::Channels3,
        );

        stack_blur_f32(
            &mut dst_image,
            AnisotropicRadius::new(5),
            ThreadingPolicy::Single,
        )
        .unwrap();

        dst.iter().enumerate().for_each(|(i, &cn)| {
            let diff = (cn - FILL).abs();
            assert!(
                diff <= 1e-4,
                "Diff expected to be less than 1e-4 but it was {diff} at {i}"
            );
        });
    }
}
258}