Skip to main content

libblur/stackblur/
stack_blur_f32.rs

1// Copyright (c) Radzivon Bartoshyk. All rights reserved.
2//
3// Redistribution and use in source and binary forms, with or without modification,
4// are permitted provided that the following conditions are met:
5//
6// 1.  Redistributions of source code must retain the above copyright notice, this
7// list of conditions and the following disclaimer.
8//
9// 2.  Redistributions in binary form must reproduce the above copyright notice,
10// this list of conditions and the following disclaimer in the documentation
11// and/or other materials provided with the distribution.
12//
13// 3.  Neither the name of the copyright holder nor the names of its
14// contributors may be used to endorse or promote products derived from
15// this software without specific prior written permission.
16//
17// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
21// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
23// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
25// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#[cfg(all(target_arch = "aarch64", feature = "neon"))]
29use crate::stackblur::neon::{
30    HorizontalNeonStackBlurPassFloat32, VerticalNeonStackBlurPassFloat32,
31};
32#[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "sse"))]
33use crate::stackblur::sse::{HorizontalSseStackBlurPassFloat32, VerticalSseStackBlurPassFloat32};
34use crate::stackblur::*;
35use crate::unsafe_slice::UnsafeSlice;
36use crate::{AnisotropicRadius, BlurError, BlurImageMut, FastBlurChannels, ThreadingPolicy};
37
38fn stack_blur_worker_horizontal(
39    slice: &UnsafeSlice<f32>,
40    stride: u32,
41    width: u32,
42    height: u32,
43    radius: u32,
44    channels: FastBlurChannels,
45    thread: usize,
46    thread_count: usize,
47) {
48    fn pass<const N: usize>(
49        slice: &UnsafeSlice<f32>,
50        stride: u32,
51        width: u32,
52        height: u32,
53        radius: u32,
54        thread: usize,
55        thread_count: usize,
56    ) {
57        #[cfg(not(any(
58            all(target_arch = "aarch64", feature = "neon"),
59            any(target_arch = "x86_64", target_arch = "x86"),
60        )))]
61        fn select_blur_pass<const N: usize>() -> impl StackBlurWorkingPass<f32, N> {
62            HorizontalStackBlurPass::<f32, f32, f32, N>::default()
63        }
64        #[cfg(all(target_arch = "aarch64", feature = "neon"))]
65        fn select_blur_pass<const N: usize>() -> impl StackBlurWorkingPass<f32, N> {
66            HorizontalNeonStackBlurPassFloat32::<N>::default()
67        }
68        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
69        fn select_blur_pass<const N: usize>() -> Box<dyn StackBlurWorkingPass<f32, N>> {
70            #[cfg(all(target_arch = "x86_64", feature = "avx"))]
71            {
72                if std::arch::is_x86_feature_detected!("avx2")
73                    && std::arch::is_x86_feature_detected!("fma")
74                {
75                    use avx::HorizontalAvxStackBlurPassFloat32;
76                    return Box::new(HorizontalAvxStackBlurPassFloat32::<N>::default());
77                }
78            }
79            #[cfg(feature = "sse")]
80            if std::arch::is_x86_feature_detected!("sse4.1") {
81                Box::new(HorizontalSseStackBlurPassFloat32::<N>::default())
82            } else {
83                Box::new(HorizontalStackBlurPass::<f32, f32, f32, N>::default())
84            }
85            #[cfg(not(feature = "sse"))]
86            Box::new(HorizontalStackBlurPass::<f32, f32, f32, N>::default())
87        }
88        let executor = select_blur_pass::<N>();
89        executor.pass(slice, stride, width, height, radius, thread, thread_count);
90    }
91    match channels {
92        FastBlurChannels::Plane => {
93            pass::<1>(slice, stride, width, height, radius, thread, thread_count);
94        }
95        FastBlurChannels::Channels3 => {
96            pass::<3>(slice, stride, width, height, radius, thread, thread_count);
97        }
98        FastBlurChannels::Channels4 => {
99            pass::<4>(slice, stride, width, height, radius, thread, thread_count);
100        }
101    }
102}
103
104#[allow(clippy::too_many_arguments)]
105fn stack_blur_worker_vertical(
106    slice: &UnsafeSlice<f32>,
107    stride: u32,
108    width: u32,
109    height: u32,
110    radius: u32,
111    channels: FastBlurChannels,
112    thread: usize,
113    thread_count: usize,
114) {
115    fn pass<const N: usize>(
116        slice: &UnsafeSlice<f32>,
117        stride: u32,
118        width: u32,
119        height: u32,
120        radius: u32,
121        thread: usize,
122        thread_count: usize,
123    ) {
124        #[cfg(not(any(
125            all(target_arch = "aarch64", feature = "neon"),
126            any(target_arch = "x86_64", target_arch = "x86"),
127        )))]
128        fn select_blur_pass<const N: usize>() -> impl StackBlurWorkingPass<f32, N> {
129            VerticalStackBlurPass::<f32, f32, f32, N>::default()
130        }
131        #[cfg(all(target_arch = "aarch64", feature = "neon"))]
132        fn select_blur_pass<const N: usize>() -> impl StackBlurWorkingPass<f32, N> {
133            VerticalNeonStackBlurPassFloat32::<N>::default()
134        }
135        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
136        fn select_blur_pass<const N: usize>() -> Box<dyn StackBlurWorkingPass<f32, N>> {
137            #[cfg(all(target_arch = "x86_64", feature = "avx"))]
138            {
139                if std::arch::is_x86_feature_detected!("avx2")
140                    && std::arch::is_x86_feature_detected!("fma")
141                {
142                    use avx::VerticalAvxStackBlurPassFloat32;
143                    return Box::new(VerticalAvxStackBlurPassFloat32::<N>::default());
144                }
145            }
146            #[cfg(feature = "sse")]
147            if std::arch::is_x86_feature_detected!("sse4.1") {
148                Box::new(VerticalSseStackBlurPassFloat32::<N>::default())
149            } else {
150                Box::new(VerticalStackBlurPass::<f32, f32, f32, N>::default())
151            }
152            #[cfg(not(feature = "sse"))]
153            Box::new(VerticalStackBlurPass::<f32, f32, f32, N>::default())
154        }
155        let executor = select_blur_pass::<N>();
156        executor.pass(slice, stride, width, height, radius, thread, thread_count);
157    }
158    match channels {
159        FastBlurChannels::Plane => {
160            pass::<1>(slice, stride, width, height, radius, thread, thread_count);
161        }
162        FastBlurChannels::Channels3 => {
163            pass::<3>(slice, stride, width, height, radius, thread, thread_count);
164        }
165        FastBlurChannels::Channels4 => {
166            pass::<4>(slice, stride, width, height, radius, thread, thread_count);
167        }
168    }
169}
170
171/// Fastest available blur option in f32, values may be denormalized, or normalized
172///
173/// Fast gaussian approximation using stack blur.
174///
175/// # Arguments
176/// * `image` - mutable buffer contains image data that will be used as a source and destination.
177/// * `radius` - radius almost is not limited, minimum is one.
178/// * `threading_policy` - Threads usage policy
179///
180/// # Complexity
181/// O(1) complexity.
182pub fn stack_blur_f32(
183    image: &mut BlurImageMut<f32>,
184    radius: AnisotropicRadius,
185    threading_policy: ThreadingPolicy,
186) -> Result<(), BlurError> {
187    image.check_layout(None)?;
188    let radius = radius.max(1);
189    let thread_count = threading_policy.thread_count(image.width, image.height) as u32;
190    let stride = image.row_stride();
191    let width = image.width;
192    let height = image.height;
193    let channels = image.channels;
194    if thread_count == 1 {
195        let slice = UnsafeSlice::new(image.data.borrow_mut());
196        stack_blur_worker_horizontal(&slice, stride, width, height, radius.x_axis, channels, 0, 1);
197        stack_blur_worker_vertical(&slice, stride, width, height, radius.y_axis, channels, 0, 1);
198        return Ok(());
199    }
200    let pool = novtb::ThreadPool::new(thread_count as usize);
201    let slice = UnsafeSlice::new(image.data.borrow_mut());
202    pool.parallel_for(|thread_index| {
203        stack_blur_worker_horizontal(
204            &slice,
205            stride,
206            width,
207            height,
208            radius.x_axis,
209            channels,
210            thread_index,
211            thread_count as usize,
212        );
213    });
214    pool.parallel_for(|thread_index| {
215        stack_blur_worker_vertical(
216            &slice,
217            stride,
218            width,
219            height,
220            radius.y_axis,
221            channels,
222            thread_index,
223            thread_count as usize,
224        );
225    });
226    Ok(())
227}
228
#[cfg(test)]
mod tests {
    use super::*;

    /// Blurring an image filled with one constant value must leave every sample
    /// within `1e-4` of that value.
    #[test]
    fn test_stack_blur_f32_q_k5() {
        const SIDE: usize = 148;
        const FILL: f32 = 0.32423;

        let mut dst = vec![FILL; SIDE * SIDE * 3];
        let mut dst_image = BlurImageMut::borrow(
            &mut dst,
            SIDE as u32,
            SIDE as u32,
            FastBlurChannels::Channels3,
        );
        let outcome = stack_blur_f32(
            &mut dst_image,
            AnisotropicRadius::new(5),
            ThreadingPolicy::Single,
        );
        outcome.unwrap();

        dst.iter().enumerate().for_each(|(i, &cn)| {
            let diff = (cn - FILL).abs();
            assert!(
                diff <= 1e-4,
                "Diff expected to be less than 1e-4 but it was {diff} at {i}"
            );
        });
    }
}