libblur/filter1d/sse/
row_symm_approx_binter_uq0_7.rs

1/*
2 * // Copyright (c) Radzivon Bartoshyk. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::edge_mode::border_interpolate;
30use crate::filter1d::filter_scan::ScanPoint1d;
31use crate::filter1d::row_handler_small_approx::{RowsHolder, RowsHolderMut};
32use crate::img_size::ImageSize;
33use crate::BorderHandle;
34#[cfg(target_arch = "x86")]
35use std::arch::x86::*;
36#[cfg(target_arch = "x86_64")]
37use std::arch::x86_64::*;
38
39pub(crate) fn filter_row_sse_symm_u8_uq0_7_any<const N: usize>(
40    edge_mode: BorderHandle,
41    m_src: &RowsHolder<u8>,
42    m_dst: &mut RowsHolderMut<u8>,
43    image_size: ImageSize,
44    scanned_kernel: &[ScanPoint1d<i32>],
45) {
46    unsafe {
47        let mut shifted = scanned_kernel
48            .iter()
49            .map(|&x| ((x.weight) >> 8).min(i8::MAX as i32) as i8)
50            .collect::<Vec<_>>();
51        let mut sum: u32 = shifted.iter().map(|&x| x as u32).sum();
52        if sum > 128 {
53            let half = shifted.len() / 2;
54            while sum > 128 {
55                shifted[half] = shifted[half].saturating_sub(1);
56                sum -= 1;
57            }
58        } else if sum < 128 {
59            let half = shifted.len() / 2;
60            while sum < 128 {
61                shifted[half] = shifted[half].saturating_add(1);
62                sum += 1;
63            }
64        }
65        let unit = ExecutionUnit::<N>::default();
66        unit.pass(edge_mode, m_src, m_dst, image_size, &shifted);
67    }
68}
69
/// Field-less marker type that carries the channel count `N` as a const generic; the
/// SSE row pass is implemented as a method on it.
#[derive(Clone, Copy, Default)]
struct ExecutionUnit<const N: usize> {}
72
73impl<const N: usize> ExecutionUnit<N> {
74    #[target_feature(enable = "sse4.1")]
75    unsafe fn pass(
76        &self,
77        edge_mode: BorderHandle,
78        m_src: &RowsHolder<u8>,
79        m_dst: &mut RowsHolderMut<u8>,
80        image_size: ImageSize,
81        scanned_kernel: &[i8],
82    ) {
83        let width = image_size.width;
84
85        let v_prepared = scanned_kernel
86            .iter()
87            .map(|&x| {
88                let z = x.to_ne_bytes();
89                i32::from_ne_bytes([z[0], z[0], z[0], z[0]])
90            })
91            .collect::<Vec<_>>();
92
93        let length = scanned_kernel.len();
94        let half_len = length / 2;
95
96        let min_left = half_len.min(width);
97        let s_kernel = half_len as i64;
98
99        for (src, dst) in m_src.holder.iter().zip(m_dst.holder.iter_mut()) {
100            let dst = &mut **dst;
101            let mut f_cx = 0usize;
102
103            let coeff = *scanned_kernel.get_unchecked(half_len);
104
105            while f_cx < min_left {
106                for c in 0..N {
107                    let mx = f_cx as i64 - s_kernel;
108                    let mut k0: u16 = *src.get_unchecked(f_cx * N + c) as u16 * coeff as u16;
109
110                    for i in 0..half_len {
111                        let coeff = *scanned_kernel.get_unchecked(i);
112                        let rollback = length - i - 1;
113
114                        let src0 = border_interpolate!(
115                            src,
116                            edge_mode,
117                            i as i64 + mx,
118                            0,
119                            width as i64,
120                            N,
121                            c
122                        );
123                        let src1 = border_interpolate!(
124                            src,
125                            edge_mode,
126                            rollback as i64 + mx,
127                            0,
128                            width as i64,
129                            N,
130                            c
131                        );
132
133                        k0 += (src0 as u16 + src1 as u16) * coeff as u16;
134                    }
135
136                    *dst.get_unchecked_mut(f_cx * N + c) = ((k0 + (1 << 6)) >> 7).min(255) as u8;
137                }
138                f_cx += 1;
139            }
140
141            let mut m_cx = f_cx * N;
142
143            let s_half = half_len * N;
144            let m_right = width.saturating_sub(half_len);
145            let max_width = m_right * N;
146
147            let coeff = _mm_set1_epi32(*v_prepared.get_unchecked(half_len));
148            let rnd = _mm_set1_epi16(1 << 6);
149
150            while m_cx + 16 < max_width {
151                let cx = m_cx - s_half;
152                let shifted_src = src.get_unchecked(cx..);
153
154                let source =
155                    _mm_loadu_si128(shifted_src.get_unchecked(half_len * N..).as_ptr() as *const _);
156                let mut k0 = _mm_add_epi16(
157                    rnd,
158                    _mm_maddubs_epi16(_mm_unpacklo_epi8(source, _mm_setzero_si128()), coeff),
159                );
160                let mut k1 = _mm_add_epi16(
161                    rnd,
162                    _mm_maddubs_epi16(_mm_unpackhi_epi8(source, _mm_setzero_si128()), coeff),
163                );
164
165                for i in 0..half_len {
166                    let rollback = length - i - 1;
167                    let coeff = _mm_set1_epi32(*v_prepared.get_unchecked(i));
168                    let v_source0 =
169                        _mm_loadu_si128(shifted_src.get_unchecked((i * N)..).as_ptr() as *const _);
170                    let v_source1 = _mm_loadu_si128(
171                        shifted_src.get_unchecked((rollback * N)..).as_ptr() as *const _,
172                    );
173                    k0 = _mm_add_epi16(
174                        k0,
175                        _mm_maddubs_epi16(_mm_unpacklo_epi8(v_source0, v_source1), coeff),
176                    );
177                    k1 = _mm_add_epi16(
178                        k1,
179                        _mm_maddubs_epi16(_mm_unpackhi_epi8(v_source0, v_source1), coeff),
180                    );
181                }
182
183                k0 = _mm_srai_epi16::<7>(k0);
184                k1 = _mm_srai_epi16::<7>(k1);
185
186                let dst_ptr0 = dst.get_unchecked_mut(m_cx..).as_mut_ptr();
187                _mm_storeu_si128(dst_ptr0 as *mut __m128i, _mm_packus_epi16(k0, k1));
188                m_cx += 16;
189            }
190
191            while m_cx + 8 < max_width {
192                let cx = m_cx - s_half;
193                let shifted_src = src.get_unchecked(cx..);
194
195                let source =
196                    _mm_loadu_si64(shifted_src.get_unchecked(half_len * N..).as_ptr() as *const _);
197                let mut k0 = _mm_add_epi16(
198                    rnd,
199                    _mm_maddubs_epi16(_mm_unpacklo_epi8(source, _mm_setzero_si128()), coeff),
200                );
201
202                for i in 0..half_len {
203                    let rollback = length - i - 1;
204                    let coeff = _mm_set1_epi32(*v_prepared.get_unchecked(i));
205                    let v_source0 =
206                        _mm_loadu_si64(shifted_src.get_unchecked((i * N)..).as_ptr() as *const _);
207                    let v_source1 = _mm_loadu_si64(
208                        shifted_src.get_unchecked((rollback * N)..).as_ptr() as *const _,
209                    );
210                    k0 = _mm_add_epi16(
211                        k0,
212                        _mm_maddubs_epi16(_mm_unpacklo_epi8(v_source0, v_source1), coeff),
213                    );
214                }
215
216                k0 = _mm_srai_epi16::<7>(k0);
217
218                let dst_ptr0 = dst.get_unchecked_mut(m_cx..).as_mut_ptr();
219                _mm_storeu_si64(
220                    dst_ptr0 as *mut _,
221                    _mm_packus_epi16(k0, _mm_setzero_si128()),
222                );
223                m_cx += 8;
224            }
225
226            while m_cx + 4 < max_width {
227                let cx = m_cx - s_half;
228                let shifted_src = src.get_unchecked(cx..);
229
230                let source =
231                    _mm_loadu_si32(shifted_src.get_unchecked(half_len * N..).as_ptr() as *const _);
232                let mut k0 = _mm_add_epi16(
233                    rnd,
234                    _mm_maddubs_epi16(_mm_unpacklo_epi8(source, _mm_setzero_si128()), coeff),
235                );
236
237                for i in 0..half_len {
238                    let rollback = length - i - 1;
239                    let coeff = _mm_set1_epi32(*v_prepared.get_unchecked(i));
240                    let v_source0 =
241                        _mm_loadu_si32(shifted_src.get_unchecked((i * N)..).as_ptr() as *const _);
242                    let v_source1 = _mm_loadu_si32(
243                        shifted_src.get_unchecked((rollback * N)..).as_ptr() as *const _,
244                    );
245                    k0 = _mm_add_epi16(
246                        k0,
247                        _mm_maddubs_epi16(_mm_unpacklo_epi8(v_source0, v_source1), coeff),
248                    );
249                }
250
251                k0 = _mm_srai_epi16::<7>(k0);
252
253                let dst_ptr0 = dst.get_unchecked_mut(m_cx..).as_mut_ptr();
254                _mm_storeu_si32(
255                    dst_ptr0 as *mut _,
256                    _mm_packus_epi16(k0, _mm_setzero_si128()),
257                );
258                m_cx += 4;
259            }
260
261            let coeff = *scanned_kernel.get_unchecked(half_len);
262
263            for zx in m_cx..max_width {
264                let x = zx - s_half;
265
266                let shifted_src = src.get_unchecked(x..);
267                let mut k0 = *shifted_src.get_unchecked(half_len * N) as u16 * coeff as u16;
268
269                for i in 0..half_len {
270                    let coeff = *scanned_kernel.get_unchecked(i);
271                    let rollback = length - i - 1;
272
273                    k0 += (*shifted_src.get_unchecked(i * N) as u16
274                        + *shifted_src.get_unchecked(rollback * N) as u16)
275                        * coeff as u16;
276                }
277
278                *dst.get_unchecked_mut(zx) = ((k0 + (1 << 6)) >> 7).min(255) as u8;
279            }
280
281            f_cx = m_right;
282
283            while f_cx < width {
284                for c in 0..N {
285                    let mx = f_cx as i64 - s_kernel;
286                    let mut k0: u16 = *src.get_unchecked(f_cx * N + c) as u16 * coeff as u16;
287
288                    for i in 0..half_len {
289                        let coeff = *scanned_kernel.get_unchecked(i);
290                        let rollback = length - i - 1;
291
292                        let src0 = border_interpolate!(
293                            src,
294                            edge_mode,
295                            i as i64 + mx,
296                            0,
297                            width as i64,
298                            N,
299                            c
300                        );
301                        let src1 = border_interpolate!(
302                            src,
303                            edge_mode,
304                            rollback as i64 + mx,
305                            0,
306                            width as i64,
307                            N,
308                            c
309                        );
310
311                        k0 += (src0 as u16 + src1 as u16) * coeff as u16;
312                    }
313
314                    *dst.get_unchecked_mut(f_cx * N + c) = ((k0 + (1 << 6)) >> 7).min(255) as u8;
315                }
316                f_cx += 1;
317            }
318        }
319    }
320}