Skip to main content

libblur/filter1d/sse/
filter_column_approx.rs

1/*
2 * // Copyright (c) Radzivon Bartoshyk. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::filter1d::arena::Arena;
30use crate::filter1d::filter_scan::ScanPoint1d;
31use crate::filter1d::region::FilterRegion;
32use crate::filter1d::sse::utils::{
33    _mm_mul_add_epi8_by_epi16_x4, _mm_mul_epi8_by_epi16_x4, _mm_pack_epi32_x2_epi8,
34};
35use crate::filter1d::to_approx_storage::ToApproxStorage;
36use crate::img_size::ImageSize;
37use crate::sse::{
38    _mm_load_pack_x2, _mm_load_pack_x3, _mm_load_pack_x4, _mm_store_pack_x2, _mm_store_pack_x3,
39    _mm_store_pack_x4,
40};
41#[cfg(target_arch = "x86")]
42use std::arch::x86::*;
43#[cfg(target_arch = "x86_64")]
44use std::arch::x86_64::*;
45use std::ops::{Add, Mul};
46
47pub(crate) fn filter_column_sse_u8_i32_app(
48    arena: Arena,
49    arena_src: &[&[u8]],
50    dst: &mut [u8],
51    image_size: ImageSize,
52    filter_region: FilterRegion,
53    scanned_kernel: &[ScanPoint1d<i32>],
54) {
55    unsafe {
56        filter_column_sse_u8_i32_impl(
57            arena,
58            arena_src,
59            dst,
60            image_size,
61            filter_region,
62            scanned_kernel,
63        );
64    }
65}
66
67#[target_feature(enable = "sse4.1")]
68unsafe fn filter_column_sse_u8_i32_impl(
69    arena: Arena,
70    arena_src: &[&[u8]],
71    dst: &mut [u8],
72    image_size: ImageSize,
73    _: FilterRegion,
74    scanned_kernel: &[ScanPoint1d<i32>],
75) {
76    unsafe {
77        let image_width = image_size.width * arena.components;
78
79        let length = scanned_kernel.len();
80
81        let mut cx = 0usize;
82
83        let coeff = _mm_set1_epi16(scanned_kernel.get_unchecked(0).weight as i16);
84
85        while cx + 64 < image_width {
86            let v_src = arena_src.get_unchecked(0).get_unchecked(cx..);
87
88            let source = _mm_load_pack_x4(v_src.as_ptr());
89            let mut k0 = _mm_mul_epi8_by_epi16_x4(source.0, coeff);
90            let mut k1 = _mm_mul_epi8_by_epi16_x4(source.1, coeff);
91            let mut k2 = _mm_mul_epi8_by_epi16_x4(source.2, coeff);
92            let mut k3 = _mm_mul_epi8_by_epi16_x4(source.3, coeff);
93
94            for i in 1..length {
95                let coeff = _mm_set1_epi16(scanned_kernel.get_unchecked(i).weight as i16);
96                let v_source =
97                    _mm_load_pack_x4(arena_src.get_unchecked(i).get_unchecked(cx..).as_ptr());
98                k0 = _mm_mul_add_epi8_by_epi16_x4(k0, v_source.0, coeff);
99                k1 = _mm_mul_add_epi8_by_epi16_x4(k1, v_source.1, coeff);
100                k2 = _mm_mul_add_epi8_by_epi16_x4(k2, v_source.2, coeff);
101                k3 = _mm_mul_add_epi8_by_epi16_x4(k3, v_source.3, coeff);
102            }
103
104            let dst_ptr0 = dst.get_unchecked_mut(cx..).as_mut_ptr();
105            _mm_store_pack_x4(
106                dst_ptr0,
107                (
108                    _mm_pack_epi32_x2_epi8(k0),
109                    _mm_pack_epi32_x2_epi8(k1),
110                    _mm_pack_epi32_x2_epi8(k2),
111                    _mm_pack_epi32_x2_epi8(k3),
112                ),
113            );
114            cx += 64;
115        }
116
117        while cx + 48 < image_width {
118            let v_src = arena_src.get_unchecked(0).get_unchecked(cx..);
119
120            let source = _mm_load_pack_x3(v_src.as_ptr());
121            let mut k0 = _mm_mul_epi8_by_epi16_x4(source.0, coeff);
122            let mut k1 = _mm_mul_epi8_by_epi16_x4(source.1, coeff);
123            let mut k2 = _mm_mul_epi8_by_epi16_x4(source.2, coeff);
124
125            for i in 1..length {
126                let coeff = _mm_set1_epi16(scanned_kernel.get_unchecked(i).weight as i16);
127                let v_source =
128                    _mm_load_pack_x3(arena_src.get_unchecked(i).get_unchecked(cx..).as_ptr());
129                k0 = _mm_mul_add_epi8_by_epi16_x4(k0, v_source.0, coeff);
130                k1 = _mm_mul_add_epi8_by_epi16_x4(k1, v_source.1, coeff);
131                k2 = _mm_mul_add_epi8_by_epi16_x4(k2, v_source.2, coeff);
132            }
133
134            let dst_ptr0 = dst.get_unchecked_mut(cx..).as_mut_ptr();
135            _mm_store_pack_x3(
136                dst_ptr0,
137                (
138                    _mm_pack_epi32_x2_epi8(k0),
139                    _mm_pack_epi32_x2_epi8(k1),
140                    _mm_pack_epi32_x2_epi8(k2),
141                ),
142            );
143            cx += 48;
144        }
145
146        while cx + 32 < image_width {
147            let v_src = arena_src.get_unchecked(0).get_unchecked(cx..);
148
149            let source = _mm_load_pack_x2(v_src.as_ptr());
150            let mut k0 = _mm_mul_epi8_by_epi16_x4(source.0, coeff);
151            let mut k1 = _mm_mul_epi8_by_epi16_x4(source.1, coeff);
152
153            for i in 1..length {
154                let coeff = _mm_set1_epi16(scanned_kernel.get_unchecked(i).weight as i16);
155                let v_source =
156                    _mm_load_pack_x2(arena_src.get_unchecked(i).get_unchecked(cx..).as_ptr());
157                k0 = _mm_mul_add_epi8_by_epi16_x4(k0, v_source.0, coeff);
158                k1 = _mm_mul_add_epi8_by_epi16_x4(k1, v_source.1, coeff);
159            }
160
161            let dst_ptr0 = dst.get_unchecked_mut(cx..).as_mut_ptr();
162            _mm_store_pack_x2(
163                dst_ptr0,
164                (_mm_pack_epi32_x2_epi8(k0), _mm_pack_epi32_x2_epi8(k1)),
165            );
166            cx += 32;
167        }
168
169        while cx + 16 < image_width {
170            let v_src = arena_src.get_unchecked(0).get_unchecked(cx..);
171
172            let source = _mm_loadu_si128(v_src.as_ptr() as *const __m128i);
173            let mut k0 = _mm_mul_epi8_by_epi16_x4(source, coeff);
174
175            for i in 1..length {
176                let coeff = _mm_set1_epi16(scanned_kernel.get_unchecked(i).weight as i16);
177                let v_source = _mm_loadu_si128(
178                    arena_src.get_unchecked(i).get_unchecked(cx..).as_ptr() as *const __m128i,
179                );
180                k0 = _mm_mul_add_epi8_by_epi16_x4(k0, v_source, coeff);
181            }
182
183            let dst_ptr0 = dst.get_unchecked_mut(cx..).as_mut_ptr();
184            _mm_storeu_si128(dst_ptr0 as *mut __m128i, _mm_pack_epi32_x2_epi8(k0));
185            cx += 16;
186        }
187
188        let coeff = *scanned_kernel.get_unchecked(0);
189
190        while cx + 4 < image_width {
191            let v_src = arena_src.get_unchecked(0).get_unchecked(cx..);
192
193            let mut k0 = (*v_src.get_unchecked(0) as i32).mul(coeff.weight);
194            let mut k1 = (*v_src.get_unchecked(1) as i32).mul(coeff.weight);
195            let mut k2 = (*v_src.get_unchecked(2) as i32).mul(coeff.weight);
196            let mut k3 = (*v_src.get_unchecked(3) as i32).mul(coeff.weight);
197
198            for i in 1..length {
199                let coeff = *scanned_kernel.get_unchecked(i);
200                k0 = ((*arena_src.get_unchecked(i).get_unchecked(cx)) as i32)
201                    .mul(coeff.weight)
202                    .add(k0);
203                k1 = ((*arena_src.get_unchecked(i).get_unchecked(cx + 1)) as i32)
204                    .mul(coeff.weight)
205                    .add(k1);
206                k2 = ((*arena_src.get_unchecked(i).get_unchecked(cx + 2)) as i32)
207                    .mul(coeff.weight)
208                    .add(k2);
209                k3 = ((*arena_src.get_unchecked(i).get_unchecked(cx + 3)) as i32)
210                    .mul(coeff.weight)
211                    .add(k3);
212            }
213
214            *dst.get_unchecked_mut(cx) = k0.to_approx_();
215            *dst.get_unchecked_mut(cx + 1) = k1.to_approx_();
216            *dst.get_unchecked_mut(cx + 2) = k2.to_approx_();
217            *dst.get_unchecked_mut(cx + 3) = k3.to_approx_();
218            cx += 4;
219        }
220
221        for x in cx..image_width {
222            let v_src = arena_src.get_unchecked(0).get_unchecked(x..);
223
224            let mut k0 = ((*v_src.get_unchecked(0)) as i32).mul(coeff.weight);
225
226            for i in 1..length {
227                let coeff = *scanned_kernel.get_unchecked(i);
228                k0 = ((*arena_src.get_unchecked(i).get_unchecked(x)) as i32)
229                    .mul(coeff.weight)
230                    .add(k0);
231            }
232
233            *dst.get_unchecked_mut(x) = k0.to_approx_();
234        }
235    }
236}