libblur/filter1d/sse/
filter_column_approx.rs1use crate::filter1d::arena::Arena;
30use crate::filter1d::filter_scan::ScanPoint1d;
31use crate::filter1d::region::FilterRegion;
32use crate::filter1d::sse::utils::{
33 _mm_mul_add_epi8_by_epi16_x4, _mm_mul_epi8_by_epi16_x4, _mm_pack_epi32_x2_epi8,
34};
35use crate::filter1d::to_approx_storage::ToApproxStorage;
36use crate::img_size::ImageSize;
37use crate::sse::{
38 _mm_load_pack_x2, _mm_load_pack_x3, _mm_load_pack_x4, _mm_store_pack_x2, _mm_store_pack_x3,
39 _mm_store_pack_x4,
40};
41#[cfg(target_arch = "x86")]
42use std::arch::x86::*;
43#[cfg(target_arch = "x86_64")]
44use std::arch::x86_64::*;
45use std::ops::{Add, Mul};
46
47pub(crate) fn filter_column_sse_u8_i32_app(
48 arena: Arena,
49 arena_src: &[&[u8]],
50 dst: &mut [u8],
51 image_size: ImageSize,
52 filter_region: FilterRegion,
53 scanned_kernel: &[ScanPoint1d<i32>],
54) {
55 unsafe {
56 filter_column_sse_u8_i32_impl(
57 arena,
58 arena_src,
59 dst,
60 image_size,
61 filter_region,
62 scanned_kernel,
63 );
64 }
65}
66
67#[target_feature(enable = "sse4.1")]
68unsafe fn filter_column_sse_u8_i32_impl(
69 arena: Arena,
70 arena_src: &[&[u8]],
71 dst: &mut [u8],
72 image_size: ImageSize,
73 _: FilterRegion,
74 scanned_kernel: &[ScanPoint1d<i32>],
75) {
76 unsafe {
77 let image_width = image_size.width * arena.components;
78
79 let length = scanned_kernel.len();
80
81 let mut cx = 0usize;
82
83 let coeff = _mm_set1_epi16(scanned_kernel.get_unchecked(0).weight as i16);
84
85 while cx + 64 < image_width {
86 let v_src = arena_src.get_unchecked(0).get_unchecked(cx..);
87
88 let source = _mm_load_pack_x4(v_src.as_ptr());
89 let mut k0 = _mm_mul_epi8_by_epi16_x4(source.0, coeff);
90 let mut k1 = _mm_mul_epi8_by_epi16_x4(source.1, coeff);
91 let mut k2 = _mm_mul_epi8_by_epi16_x4(source.2, coeff);
92 let mut k3 = _mm_mul_epi8_by_epi16_x4(source.3, coeff);
93
94 for i in 1..length {
95 let coeff = _mm_set1_epi16(scanned_kernel.get_unchecked(i).weight as i16);
96 let v_source =
97 _mm_load_pack_x4(arena_src.get_unchecked(i).get_unchecked(cx..).as_ptr());
98 k0 = _mm_mul_add_epi8_by_epi16_x4(k0, v_source.0, coeff);
99 k1 = _mm_mul_add_epi8_by_epi16_x4(k1, v_source.1, coeff);
100 k2 = _mm_mul_add_epi8_by_epi16_x4(k2, v_source.2, coeff);
101 k3 = _mm_mul_add_epi8_by_epi16_x4(k3, v_source.3, coeff);
102 }
103
104 let dst_ptr0 = dst.get_unchecked_mut(cx..).as_mut_ptr();
105 _mm_store_pack_x4(
106 dst_ptr0,
107 (
108 _mm_pack_epi32_x2_epi8(k0),
109 _mm_pack_epi32_x2_epi8(k1),
110 _mm_pack_epi32_x2_epi8(k2),
111 _mm_pack_epi32_x2_epi8(k3),
112 ),
113 );
114 cx += 64;
115 }
116
117 while cx + 48 < image_width {
118 let v_src = arena_src.get_unchecked(0).get_unchecked(cx..);
119
120 let source = _mm_load_pack_x3(v_src.as_ptr());
121 let mut k0 = _mm_mul_epi8_by_epi16_x4(source.0, coeff);
122 let mut k1 = _mm_mul_epi8_by_epi16_x4(source.1, coeff);
123 let mut k2 = _mm_mul_epi8_by_epi16_x4(source.2, coeff);
124
125 for i in 1..length {
126 let coeff = _mm_set1_epi16(scanned_kernel.get_unchecked(i).weight as i16);
127 let v_source =
128 _mm_load_pack_x3(arena_src.get_unchecked(i).get_unchecked(cx..).as_ptr());
129 k0 = _mm_mul_add_epi8_by_epi16_x4(k0, v_source.0, coeff);
130 k1 = _mm_mul_add_epi8_by_epi16_x4(k1, v_source.1, coeff);
131 k2 = _mm_mul_add_epi8_by_epi16_x4(k2, v_source.2, coeff);
132 }
133
134 let dst_ptr0 = dst.get_unchecked_mut(cx..).as_mut_ptr();
135 _mm_store_pack_x3(
136 dst_ptr0,
137 (
138 _mm_pack_epi32_x2_epi8(k0),
139 _mm_pack_epi32_x2_epi8(k1),
140 _mm_pack_epi32_x2_epi8(k2),
141 ),
142 );
143 cx += 48;
144 }
145
146 while cx + 32 < image_width {
147 let v_src = arena_src.get_unchecked(0).get_unchecked(cx..);
148
149 let source = _mm_load_pack_x2(v_src.as_ptr());
150 let mut k0 = _mm_mul_epi8_by_epi16_x4(source.0, coeff);
151 let mut k1 = _mm_mul_epi8_by_epi16_x4(source.1, coeff);
152
153 for i in 1..length {
154 let coeff = _mm_set1_epi16(scanned_kernel.get_unchecked(i).weight as i16);
155 let v_source =
156 _mm_load_pack_x2(arena_src.get_unchecked(i).get_unchecked(cx..).as_ptr());
157 k0 = _mm_mul_add_epi8_by_epi16_x4(k0, v_source.0, coeff);
158 k1 = _mm_mul_add_epi8_by_epi16_x4(k1, v_source.1, coeff);
159 }
160
161 let dst_ptr0 = dst.get_unchecked_mut(cx..).as_mut_ptr();
162 _mm_store_pack_x2(
163 dst_ptr0,
164 (_mm_pack_epi32_x2_epi8(k0), _mm_pack_epi32_x2_epi8(k1)),
165 );
166 cx += 32;
167 }
168
169 while cx + 16 < image_width {
170 let v_src = arena_src.get_unchecked(0).get_unchecked(cx..);
171
172 let source = _mm_loadu_si128(v_src.as_ptr() as *const __m128i);
173 let mut k0 = _mm_mul_epi8_by_epi16_x4(source, coeff);
174
175 for i in 1..length {
176 let coeff = _mm_set1_epi16(scanned_kernel.get_unchecked(i).weight as i16);
177 let v_source = _mm_loadu_si128(
178 arena_src.get_unchecked(i).get_unchecked(cx..).as_ptr() as *const __m128i,
179 );
180 k0 = _mm_mul_add_epi8_by_epi16_x4(k0, v_source, coeff);
181 }
182
183 let dst_ptr0 = dst.get_unchecked_mut(cx..).as_mut_ptr();
184 _mm_storeu_si128(dst_ptr0 as *mut __m128i, _mm_pack_epi32_x2_epi8(k0));
185 cx += 16;
186 }
187
188 let coeff = *scanned_kernel.get_unchecked(0);
189
190 while cx + 4 < image_width {
191 let v_src = arena_src.get_unchecked(0).get_unchecked(cx..);
192
193 let mut k0 = (*v_src.get_unchecked(0) as i32).mul(coeff.weight);
194 let mut k1 = (*v_src.get_unchecked(1) as i32).mul(coeff.weight);
195 let mut k2 = (*v_src.get_unchecked(2) as i32).mul(coeff.weight);
196 let mut k3 = (*v_src.get_unchecked(3) as i32).mul(coeff.weight);
197
198 for i in 1..length {
199 let coeff = *scanned_kernel.get_unchecked(i);
200 k0 = ((*arena_src.get_unchecked(i).get_unchecked(cx)) as i32)
201 .mul(coeff.weight)
202 .add(k0);
203 k1 = ((*arena_src.get_unchecked(i).get_unchecked(cx + 1)) as i32)
204 .mul(coeff.weight)
205 .add(k1);
206 k2 = ((*arena_src.get_unchecked(i).get_unchecked(cx + 2)) as i32)
207 .mul(coeff.weight)
208 .add(k2);
209 k3 = ((*arena_src.get_unchecked(i).get_unchecked(cx + 3)) as i32)
210 .mul(coeff.weight)
211 .add(k3);
212 }
213
214 *dst.get_unchecked_mut(cx) = k0.to_approx_();
215 *dst.get_unchecked_mut(cx + 1) = k1.to_approx_();
216 *dst.get_unchecked_mut(cx + 2) = k2.to_approx_();
217 *dst.get_unchecked_mut(cx + 3) = k3.to_approx_();
218 cx += 4;
219 }
220
221 for x in cx..image_width {
222 let v_src = arena_src.get_unchecked(0).get_unchecked(x..);
223
224 let mut k0 = ((*v_src.get_unchecked(0)) as i32).mul(coeff.weight);
225
226 for i in 1..length {
227 let coeff = *scanned_kernel.get_unchecked(i);
228 k0 = ((*arena_src.get_unchecked(i).get_unchecked(x)) as i32)
229 .mul(coeff.weight)
230 .add(k0);
231 }
232
233 *dst.get_unchecked_mut(x) = k0.to_approx_();
234 }
235 }
236}