1use crate::edge_mode::border_interpolate;
30use crate::filter1d::filter_scan::ScanPoint1d;
31use crate::filter1d::row_handler_small_approx::{RowsHolder, RowsHolderMut};
32use crate::img_size::ImageSize;
33use crate::BorderHandle;
34#[cfg(target_arch = "x86")]
35use std::arch::x86::*;
36#[cfg(target_arch = "x86_64")]
37use std::arch::x86_64::*;
38
39pub(crate) fn filter_row_sse_symm_u8_uq0_7_any<const N: usize>(
40 edge_mode: BorderHandle,
41 m_src: &RowsHolder<u8>,
42 m_dst: &mut RowsHolderMut<u8>,
43 image_size: ImageSize,
44 scanned_kernel: &[ScanPoint1d<i32>],
45) {
46 unsafe {
47 let mut shifted = scanned_kernel
48 .iter()
49 .map(|&x| ((x.weight) >> 8).min(i8::MAX as i32) as i8)
50 .collect::<Vec<_>>();
51 let mut sum: u32 = shifted.iter().map(|&x| x as u32).sum();
52 if sum > 128 {
53 let half = shifted.len() / 2;
54 while sum > 128 {
55 shifted[half] = shifted[half].saturating_sub(1);
56 sum -= 1;
57 }
58 } else if sum < 128 {
59 let half = shifted.len() / 2;
60 while sum < 128 {
61 shifted[half] = shifted[half].saturating_add(1);
62 sum += 1;
63 }
64 }
65 let unit = ExecutionUnit::<N>::default();
66 unit.pass(edge_mode, m_src, m_dst, image_size, &shifted);
67 }
68}
69
/// Zero-sized dispatch type whose only purpose is to host the
/// `#[target_feature(enable = "sse4.1")]` filtering pass; the const
/// parameter `N` is the number of interleaved channels per pixel.
#[derive(Clone, Copy, Default)]
struct ExecutionUnit<const N: usize> {}
72
impl<const N: usize> ExecutionUnit<N> {
    /// Symmetric Q0.7 horizontal convolution of `u8` rows with SSE4.1.
    ///
    /// Per row the work splits into four phases:
    /// 1. scalar left border, edges resolved via `border_interpolate!`;
    /// 2. vectorized interior, processed in 16-, 8- and 4-lane strides;
    /// 3. scalar remainder of the interior (window fully in-bounds);
    /// 4. scalar right border, again via `border_interpolate!`.
    ///
    /// `scanned_kernel` holds signed 8-bit Q0.7 taps; the caller normalizes
    /// them to sum to 128, which bounds the widened 16-bit accumulators so
    /// they cannot overflow, and the result is recovered with a rounding
    /// shift by 7.
    ///
    /// # Safety
    /// - Must only be called on CPUs supporting SSE4.1.
    /// - Uses unchecked indexing throughout: assumes each row in
    ///   `m_src`/`m_dst` is at least `width * N` elements long and that
    ///   `scanned_kernel` is non-empty — TODO confirm callers uphold this.
    #[target_feature(enable = "sse4.1")]
    unsafe fn pass(
        &self,
        edge_mode: BorderHandle,
        m_src: &RowsHolder<u8>,
        m_dst: &mut RowsHolderMut<u8>,
        image_size: ImageSize,
        scanned_kernel: &[i8],
    ) {
        let width = image_size.width;

        // Broadcast each i8 tap into all four bytes of an i32 so that
        // `_mm_set1_epi32` later fills every byte lane of a register with
        // that tap, as `_mm_maddubs_epi16` multiplies per byte pair.
        let v_prepared = scanned_kernel
            .iter()
            .map(|&x| {
                let z = x.to_ne_bytes();
                i32::from_ne_bytes([z[0], z[0], z[0], z[0]])
            })
            .collect::<Vec<_>>();

        let length = scanned_kernel.len();
        let half_len = length / 2;

        // Number of leading pixels whose kernel window crosses the left edge.
        let min_left = half_len.min(width);
        let s_kernel = half_len as i64;

        for (src, dst) in m_src.holder.iter().zip(m_dst.holder.iter_mut()) {
            let dst = &mut **dst;
            let mut f_cx = 0usize;

            // Central tap, applied once per output sample.
            let coeff = *scanned_kernel.get_unchecked(half_len);

            // Phase 1: left border, scalar with border interpolation.
            while f_cx < min_left {
                for c in 0..N {
                    // Leftmost source pixel of the window (may be negative).
                    let mx = f_cx as i64 - s_kernel;
                    let mut k0: u16 = *src.get_unchecked(f_cx * N + c) as u16 * coeff as u16;

                    // Exploit kernel symmetry: tap `i` and its mirror
                    // `rollback` share the same weight.
                    for i in 0..half_len {
                        let coeff = *scanned_kernel.get_unchecked(i);
                        let rollback = length - i - 1;

                        let src0 = border_interpolate!(
                            src,
                            edge_mode,
                            i as i64 + mx,
                            0,
                            width as i64,
                            N,
                            c
                        );
                        let src1 = border_interpolate!(
                            src,
                            edge_mode,
                            rollback as i64 + mx,
                            0,
                            width as i64,
                            N,
                            c
                        );

                        k0 += (src0 as u16 + src1 as u16) * coeff as u16;
                    }

                    // Rounding shift back from Q0.7 to integer pixels.
                    *dst.get_unchecked_mut(f_cx * N + c) = ((k0 + (1 << 6)) >> 7).min(255) as u8;
                }
                f_cx += 1;
            }

            // Flat element index (pixels * channels) for the interior phases.
            let mut m_cx = f_cx * N;

            let s_half = half_len * N;
            // First pixel whose window crosses the right edge.
            let m_right = width.saturating_sub(half_len);
            let max_width = m_right * N;

            let coeff = _mm_set1_epi32(*v_prepared.get_unchecked(half_len));
            // Rounding bias (0.5 in Q0.7), folded into the accumulator once.
            let rnd = _mm_set1_epi16(1 << 6);

            // Phase 2a: 16 elements per iteration. Strict `<`, so an
            // exact-fit tail falls through to the narrower strides below.
            while m_cx + 16 < max_width {
                // Window start in flat elements; safe since m_cx >= s_half here.
                let cx = m_cx - s_half;
                let shifted_src = src.get_unchecked(cx..);

                // Center tap: interleaving with zeros makes maddubs compute
                // src * coeff per 16-bit lane (the zero partner contributes 0).
                let source =
                    _mm_loadu_si128(shifted_src.get_unchecked(half_len * N..).as_ptr() as *const _);
                let mut k0 = _mm_add_epi16(
                    rnd,
                    _mm_maddubs_epi16(_mm_unpacklo_epi8(source, _mm_setzero_si128()), coeff),
                );
                let mut k1 = _mm_add_epi16(
                    rnd,
                    _mm_maddubs_epi16(_mm_unpackhi_epi8(source, _mm_setzero_si128()), coeff),
                );

                // Symmetric taps: interleaving the left sample with its
                // mirrored right sample lets one maddubs compute
                // (s0 + s1) * coeff per 16-bit lane.
                for i in 0..half_len {
                    let rollback = length - i - 1;
                    let coeff = _mm_set1_epi32(*v_prepared.get_unchecked(i));
                    let v_source0 =
                        _mm_loadu_si128(shifted_src.get_unchecked((i * N)..).as_ptr() as *const _);
                    let v_source1 = _mm_loadu_si128(
                        shifted_src.get_unchecked((rollback * N)..).as_ptr() as *const _,
                    );
                    k0 = _mm_add_epi16(
                        k0,
                        _mm_maddubs_epi16(_mm_unpacklo_epi8(v_source0, v_source1), coeff),
                    );
                    k1 = _mm_add_epi16(
                        k1,
                        _mm_maddubs_epi16(_mm_unpackhi_epi8(v_source0, v_source1), coeff),
                    );
                }

                // Q0.7 -> integer (bias already added above).
                k0 = _mm_srai_epi16::<7>(k0);
                k1 = _mm_srai_epi16::<7>(k1);

                let dst_ptr0 = dst.get_unchecked_mut(m_cx..).as_mut_ptr();
                _mm_storeu_si128(dst_ptr0 as *mut __m128i, _mm_packus_epi16(k0, k1));
                m_cx += 16;
            }

            // Phase 2b: 8 elements per iteration (low half of the register).
            while m_cx + 8 < max_width {
                let cx = m_cx - s_half;
                let shifted_src = src.get_unchecked(cx..);

                let source =
                    _mm_loadu_si64(shifted_src.get_unchecked(half_len * N..).as_ptr() as *const _);
                let mut k0 = _mm_add_epi16(
                    rnd,
                    _mm_maddubs_epi16(_mm_unpacklo_epi8(source, _mm_setzero_si128()), coeff),
                );

                for i in 0..half_len {
                    let rollback = length - i - 1;
                    let coeff = _mm_set1_epi32(*v_prepared.get_unchecked(i));
                    let v_source0 =
                        _mm_loadu_si64(shifted_src.get_unchecked((i * N)..).as_ptr() as *const _);
                    let v_source1 = _mm_loadu_si64(
                        shifted_src.get_unchecked((rollback * N)..).as_ptr() as *const _,
                    );
                    k0 = _mm_add_epi16(
                        k0,
                        _mm_maddubs_epi16(_mm_unpacklo_epi8(v_source0, v_source1), coeff),
                    );
                }

                k0 = _mm_srai_epi16::<7>(k0);

                let dst_ptr0 = dst.get_unchecked_mut(m_cx..).as_mut_ptr();
                _mm_storeu_si64(
                    dst_ptr0 as *mut _,
                    _mm_packus_epi16(k0, _mm_setzero_si128()),
                );
                m_cx += 8;
            }

            // Phase 2c: 4 elements per iteration.
            while m_cx + 4 < max_width {
                let cx = m_cx - s_half;
                let shifted_src = src.get_unchecked(cx..);

                let source =
                    _mm_loadu_si32(shifted_src.get_unchecked(half_len * N..).as_ptr() as *const _);
                let mut k0 = _mm_add_epi16(
                    rnd,
                    _mm_maddubs_epi16(_mm_unpacklo_epi8(source, _mm_setzero_si128()), coeff),
                );

                for i in 0..half_len {
                    let rollback = length - i - 1;
                    let coeff = _mm_set1_epi32(*v_prepared.get_unchecked(i));
                    let v_source0 =
                        _mm_loadu_si32(shifted_src.get_unchecked((i * N)..).as_ptr() as *const _);
                    let v_source1 = _mm_loadu_si32(
                        shifted_src.get_unchecked((rollback * N)..).as_ptr() as *const _,
                    );
                    k0 = _mm_add_epi16(
                        k0,
                        _mm_maddubs_epi16(_mm_unpacklo_epi8(v_source0, v_source1), coeff),
                    );
                }

                k0 = _mm_srai_epi16::<7>(k0);

                let dst_ptr0 = dst.get_unchecked_mut(m_cx..).as_mut_ptr();
                _mm_storeu_si32(
                    dst_ptr0 as *mut _,
                    _mm_packus_epi16(k0, _mm_setzero_si128()),
                );
                m_cx += 4;
            }

            // Back to scalar: re-read the center tap as a plain i8.
            let coeff = *scanned_kernel.get_unchecked(half_len);

            // Phase 3: scalar remainder of the interior. `zx` is a flat
            // element index; no border handling needed — the window is
            // guaranteed in-bounds for zx < max_width.
            for zx in m_cx..max_width {
                let x = zx - s_half;

                let shifted_src = src.get_unchecked(x..);
                let mut k0 = *shifted_src.get_unchecked(half_len * N) as u16 * coeff as u16;

                for i in 0..half_len {
                    let coeff = *scanned_kernel.get_unchecked(i);
                    let rollback = length - i - 1;

                    k0 += (*shifted_src.get_unchecked(i * N) as u16
                        + *shifted_src.get_unchecked(rollback * N) as u16)
                        * coeff as u16;
                }

                *dst.get_unchecked_mut(zx) = ((k0 + (1 << 6)) >> 7).min(255) as u8;
            }

            // Phase 4: right border, scalar with border interpolation —
            // mirror image of phase 1, starting at the first pixel whose
            // window crosses the right edge.
            f_cx = m_right;

            while f_cx < width {
                for c in 0..N {
                    let mx = f_cx as i64 - s_kernel;
                    let mut k0: u16 = *src.get_unchecked(f_cx * N + c) as u16 * coeff as u16;

                    for i in 0..half_len {
                        let coeff = *scanned_kernel.get_unchecked(i);
                        let rollback = length - i - 1;

                        let src0 = border_interpolate!(
                            src,
                            edge_mode,
                            i as i64 + mx,
                            0,
                            width as i64,
                            N,
                            c
                        );
                        let src1 = border_interpolate!(
                            src,
                            edge_mode,
                            rollback as i64 + mx,
                            0,
                            width as i64,
                            N,
                            c
                        );

                        k0 += (src0 as u16 + src1 as u16) * coeff as u16;
                    }

                    *dst.get_unchecked_mut(f_cx * N + c) = ((k0 + (1 << 6)) >> 7).min(255) as u8;
                }
                f_cx += 1;
            }
        }
    }
}