1#![allow(unsafe_code)]
3
4#[cfg(target_os = "macos")]
5use super::u8ops::gcd;
6use super::u8ops::{ImageF32, RAYON_THRESHOLD};
7use rayon::prelude::*;
8
9#[allow(clippy::uninit_vec)]
18pub fn grayscale_f32(input: &ImageF32) -> Option<ImageF32> {
19 if input.channels() != 3 {
20 return None;
21 }
22 let (h, w) = (input.height(), input.width());
23 let src = input.data();
24 let total = h * w;
25
26 let mut out = Vec::with_capacity(total);
28 #[allow(unsafe_code)]
29 unsafe {
30 out.set_len(total);
31 }
32
33 #[cfg(target_os = "macos")]
35 if total >= RAYON_THRESHOLD && !cfg!(miri) {
36 let n_chunks = 8usize.min(h);
37 let chunk_h = h.div_ceil(n_chunks);
38 let sp = src.as_ptr() as usize;
39 let dp = out.as_mut_ptr() as usize;
40 gcd::parallel_for(n_chunks, |chunk| {
41 let sp = sp as *const f32;
42 let dp = dp as *mut f32;
43 let y_start = chunk * chunk_h;
44 let y_end = ((chunk + 1) * chunk_h).min(h);
45 let n = (y_end - y_start) * w;
46 let src_off = y_start * w * 3;
47 let dst_off = y_start * w;
48 let chunk_src = unsafe { core::slice::from_raw_parts(sp.add(src_off), n * 3) };
49 let chunk_dst = unsafe { core::slice::from_raw_parts_mut(dp.add(dst_off), n) };
50 let done = grayscale_f32_simd(chunk_src, chunk_dst, n);
51 for i in done..n {
52 let base = i * 3;
53 chunk_dst[i] = 0.299 * chunk_src[base]
54 + 0.587 * chunk_src[base + 1]
55 + 0.114 * chunk_src[base + 2];
56 }
57 });
58 return ImageF32::new(out, h, w, 1);
59 }
60
61 let done = grayscale_f32_simd(src, &mut out, total);
63 for i in done..total {
64 let base = i * 3;
65 out[i] = 0.299 * src[base] + 0.587 * src[base + 1] + 0.114 * src[base + 2];
66 }
67
68 ImageF32::new(out, h, w, 1)
69}
70
71fn grayscale_f32_simd(src: &[f32], dst: &mut [f32], total: usize) -> usize {
72 if cfg!(miri) || total < 4 {
73 return 0;
74 }
75
76 #[cfg(target_arch = "aarch64")]
77 {
78 if std::arch::is_aarch64_feature_detected!("neon") {
79 return unsafe { grayscale_f32_neon(src, dst, total) };
80 }
81 }
82 #[cfg(target_arch = "x86_64")]
83 {
84 if is_x86_feature_detected!("sse2") {
85 return unsafe { grayscale_f32_sse(src, dst, total) };
86 }
87 }
88 0
89}
90
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(unsafe_op_in_unsafe_fn)]
/// NEON kernel: interleaved-RGB f32 → luma with BT.601 weights, 4 pixels
/// per iteration using `vld3q_f32` de-interleaving and fused multiply-add.
///
/// Returns the number of pixels converted (a multiple of 4); the caller
/// handles the `total % 4` tail scalarly. Note the FMA accumulation may
/// round slightly differently from the scalar tail expression.
///
/// # Safety
/// Requires NEON; `src` must hold at least `total * 3` floats and `dst`
/// at least `total`.
unsafe fn grayscale_f32_neon(src: &[f32], dst: &mut [f32], total: usize) -> usize {
    use std::arch::aarch64::*;
    let coeff_r = vdupq_n_f32(0.299);
    let coeff_g = vdupq_n_f32(0.587);
    let coeff_b = vdupq_n_f32(0.114);
    let sp = src.as_ptr();
    let dp = dst.as_mut_ptr();
    let mut i = 0usize;
    while i + 4 <= total {
        // De-interleave 12 floats into separate R/G/B vectors.
        let rgb = vld3q_f32(sp.add(i * 3));
        // gray = r*cr + g*cg + b*cb, accumulated via FMA.
        let gray = vfmaq_f32(
            vfmaq_f32(vmulq_f32(rgb.0, coeff_r), rgb.1, coeff_g),
            rgb.2,
            coeff_b,
        );
        vst1q_f32(dp.add(i), gray);
        i += 4;
    }
    i
}
114
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
#[allow(unsafe_op_in_unsafe_fn)]
/// SSE2 kernel: interleaved-RGB f32 → luma with BT.601 weights, 4 pixels
/// per iteration. Three unaligned loads plus shuffles de-interleave the
/// 12 input floats into separate R/G/B vectors.
///
/// Returns the number of pixels converted (a multiple of 4); the caller
/// handles the `total % 4` tail scalarly.
///
/// # Safety
/// Requires SSE2; `src` must hold at least `total * 3` floats and `dst`
/// at least `total`.
unsafe fn grayscale_f32_sse(src: &[f32], dst: &mut [f32], total: usize) -> usize {
    use std::arch::x86_64::*;
    let coeff_r = _mm_set1_ps(0.299);
    let coeff_g = _mm_set1_ps(0.587);
    let coeff_b = _mm_set1_ps(0.114);
    let sp = src.as_ptr();
    let dp = dst.as_mut_ptr();
    let mut i = 0usize;
    while i + 4 <= total {
        // v0 = [r0 g0 b0 r1], v1 = [g1 b1 r2 g2], v2 = [b2 r3 g3 b3]
        let v0 = _mm_loadu_ps(sp.add(i * 3));
        let v1 = _mm_loadu_ps(sp.add(i * 3 + 4));
        let v2 = _mm_loadu_ps(sp.add(i * 3 + 8));

        // Gather each channel with pair-then-merge shuffles. (The original
        // also built an unused `t0`/`_r` pair of shuffles; dead code removed.)
        let r0r1 = _mm_shuffle_ps(v0, v0, 0b11_11_00_00); // [r0 r0 r1 r1]
        let r2r3 = _mm_shuffle_ps(v1, v2, 0b01_01_10_10); // [r2 r2 r3 r3]
        let r = _mm_shuffle_ps(r0r1, r2r3, 0b10_00_10_00); // [r0 r1 r2 r3]
        let g0g1 = _mm_shuffle_ps(v0, v1, 0b00_00_01_01); // [g0 g0 g1 g1]
        let g2g3 = _mm_shuffle_ps(v1, v2, 0b10_10_11_11); // [g2 g2 g3 g3]
        let g = _mm_shuffle_ps(g0g1, g2g3, 0b10_00_10_00); // [g0 g1 g2 g3]
        let b0b1 = _mm_shuffle_ps(v0, v1, 0b01_01_10_10); // [b0 b0 b1 b1]
        let b2b3 = _mm_shuffle_ps(v2, v2, 0b11_11_00_00); // [b2 b2 b3 b3]
        let b = _mm_shuffle_ps(b0b1, b2b3, 0b10_00_10_00); // [b0 b1 b2 b3]

        let gray = _mm_add_ps(
            _mm_add_ps(_mm_mul_ps(r, coeff_r), _mm_mul_ps(g, coeff_g)),
            _mm_mul_ps(b, coeff_b),
        );
        _mm_storeu_ps(dp.add(i), gray);
        i += 4;
    }
    i
}
162
163pub fn gaussian_blur_3x3_f32(input: &ImageF32) -> Option<ImageF32> {
168 if input.channels() != 1 {
169 return None;
170 }
171 let (h, w) = (input.height(), input.width());
172 if h < 3 || w < 3 {
173 return Some(input.clone());
174 }
175 let src = input.data();
176
177 let mut out = vec![0.0f32; h * w];
178
179 #[cfg(target_arch = "aarch64")]
180 if !cfg!(miri) && std::arch::is_aarch64_feature_detected!("neon") {
181 unsafe {
182 gauss_3x3_direct_f32_neon(src, &mut out, h, w);
183 }
184 return ImageF32::new(out, h, w, 1);
185 }
186
187 #[cfg(target_arch = "x86_64")]
188 if !cfg!(miri) && is_x86_feature_detected!("sse2") {
189 unsafe {
190 gauss_3x3_direct_f32_sse(src, &mut out, h, w);
191 }
192 return ImageF32::new(out, h, w, 1);
193 }
194
195 for y in 0..h {
197 let ay = if y > 0 { y - 1 } else { 0 };
198 let by = if y + 1 < h { y + 1 } else { h - 1 };
199 let top = &src[ay * w..(ay + 1) * w];
200 let mid = &src[y * w..(y + 1) * w];
201 let bot = &src[by * w..(by + 1) * w];
202 let dst = &mut out[y * w..(y + 1) * w];
203 for x in 0..w {
204 let lx = if x > 0 { x - 1 } else { 0 };
205 let rx = if x + 1 < w { x + 1 } else { w - 1 };
206 let v = top[lx]
207 + 2.0 * top[x]
208 + top[rx]
209 + 2.0 * (mid[lx] + 2.0 * mid[x] + mid[rx])
210 + bot[lx]
211 + 2.0 * bot[x]
212 + bot[rx];
213 dst[x] = v / 16.0;
214 }
215 }
216 ImageF32::new(out, h, w, 1)
217}
218
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(unsafe_op_in_unsafe_fn)]
/// NEON kernel: one output row of the 3x3 Gaussian ([1 2 1]/16 per axis),
/// given pointers to the already row-clamped top/mid/bot source rows.
///
/// Keeps a prev/cur/next vector pipeline per row and forms left/right
/// neighbours with `vextq` lane extracts, so each step needs three loads
/// instead of nine. The left border replicates element 0 via a splat.
///
/// NOTE(review): the loop structure appears to assume `w % 4 == 0` — for
/// other widths the final load/store at `x` touches lanes past `w - 1`
/// (reads spill into the next image row; on the last image row they run
/// past the buffer, and the store writes past the row). Confirm callers
/// guarantee 4-aligned widths or add a scalar tail.
///
/// # Safety
/// Requires NEON; all four pointers must be valid for the lane ranges
/// described above.
unsafe fn gauss_row_f32_neon(
    top: *const f32,
    mid: *const f32,
    bot: *const f32,
    dst: *mut f32,
    w: usize,
) {
    use std::arch::aarch64::*;
    let inv16 = vdupq_n_f32(1.0 / 16.0);
    let two = vdupq_n_f32(2.0);

    // Left-border splat: lane 0's "left neighbour" replicates the edge pixel.
    let border_t = vdupq_n_f32(*top);
    let border_m = vdupq_n_f32(*mid);
    let border_b = vdupq_n_f32(*bot);
    let mut prev_t = border_t;
    let mut prev_m = border_m;
    let mut prev_b = border_b;
    let mut cur_t = vld1q_f32(top);
    let mut cur_m = vld1q_f32(mid);
    let mut cur_b = vld1q_f32(bot);

    let mut x = 0usize;
    let main_end = w.saturating_sub(4);

    while x < main_end {
        let nxt_t = vld1q_f32(top.add(x + 4));
        let nxt_m = vld1q_f32(mid.add(x + 4));
        let nxt_b = vld1q_f32(bot.add(x + 4));

        // Horizontal pass per row: left + 2*center + right.
        let left_t = vextq_f32::<3>(prev_t, cur_t);
        let right_t = vextq_f32::<1>(cur_t, nxt_t);
        let ht = vfmaq_f32(vaddq_f32(left_t, right_t), cur_t, two);

        let left_m = vextq_f32::<3>(prev_m, cur_m);
        let right_m = vextq_f32::<1>(cur_m, nxt_m);
        let hm = vfmaq_f32(vaddq_f32(left_m, right_m), cur_m, two);

        let left_b = vextq_f32::<3>(prev_b, cur_b);
        let right_b = vextq_f32::<1>(cur_b, nxt_b);
        let hb = vfmaq_f32(vaddq_f32(left_b, right_b), cur_b, two);

        // Vertical pass: top + 2*mid + bot, then scale by 1/16.
        let v = vfmaq_f32(vaddq_f32(ht, hb), hm, two);
        vst1q_f32(dst.add(x), vmulq_f32(v, inv16));

        prev_t = cur_t;
        cur_t = nxt_t;
        prev_m = cur_m;
        cur_m = nxt_m;
        prev_b = cur_b;
        cur_b = nxt_b;
        x += 4;
    }

    // Final vector: right neighbour of the last lane replicates the edge.
    if x < w {
        let border_t_end = vdupq_n_f32(*top.add(w - 1));
        let border_m_end = vdupq_n_f32(*mid.add(w - 1));
        let border_b_end = vdupq_n_f32(*bot.add(w - 1));

        let left_t = vextq_f32::<3>(prev_t, cur_t);
        let right_t = vextq_f32::<1>(cur_t, border_t_end);
        let ht = vfmaq_f32(vaddq_f32(left_t, right_t), cur_t, two);

        let left_m = vextq_f32::<3>(prev_m, cur_m);
        let right_m = vextq_f32::<1>(cur_m, border_m_end);
        let hm = vfmaq_f32(vaddq_f32(left_m, right_m), cur_m, two);

        let left_b = vextq_f32::<3>(prev_b, cur_b);
        let right_b = vextq_f32::<1>(cur_b, border_b_end);
        let hb = vfmaq_f32(vaddq_f32(left_b, right_b), cur_b, two);

        let v = vfmaq_f32(vaddq_f32(ht, hb), hm, two);
        vst1q_f32(dst.add(x), vmulq_f32(v, inv16));
    }
}
299
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(unsafe_op_in_unsafe_fn)]
/// Runs the NEON Gaussian row kernel over every image row, selecting row
/// pointers with replicate clamping at the top/bottom edges.
///
/// Large images are processed row-parallel: via GCD on macOS, otherwise
/// via rayon (on macOS the rayon branch is unreachable since the GCD
/// branch returns first). Pointers are laundered through `usize` so the
/// closures are `Send`; each row writes a disjoint `dst` range.
///
/// # Safety
/// Requires NEON; `src` and `out` must each hold `h * w` floats.
unsafe fn gauss_3x3_direct_f32_neon(src: &[f32], out: &mut [f32], h: usize, w: usize) {
    let sp = src.as_ptr();
    let dp = out.as_mut_ptr();

    #[cfg(target_os = "macos")]
    if h * w >= RAYON_THRESHOLD {
        let sp = sp as usize;
        let dp = dp as usize;
        gcd::parallel_for(h, |y| {
            let sp = sp as *const f32;
            let dp = dp as *mut f32;
            // Clamp the neighbour rows at the image borders.
            let top = if y > 0 { sp.add((y - 1) * w) } else { sp };
            let mid = sp.add(y * w);
            let bot = if y + 1 < h {
                sp.add((y + 1) * w)
            } else {
                sp.add((h - 1) * w)
            };
            gauss_row_f32_neon(top, mid, bot, dp.add(y * w), w);
        });
        return;
    }

    // Non-macOS parallel path (rayon).
    if h * w >= RAYON_THRESHOLD {
        let sp = sp as usize;
        let dp = dp as usize;
        (0..h).into_par_iter().for_each(|y| {
            let sp = sp as *const f32;
            let dp = dp as *mut f32;
            let top = if y > 0 { sp.add((y - 1) * w) } else { sp };
            let mid = sp.add(y * w);
            let bot = if y + 1 < h {
                sp.add((y + 1) * w)
            } else {
                sp.add((h - 1) * w)
            };
            gauss_row_f32_neon(top, mid, bot, dp.add(y * w), w);
        });
        return;
    }

    // Sequential path: first row, interior rows, last row.
    gauss_row_f32_neon(sp, sp, if h > 1 { sp.add(w) } else { sp }, dp, w);
    for y in 1..h.saturating_sub(1) {
        gauss_row_f32_neon(
            sp.add((y - 1) * w),
            sp.add(y * w),
            sp.add((y + 1) * w),
            dp.add(y * w),
            w,
        );
    }
    if h > 1 {
        gauss_row_f32_neon(
            sp.add((h - 2) * w),
            sp.add((h - 1) * w),
            sp.add((h - 1) * w),
            dp.add((h - 1) * w),
            w,
        );
    }
}
364
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
#[allow(unsafe_op_in_unsafe_fn)]
/// SSE2 kernel: one output row of the 3x3 Gaussian ([1 2 1]/16 per axis),
/// given pointers to the already row-clamped top/mid/bot source rows.
///
/// The vector loop covers interior columns starting at 1; the scalar tail
/// finishes the right-hand columns, and column 0 gets the clamped scalar
/// kernel whenever the vector path ran.
///
/// # Safety
/// Requires SSE2; `top`/`mid`/`bot` must be valid for `w` reads and `dst`
/// for `w` writes.
unsafe fn gauss_row_f32_sse(
    top: *const f32,
    mid: *const f32,
    bot: *const f32,
    dst: *mut f32,
    w: usize,
) {
    use std::arch::x86_64::*;
    let inv16 = _mm_set1_ps(1.0 / 16.0);
    let two = _mm_set1_ps(2.0);

    let mut x = 0usize;
    if w >= 6 {
        // Unaligned loads at x-1 and x+1 need x >= 1 and x+4 <= w-1.
        x = 1;
        while x + 4 <= w - 1 {
            let t_l = _mm_loadu_ps(top.add(x - 1));
            let t_c = _mm_loadu_ps(top.add(x));
            let t_r = _mm_loadu_ps(top.add(x + 1));
            let ht = _mm_add_ps(_mm_add_ps(t_l, t_r), _mm_mul_ps(t_c, two));

            let m_l = _mm_loadu_ps(mid.add(x - 1));
            let m_c = _mm_loadu_ps(mid.add(x));
            let m_r = _mm_loadu_ps(mid.add(x + 1));
            let hm = _mm_add_ps(_mm_add_ps(m_l, m_r), _mm_mul_ps(m_c, two));

            let b_l = _mm_loadu_ps(bot.add(x - 1));
            let b_c = _mm_loadu_ps(bot.add(x));
            let b_r = _mm_loadu_ps(bot.add(x + 1));
            let hb = _mm_add_ps(_mm_add_ps(b_l, b_r), _mm_mul_ps(b_c, two));

            let v = _mm_add_ps(_mm_add_ps(ht, hb), _mm_mul_ps(hm, two));
            _mm_storeu_ps(dst.add(x), _mm_mul_ps(v, inv16));
            x += 4;
        }
    }
    // Scalar tail: the remaining right-hand columns (or the entire row when
    // w < 6), with replicate clamping at the edges.
    for xx in x..w {
        let lx = if xx > 0 { xx - 1 } else { 0 };
        let rx = if xx + 1 < w { xx + 1 } else { w - 1 };
        let v = *top.add(lx)
            + 2.0 * *top.add(xx)
            + *top.add(rx)
            + 2.0 * (*mid.add(lx) + 2.0 * *mid.add(xx) + *mid.add(rx))
            + *bot.add(lx)
            + 2.0 * *bot.add(xx)
            + *bot.add(rx);
        *dst.add(xx) = v / 16.0;
    }
    // The vector loop starts at column 1, so column 0 still needs the
    // clamped scalar kernel. (The original guarded this on
    // `border_start == 1`, which is unreachable after the vector loop,
    // leaving dst[0] unwritten whenever w >= 6.)
    if w >= 6 {
        let v = *top.add(0)
            + 2.0 * *top.add(0)
            + *top.add(1)
            + 2.0 * (*mid.add(0) + 2.0 * *mid.add(0) + *mid.add(1))
            + *bot.add(0)
            + 2.0 * *bot.add(0)
            + *bot.add(1);
        *dst.add(0) = v / 16.0;
    }
}
433
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
#[allow(unsafe_op_in_unsafe_fn)]
/// Runs the SSE2 Gaussian row kernel over every image row, selecting row
/// pointers with replicate clamping at the top/bottom edges. Large images
/// are processed row-parallel with rayon; pointers are laundered through
/// `usize` so the closure is `Send`, and rows write disjoint `dst` ranges.
///
/// # Safety
/// Requires SSE2; `src` and `out` must each hold `h * w` floats.
unsafe fn gauss_3x3_direct_f32_sse(src: &[f32], out: &mut [f32], h: usize, w: usize) {
    let sp = src.as_ptr();
    let dp = out.as_mut_ptr();

    if h * w >= RAYON_THRESHOLD {
        let sp = sp as usize;
        let dp = dp as usize;
        (0..h).into_par_iter().for_each(|y| {
            let sp = sp as *const f32;
            let dp = dp as *mut f32;
            // Clamp neighbour rows at the image borders.
            let top = if y > 0 { sp.add((y - 1) * w) } else { sp };
            let mid = sp.add(y * w);
            let bot = if y + 1 < h {
                sp.add((y + 1) * w)
            } else {
                sp.add((h - 1) * w)
            };
            gauss_row_f32_sse(top, mid, bot, dp.add(y * w), w);
        });
        return;
    }

    // Sequential path: first row, interior rows, last row.
    gauss_row_f32_sse(sp, sp, if h > 1 { sp.add(w) } else { sp }, dp, w);
    for y in 1..h.saturating_sub(1) {
        gauss_row_f32_sse(
            sp.add((y - 1) * w),
            sp.add(y * w),
            sp.add((y + 1) * w),
            dp.add(y * w),
            w,
        );
    }
    if h > 1 {
        gauss_row_f32_sse(
            sp.add((h - 2) * w),
            sp.add((h - 1) * w),
            sp.add((h - 1) * w),
            dp.add((h - 1) * w),
            w,
        );
    }
}
479
480pub fn box_blur_3x3_f32(input: &ImageF32) -> Option<ImageF32> {
485 if input.channels() != 1 {
486 return None;
487 }
488 let (h, w) = (input.height(), input.width());
489 if h < 3 || w < 3 {
490 return Some(input.clone());
491 }
492 let src = input.data();
493
494 let mut out = vec![0.0f32; h * w];
495
496 #[cfg(target_arch = "aarch64")]
497 if !cfg!(miri) && std::arch::is_aarch64_feature_detected!("neon") {
498 unsafe {
499 box_3x3_direct_f32_neon(src, &mut out, h, w);
500 }
501 return ImageF32::new(out, h, w, 1);
502 }
503
504 #[cfg(target_arch = "x86_64")]
505 if !cfg!(miri) && is_x86_feature_detected!("sse2") {
506 unsafe {
507 box_3x3_direct_f32_sse(src, &mut out, h, w);
508 }
509 return ImageF32::new(out, h, w, 1);
510 }
511
512 let inv9 = 1.0f32 / 9.0;
514 for y in 0..h {
515 let ay = if y > 0 { y - 1 } else { 0 };
516 let by = if y + 1 < h { y + 1 } else { h - 1 };
517 let top = &src[ay * w..(ay + 1) * w];
518 let mid = &src[y * w..(y + 1) * w];
519 let bot = &src[by * w..(by + 1) * w];
520 let dst = &mut out[y * w..(y + 1) * w];
521 for x in 0..w {
522 let lx = if x > 0 { x - 1 } else { 0 };
523 let rx = if x + 1 < w { x + 1 } else { w - 1 };
524 let v = top[lx]
525 + top[x]
526 + top[rx]
527 + mid[lx]
528 + mid[x]
529 + mid[rx]
530 + bot[lx]
531 + bot[x]
532 + bot[rx];
533 dst[x] = v * inv9;
534 }
535 }
536 ImageF32::new(out, h, w, 1)
537}
538
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(unsafe_op_in_unsafe_fn)]
/// NEON kernel: one output row of the 3x3 box filter (9-neighbour mean),
/// given pointers to the already row-clamped top/mid/bot source rows.
///
/// Same prev/cur/next `vextq` pipeline as `gauss_row_f32_neon`; the left
/// border replicates element 0 via a splat.
///
/// NOTE(review): the loop structure appears to assume `w % 4 == 0` — for
/// other widths the final load/store at `x` touch lanes past `w - 1`
/// (reads spill into the next image row; on the last image row they run
/// past the buffer, and the store writes past the row). Confirm callers
/// guarantee 4-aligned widths or add a scalar tail.
///
/// # Safety
/// Requires NEON; all four pointers must be valid for the lane ranges
/// described above.
unsafe fn box_row_f32_neon(
    top: *const f32,
    mid: *const f32,
    bot: *const f32,
    dst: *mut f32,
    w: usize,
) {
    use std::arch::aarch64::*;
    let inv9 = vdupq_n_f32(1.0 / 9.0);

    // Left-border splat: lane 0's "left neighbour" replicates the edge pixel.
    let border_t = vdupq_n_f32(*top);
    let border_m = vdupq_n_f32(*mid);
    let border_b = vdupq_n_f32(*bot);
    let mut prev_t = border_t;
    let mut prev_m = border_m;
    let mut prev_b = border_b;
    let mut cur_t = vld1q_f32(top);
    let mut cur_m = vld1q_f32(mid);
    let mut cur_b = vld1q_f32(bot);

    let mut x = 0usize;
    let main_end = w.saturating_sub(4);

    while x < main_end {
        let nxt_t = vld1q_f32(top.add(x + 4));
        let nxt_m = vld1q_f32(mid.add(x + 4));
        let nxt_b = vld1q_f32(bot.add(x + 4));

        // Horizontal sums per row: left + center + right.
        let ht = vaddq_f32(
            vaddq_f32(vextq_f32::<3>(prev_t, cur_t), cur_t),
            vextq_f32::<1>(cur_t, nxt_t),
        );
        let hm = vaddq_f32(
            vaddq_f32(vextq_f32::<3>(prev_m, cur_m), cur_m),
            vextq_f32::<1>(cur_m, nxt_m),
        );
        let hb = vaddq_f32(
            vaddq_f32(vextq_f32::<3>(prev_b, cur_b), cur_b),
            vextq_f32::<1>(cur_b, nxt_b),
        );

        let v = vaddq_f32(vaddq_f32(ht, hm), hb);
        vst1q_f32(dst.add(x), vmulq_f32(v, inv9));

        prev_t = cur_t;
        cur_t = nxt_t;
        prev_m = cur_m;
        cur_m = nxt_m;
        prev_b = cur_b;
        cur_b = nxt_b;
        x += 4;
    }
    // Final vector: right neighbour of the last lane replicates the edge.
    if x < w {
        let border_t_end = vdupq_n_f32(*top.add(w - 1));
        let border_m_end = vdupq_n_f32(*mid.add(w - 1));
        let border_b_end = vdupq_n_f32(*bot.add(w - 1));

        let ht = vaddq_f32(
            vaddq_f32(vextq_f32::<3>(prev_t, cur_t), cur_t),
            vextq_f32::<1>(cur_t, border_t_end),
        );
        let hm = vaddq_f32(
            vaddq_f32(vextq_f32::<3>(prev_m, cur_m), cur_m),
            vextq_f32::<1>(cur_m, border_m_end),
        );
        let hb = vaddq_f32(
            vaddq_f32(vextq_f32::<3>(prev_b, cur_b), cur_b),
            vextq_f32::<1>(cur_b, border_b_end),
        );

        let v = vaddq_f32(vaddq_f32(ht, hm), hb);
        vst1q_f32(dst.add(x), vmulq_f32(v, inv9));
    }
}
616
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(unsafe_op_in_unsafe_fn)]
/// Runs the NEON box-filter row kernel over every image row, selecting row
/// pointers with replicate clamping at the top/bottom edges.
///
/// Large images run row-parallel: via GCD on macOS, otherwise via rayon
/// (on macOS the rayon branch is unreachable since the GCD branch returns
/// first). Pointers are laundered through `usize` so the closures are
/// `Send`; rows write disjoint `dst` ranges.
///
/// # Safety
/// Requires NEON; `src` and `out` must each hold `h * w` floats.
unsafe fn box_3x3_direct_f32_neon(src: &[f32], out: &mut [f32], h: usize, w: usize) {
    let sp = src.as_ptr();
    let dp = out.as_mut_ptr();

    #[cfg(target_os = "macos")]
    if h * w >= RAYON_THRESHOLD {
        let sp = sp as usize;
        let dp = dp as usize;
        gcd::parallel_for(h, |y| {
            let sp = sp as *const f32;
            let dp = dp as *mut f32;
            // Clamp neighbour rows at the image borders.
            let top = if y > 0 { sp.add((y - 1) * w) } else { sp };
            let mid = sp.add(y * w);
            let bot = if y + 1 < h {
                sp.add((y + 1) * w)
            } else {
                sp.add((h - 1) * w)
            };
            box_row_f32_neon(top, mid, bot, dp.add(y * w), w);
        });
        return;
    }

    // Non-macOS parallel path (rayon).
    if h * w >= RAYON_THRESHOLD {
        let sp = sp as usize;
        let dp = dp as usize;
        (0..h).into_par_iter().for_each(|y| {
            let sp = sp as *const f32;
            let dp = dp as *mut f32;
            let top = if y > 0 { sp.add((y - 1) * w) } else { sp };
            let mid = sp.add(y * w);
            let bot = if y + 1 < h {
                sp.add((y + 1) * w)
            } else {
                sp.add((h - 1) * w)
            };
            box_row_f32_neon(top, mid, bot, dp.add(y * w), w);
        });
        return;
    }

    // Sequential path: first row, interior rows, last row.
    box_row_f32_neon(sp, sp, if h > 1 { sp.add(w) } else { sp }, dp, w);
    for y in 1..h.saturating_sub(1) {
        box_row_f32_neon(
            sp.add((y - 1) * w),
            sp.add(y * w),
            sp.add((y + 1) * w),
            dp.add(y * w),
            w,
        );
    }
    if h > 1 {
        box_row_f32_neon(
            sp.add((h - 2) * w),
            sp.add((h - 1) * w),
            sp.add((h - 1) * w),
            dp.add((h - 1) * w),
            w,
        );
    }
}
681
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
#[allow(unsafe_op_in_unsafe_fn)]
/// SSE2 kernel: one output row of the 3x3 box filter (9-neighbour mean),
/// given pointers to the already row-clamped top/mid/bot source rows.
///
/// The vector loop covers interior columns starting at 1; the scalar tail
/// finishes the right-hand columns, and column 0 gets the clamped scalar
/// kernel whenever the vector path ran.
///
/// # Safety
/// Requires SSE2; `top`/`mid`/`bot` must be valid for `w` reads and `dst`
/// for `w` writes.
unsafe fn box_row_f32_sse(
    top: *const f32,
    mid: *const f32,
    bot: *const f32,
    dst: *mut f32,
    w: usize,
) {
    use std::arch::x86_64::*;
    let inv9 = _mm_set1_ps(1.0 / 9.0);

    let mut x = 0usize;
    if w >= 6 {
        // Unaligned loads at x-1 and x+1 need x >= 1 and x+4 <= w-1.
        x = 1;
        while x + 4 <= w - 1 {
            let ht = _mm_add_ps(
                _mm_add_ps(_mm_loadu_ps(top.add(x - 1)), _mm_loadu_ps(top.add(x))),
                _mm_loadu_ps(top.add(x + 1)),
            );
            let hm = _mm_add_ps(
                _mm_add_ps(_mm_loadu_ps(mid.add(x - 1)), _mm_loadu_ps(mid.add(x))),
                _mm_loadu_ps(mid.add(x + 1)),
            );
            let hb = _mm_add_ps(
                _mm_add_ps(_mm_loadu_ps(bot.add(x - 1)), _mm_loadu_ps(bot.add(x))),
                _mm_loadu_ps(bot.add(x + 1)),
            );
            let v = _mm_add_ps(_mm_add_ps(ht, hm), hb);
            _mm_storeu_ps(dst.add(x), _mm_mul_ps(v, inv9));
            x += 4;
        }
    }
    // Scalar tail: the remaining right-hand columns (or the entire row when
    // w < 6), with replicate clamping at the edges.
    for xx in x..w {
        let lx = if xx > 0 { xx - 1 } else { 0 };
        let rx = if xx + 1 < w { xx + 1 } else { w - 1 };
        let v = *top.add(lx)
            + *top.add(xx)
            + *top.add(rx)
            + *mid.add(lx)
            + *mid.add(xx)
            + *mid.add(rx)
            + *bot.add(lx)
            + *bot.add(xx)
            + *bot.add(rx);
        *dst.add(xx) = v / 9.0;
    }
    // The vector loop starts at column 1, so column 0 still needs the
    // clamped scalar kernel. (The original guarded this on
    // `border_start == 1`, which is unreachable after the vector loop,
    // leaving dst[0] unwritten whenever w >= 6.)
    if w >= 6 {
        let v = *top.add(0)
            + *top.add(0)
            + *top.add(1)
            + *mid.add(0)
            + *mid.add(0)
            + *mid.add(1)
            + *bot.add(0)
            + *bot.add(0)
            + *bot.add(1);
        *dst.add(0) = v / 9.0;
    }
}
746
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
#[allow(unsafe_op_in_unsafe_fn)]
/// Runs the SSE2 box-filter row kernel over every image row, selecting row
/// pointers with replicate clamping at the top/bottom edges. Large images
/// are processed row-parallel with rayon; pointers are laundered through
/// `usize` so the closure is `Send`, and rows write disjoint `dst` ranges.
///
/// # Safety
/// Requires SSE2; `src` and `out` must each hold `h * w` floats.
unsafe fn box_3x3_direct_f32_sse(src: &[f32], out: &mut [f32], h: usize, w: usize) {
    let sp = src.as_ptr();
    let dp = out.as_mut_ptr();

    if h * w >= RAYON_THRESHOLD {
        let sp = sp as usize;
        let dp = dp as usize;
        (0..h).into_par_iter().for_each(|y| {
            let sp = sp as *const f32;
            let dp = dp as *mut f32;
            // Clamp neighbour rows at the image borders.
            let top = if y > 0 { sp.add((y - 1) * w) } else { sp };
            let mid = sp.add(y * w);
            let bot = if y + 1 < h {
                sp.add((y + 1) * w)
            } else {
                sp.add((h - 1) * w)
            };
            box_row_f32_sse(top, mid, bot, dp.add(y * w), w);
        });
        return;
    }

    // Sequential path: first row, interior rows, last row.
    box_row_f32_sse(sp, sp, if h > 1 { sp.add(w) } else { sp }, dp, w);
    for y in 1..h.saturating_sub(1) {
        box_row_f32_sse(
            sp.add((y - 1) * w),
            sp.add(y * w),
            sp.add((y + 1) * w),
            dp.add(y * w),
            w,
        );
    }
    if h > 1 {
        box_row_f32_sse(
            sp.add((h - 2) * w),
            sp.add((h - 1) * w),
            sp.add((h - 1) * w),
            dp.add((h - 1) * w),
            w,
        );
    }
}
792
793pub fn dilate_3x3_f32(input: &ImageF32) -> Option<ImageF32> {
797 if input.channels() != 1 {
798 return None;
799 }
800 let (h, w) = (input.height(), input.width());
801 if h < 3 || w < 3 {
802 return Some(input.clone());
803 }
804 let src = input.data();
805
806 let mut out = vec![0.0f32; h * w];
807
808 #[cfg(target_arch = "aarch64")]
809 if !cfg!(miri) && std::arch::is_aarch64_feature_detected!("neon") {
810 unsafe {
811 dilate_3x3_direct_f32_neon(src, &mut out, h, w);
812 }
813 return ImageF32::new(out, h, w, 1);
814 }
815
816 #[cfg(target_arch = "x86_64")]
817 if !cfg!(miri) && is_x86_feature_detected!("sse2") {
818 unsafe {
819 dilate_3x3_direct_f32_sse(src, &mut out, h, w);
820 }
821 return ImageF32::new(out, h, w, 1);
822 }
823
824 let border = f32::NEG_INFINITY;
826 for y in 0..h {
827 let ay = if y > 0 { y - 1 } else { 0 };
828 let by = if y + 1 < h { y + 1 } else { h - 1 };
829 let top = &src[ay * w..(ay + 1) * w];
830 let mid = &src[y * w..(y + 1) * w];
831 let bot = &src[by * w..(by + 1) * w];
832 let dst = &mut out[y * w..(y + 1) * w];
833 for x in 0..w {
834 let lx = if x > 0 { x - 1 } else { 0 };
835 let rx = if x + 1 < w { x + 1 } else { w - 1 };
836 let mut mx = top[lx].max(top[x]).max(top[rx]);
837 mx = mx.max(mid[lx]).max(mid[x]).max(mid[rx]);
838 mx = mx.max(bot[lx]).max(bot[x]).max(bot[rx]);
839 dst[x] = mx;
840 }
841 }
842 let _ = border;
843 ImageF32::new(out, h, w, 1)
844}
845
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(unsafe_op_in_unsafe_fn)]
/// NEON kernel: one output row of the 3x3 dilation (sliding maximum),
/// given pointers to the already row-clamped top/mid/bot source rows.
///
/// Uses a NEG_INFINITY splat as the out-of-row border: max with -inf is the
/// identity, and the center lane already contributes the edge pixel, so the
/// result matches replicate clamping.
///
/// NOTE(review): the loop structure appears to assume `w % 4 == 0` — for
/// other widths the final load/store at `x` touch lanes past `w - 1`
/// (reads spill into the next image row; on the last image row they run
/// past the buffer, and the store writes past the row). Confirm callers
/// guarantee 4-aligned widths or add a scalar tail.
///
/// # Safety
/// Requires NEON; all four pointers must be valid for the lane ranges
/// described above.
unsafe fn dilate_row_f32_neon(
    top: *const f32,
    mid: *const f32,
    bot: *const f32,
    dst: *mut f32,
    w: usize,
) {
    use std::arch::aarch64::*;

    // -inf border: identity element for max.
    let border_v = vdupq_n_f32(f32::NEG_INFINITY);
    let mut prev_t = border_v;
    let mut prev_m = border_v;
    let mut prev_b = border_v;
    let mut cur_t = vld1q_f32(top);
    let mut cur_m = vld1q_f32(mid);
    let mut cur_b = vld1q_f32(bot);

    let mut x = 0usize;
    let main_end = w.saturating_sub(4);
    while x < main_end {
        let nxt_t = vld1q_f32(top.add(x + 4));
        let nxt_m = vld1q_f32(mid.add(x + 4));
        let nxt_b = vld1q_f32(bot.add(x + 4));

        // Horizontal max per row: max(left, center, right).
        let r0 = vmaxq_f32(
            vmaxq_f32(vextq_f32::<3>(prev_t, cur_t), cur_t),
            vextq_f32::<1>(cur_t, nxt_t),
        );
        let r1 = vmaxq_f32(
            vmaxq_f32(vextq_f32::<3>(prev_m, cur_m), cur_m),
            vextq_f32::<1>(cur_m, nxt_m),
        );
        let r2 = vmaxq_f32(
            vmaxq_f32(vextq_f32::<3>(prev_b, cur_b), cur_b),
            vextq_f32::<1>(cur_b, nxt_b),
        );
        vst1q_f32(dst.add(x), vmaxq_f32(vmaxq_f32(r0, r1), r2));

        prev_t = cur_t;
        cur_t = nxt_t;
        prev_m = cur_m;
        cur_m = nxt_m;
        prev_b = cur_b;
        cur_b = nxt_b;
        x += 4;
    }
    // Final vector: right neighbour beyond the row is -inf (identity).
    if x < w {
        let r0 = vmaxq_f32(
            vmaxq_f32(vextq_f32::<3>(prev_t, cur_t), cur_t),
            vextq_f32::<1>(cur_t, border_v),
        );
        let r1 = vmaxq_f32(
            vmaxq_f32(vextq_f32::<3>(prev_m, cur_m), cur_m),
            vextq_f32::<1>(cur_m, border_v),
        );
        let r2 = vmaxq_f32(
            vmaxq_f32(vextq_f32::<3>(prev_b, cur_b), cur_b),
            vextq_f32::<1>(cur_b, border_v),
        );
        vst1q_f32(dst.add(x), vmaxq_f32(vmaxq_f32(r0, r1), r2));
    }
}
911
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(unsafe_op_in_unsafe_fn)]
/// Runs the NEON dilation row kernel over every image row, selecting row
/// pointers with replicate clamping at the top/bottom edges.
///
/// Large images run row-parallel: via GCD on macOS, otherwise via rayon
/// (on macOS the rayon branch is unreachable since the GCD branch returns
/// first). Pointers are laundered through `usize` so the closures are
/// `Send`; rows write disjoint `dst` ranges.
///
/// # Safety
/// Requires NEON; `src` and `out` must each hold `h * w` floats.
unsafe fn dilate_3x3_direct_f32_neon(src: &[f32], out: &mut [f32], h: usize, w: usize) {
    let sp = src.as_ptr();
    let dp = out.as_mut_ptr();
    #[cfg(target_os = "macos")]
    if h * w >= RAYON_THRESHOLD {
        let sp = sp as usize;
        let dp = dp as usize;
        gcd::parallel_for(h, |y| {
            let sp = sp as *const f32;
            let dp = dp as *mut f32;
            // Clamp neighbour rows at the image borders.
            let top = if y > 0 { sp.add((y - 1) * w) } else { sp };
            let mid = sp.add(y * w);
            let bot = if y + 1 < h {
                sp.add((y + 1) * w)
            } else {
                sp.add((h - 1) * w)
            };
            dilate_row_f32_neon(top, mid, bot, dp.add(y * w), w);
        });
        return;
    }

    // Non-macOS parallel path (rayon).
    if h * w >= RAYON_THRESHOLD {
        let sp = sp as usize;
        let dp = dp as usize;
        (0..h).into_par_iter().for_each(|y| {
            let sp = sp as *const f32;
            let dp = dp as *mut f32;
            let top = if y > 0 { sp.add((y - 1) * w) } else { sp };
            let mid = sp.add(y * w);
            let bot = if y + 1 < h {
                sp.add((y + 1) * w)
            } else {
                sp.add((h - 1) * w)
            };
            dilate_row_f32_neon(top, mid, bot, dp.add(y * w), w);
        });
        return;
    }

    // Sequential path: first row, interior rows, last row.
    dilate_row_f32_neon(sp, sp, if h > 1 { sp.add(w) } else { sp }, dp, w);
    for y in 1..h.saturating_sub(1) {
        dilate_row_f32_neon(
            sp.add((y - 1) * w),
            sp.add(y * w),
            sp.add((y + 1) * w),
            dp.add(y * w),
            w,
        );
    }
    if h > 1 {
        dilate_row_f32_neon(
            sp.add((h - 2) * w),
            sp.add((h - 1) * w),
            sp.add((h - 1) * w),
            dp.add((h - 1) * w),
            w,
        );
    }
}
979
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
#[allow(unsafe_op_in_unsafe_fn)]
/// SSE2 kernel: one output row of the 3x3 dilation (sliding maximum),
/// given pointers to the already row-clamped top/mid/bot source rows.
///
/// The vector loop covers interior columns starting at 1; the scalar tail
/// finishes the right-hand columns, and column 0 gets the clamped scalar
/// kernel whenever the vector path ran.
///
/// # Safety
/// Requires SSE2; `top`/`mid`/`bot` must be valid for `w` reads and `dst`
/// for `w` writes.
unsafe fn dilate_row_f32_sse(
    top: *const f32,
    mid: *const f32,
    bot: *const f32,
    dst: *mut f32,
    w: usize,
) {
    use std::arch::x86_64::*;
    let mut x = 0usize;
    if w >= 6 {
        // Unaligned loads at x-1 and x+1 need x >= 1 and x+4 <= w-1.
        x = 1;
        while x + 4 <= w - 1 {
            let r0 = _mm_max_ps(
                _mm_max_ps(_mm_loadu_ps(top.add(x - 1)), _mm_loadu_ps(top.add(x))),
                _mm_loadu_ps(top.add(x + 1)),
            );
            let r1 = _mm_max_ps(
                _mm_max_ps(_mm_loadu_ps(mid.add(x - 1)), _mm_loadu_ps(mid.add(x))),
                _mm_loadu_ps(mid.add(x + 1)),
            );
            let r2 = _mm_max_ps(
                _mm_max_ps(_mm_loadu_ps(bot.add(x - 1)), _mm_loadu_ps(bot.add(x))),
                _mm_loadu_ps(bot.add(x + 1)),
            );
            _mm_storeu_ps(dst.add(x), _mm_max_ps(_mm_max_ps(r0, r1), r2));
            x += 4;
        }
    }
    // Scalar tail: the remaining right-hand columns (or the entire row when
    // w < 6), with replicate clamping at the edges.
    for xx in x..w {
        let lx = if xx > 0 { xx - 1 } else { 0 };
        let rx = if xx + 1 < w { xx + 1 } else { w - 1 };
        let mut mx = (*top.add(lx)).max(*top.add(xx)).max(*top.add(rx));
        mx = mx.max(*mid.add(lx)).max(*mid.add(xx)).max(*mid.add(rx));
        mx = mx.max(*bot.add(lx)).max(*bot.add(xx)).max(*bot.add(rx));
        *dst.add(xx) = mx;
    }
    // The vector loop starts at column 1, so column 0 still needs the
    // clamped scalar kernel. (The original guarded this on
    // `border_start == 1`, which is unreachable after the vector loop,
    // leaving dst[0] unwritten whenever w >= 6.)
    if w >= 6 {
        let mut mx = (*top.add(0)).max(*top.add(1));
        mx = mx.max(*mid.add(0)).max(*mid.add(1));
        mx = mx.max(*bot.add(0)).max(*bot.add(1));
        *dst.add(0) = mx;
    }
}
1029
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
#[allow(unsafe_op_in_unsafe_fn)]
/// Runs the SSE2 dilation row kernel over every image row, selecting row
/// pointers with replicate clamping at the top/bottom edges. Large images
/// are processed row-parallel with rayon; pointers are laundered through
/// `usize` so the closure is `Send`, and rows write disjoint `dst` ranges.
///
/// # Safety
/// Requires SSE2; `src` and `out` must each hold `h * w` floats.
unsafe fn dilate_3x3_direct_f32_sse(src: &[f32], out: &mut [f32], h: usize, w: usize) {
    let sp = src.as_ptr();
    let dp = out.as_mut_ptr();

    if h * w >= RAYON_THRESHOLD {
        let sp = sp as usize;
        let dp = dp as usize;
        (0..h).into_par_iter().for_each(|y| {
            let sp = sp as *const f32;
            let dp = dp as *mut f32;
            // Clamp neighbour rows at the image borders.
            let top = if y > 0 { sp.add((y - 1) * w) } else { sp };
            let mid = sp.add(y * w);
            let bot = if y + 1 < h {
                sp.add((y + 1) * w)
            } else {
                sp.add((h - 1) * w)
            };
            dilate_row_f32_sse(top, mid, bot, dp.add(y * w), w);
        });
        return;
    }

    // Sequential path: first row, interior rows, last row.
    dilate_row_f32_sse(sp, sp, if h > 1 { sp.add(w) } else { sp }, dp, w);
    for y in 1..h.saturating_sub(1) {
        dilate_row_f32_sse(
            sp.add((y - 1) * w),
            sp.add(y * w),
            sp.add((y + 1) * w),
            dp.add(y * w),
            w,
        );
    }
    if h > 1 {
        dilate_row_f32_sse(
            sp.add((h - 2) * w),
            sp.add((h - 1) * w),
            sp.add((h - 1) * w),
            dp.add((h - 1) * w),
            w,
        );
    }
}
1075
/// Sobel gradient-magnitude approximation (|Gx| + |Gy|) for a
/// single-channel f32 image.
///
/// Returns `None` unless the input has exactly one channel. Border pixels
/// (first/last row and column) are left at 0; images smaller than 3x3
/// return an all-zero image. Interior rows go through the SIMD row kernel
/// with a scalar finish, and large images run row-parallel with rayon.
pub fn sobel_3x3_f32(input: &ImageF32) -> Option<ImageF32> {
    if input.channels() != 1 {
        return None;
    }
    let (h, w) = (input.height(), input.width());
    if h < 3 || w < 3 {
        return Some(ImageF32::zeros(h, w, 1));
    }
    let src = input.data();

    let mut out = vec![0.0f32; h * w];

    let use_rayon = h * w >= RAYON_THRESHOLD && !cfg!(miri);
    let interior_h = h - 2;

    // Computes one output row from the three source rows around it.
    // `sobel_f32_simd_row` returns the first column it did NOT write; the
    // scalar loop finishes columns done..w-1, skipping the x == 0 border.
    let process_row = |row0: &[f32], row1: &[f32], row2: &[f32], dst: &mut [f32]| {
        let done = sobel_f32_simd_row(row0, row1, row2, dst, w);
        for x in done..w - 1 {
            if x == 0 {
                continue;
            }
            let gx = row0[x + 1] - row0[x - 1] + 2.0 * (row1[x + 1] - row1[x - 1]) + row2[x + 1]
                - row2[x - 1];
            let gy = row2[x - 1] + 2.0 * row2[x] + row2[x + 1]
                - row0[x - 1]
                - 2.0 * row0[x]
                - row0[x + 1];
            dst[x] = gx.abs() + gy.abs();
        }
    };

    if use_rayon && interior_h > 1 {
        // Parallel over interior rows only: `out[w..]` skips output row 0,
        // so parallel chunk i corresponds to image row i + 1.
        out[w..w + interior_h * w]
            .par_chunks_mut(w)
            .enumerate()
            .for_each(|(i, dst)| {
                let y = i + 1;
                let row0 = &src[(y - 1) * w..y * w];
                let row1 = &src[y * w..(y + 1) * w];
                let row2 = &src[(y + 1) * w..(y + 2) * w];
                process_row(row0, row1, row2, dst);
            });
    } else {
        for y in 1..h - 1 {
            let row0 = &src[(y - 1) * w..y * w];
            let row1 = &src[y * w..(y + 1) * w];
            let row2 = &src[(y + 1) * w..(y + 2) * w];
            let dst = &mut out[y * w..(y + 1) * w];
            process_row(row0, row1, row2, dst);
        }
    }

    ImageF32::new(out, h, w, 1)
}
1134
1135fn sobel_f32_simd_row(
1136 row0: &[f32],
1137 row1: &[f32],
1138 row2: &[f32],
1139 dst: &mut [f32],
1140 w: usize,
1141) -> usize {
1142 if cfg!(miri) || w < 6 {
1143 return 1;
1144 }
1145
1146 #[cfg(target_arch = "aarch64")]
1147 {
1148 if std::arch::is_aarch64_feature_detected!("neon") {
1149 return unsafe { sobel_f32_neon_row(row0, row1, row2, dst, w) };
1150 }
1151 }
1152 #[cfg(target_arch = "x86_64")]
1153 {
1154 if is_x86_feature_detected!("sse2") {
1155 return unsafe { sobel_f32_sse_row(row0, row1, row2, dst, w) };
1156 }
1157 }
1158 1
1159}
1160
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(unsafe_op_in_unsafe_fn)]
/// NEON kernel: Sobel |Gx| + |Gy| for the interior of one row, 4 columns
/// per iteration starting at column 1.
///
/// Returns the first column not written; the caller finishes the remainder
/// scalarly. All loads stay within `0..w` because the loop requires
/// `x + 4 <= w - 1`.
///
/// # Safety
/// Requires NEON; the three rows must hold `w` floats and `dst` must be
/// writable for `w` floats.
unsafe fn sobel_f32_neon_row(
    row0: &[f32],
    row1: &[f32],
    row2: &[f32],
    dst: &mut [f32],
    w: usize,
) -> usize {
    use std::arch::aarch64::*;
    let two = vdupq_n_f32(2.0);
    let p0 = row0.as_ptr();
    let p1 = row1.as_ptr();
    let p2 = row2.as_ptr();
    let dp = dst.as_mut_ptr();

    let mut x = 1usize;
    while x + 4 <= w - 1 {
        let t_l = vld1q_f32(p0.add(x - 1));
        let t_c = vld1q_f32(p0.add(x));
        let t_r = vld1q_f32(p0.add(x + 1));
        let m_l = vld1q_f32(p1.add(x - 1));
        let m_r = vld1q_f32(p1.add(x + 1));
        let b_l = vld1q_f32(p2.add(x - 1));
        let b_c = vld1q_f32(p2.add(x));
        let b_r = vld1q_f32(p2.add(x + 1));

        // Gx = (top_r - top_l) + (bot_r - bot_l) + 2*(mid_r - mid_l)
        let gx = vaddq_f32(
            vaddq_f32(vsubq_f32(t_r, t_l), vsubq_f32(b_r, b_l)),
            vmulq_f32(vsubq_f32(m_r, m_l), two),
        );
        // Gy = (bot_l + 2*bot_c + bot_r) - (top_l + 2*top_c + top_r)
        let gy = vsubq_f32(
            vfmaq_f32(vaddq_f32(b_l, b_r), b_c, two),
            vfmaq_f32(vaddq_f32(t_l, t_r), t_c, two),
        );
        let mag = vaddq_f32(vabsq_f32(gx), vabsq_f32(gy));
        vst1q_f32(dp.add(x), mag);
        x += 4;
    }
    x
}
1205
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
#[allow(unsafe_op_in_unsafe_fn)]
/// SSE2 kernel: Sobel |Gx| + |Gy| for the interior of one row, 4 columns
/// per iteration starting at column 1.
///
/// Returns the first column not written; the caller finishes the remainder
/// scalarly. All loads stay within `0..w` because the loop requires
/// `x + 4 <= w - 1`.
///
/// # Safety
/// Requires SSE2; the three rows must hold `w` floats and `dst` must be
/// writable for `w` floats.
unsafe fn sobel_f32_sse_row(
    row0: &[f32],
    row1: &[f32],
    row2: &[f32],
    dst: &mut [f32],
    w: usize,
) -> usize {
    use std::arch::x86_64::*;
    let two = _mm_set1_ps(2.0);
    // -0.0 mask: andnot clears the sign bit, i.e. abs().
    let sign_mask = _mm_set1_ps(-0.0); let p0 = row0.as_ptr();
    let p1 = row1.as_ptr();
    let p2 = row2.as_ptr();
    let dp = dst.as_mut_ptr();

    let mut x = 1usize;
    while x + 4 <= w - 1 {
        let t_l = _mm_loadu_ps(p0.add(x - 1));
        let t_c = _mm_loadu_ps(p0.add(x));
        let t_r = _mm_loadu_ps(p0.add(x + 1));
        let m_l = _mm_loadu_ps(p1.add(x - 1));
        let m_r = _mm_loadu_ps(p1.add(x + 1));
        let b_l = _mm_loadu_ps(p2.add(x - 1));
        let b_c = _mm_loadu_ps(p2.add(x));
        let b_r = _mm_loadu_ps(p2.add(x + 1));

        // Gx = (top_r - top_l) + (bot_r - bot_l) + 2*(mid_r - mid_l)
        let gx = _mm_add_ps(
            _mm_add_ps(_mm_sub_ps(t_r, t_l), _mm_sub_ps(b_r, b_l)),
            _mm_mul_ps(_mm_sub_ps(m_r, m_l), two),
        );
        // Gy = (bot_l + 2*bot_c + bot_r) - (top_l + 2*top_c + top_r)
        let gy = _mm_sub_ps(
            _mm_add_ps(_mm_add_ps(b_l, b_r), _mm_mul_ps(b_c, two)),
            _mm_add_ps(_mm_add_ps(t_l, t_r), _mm_mul_ps(t_c, two)),
        );
        let abs_gx = _mm_andnot_ps(sign_mask, gx);
        let abs_gy = _mm_andnot_ps(sign_mask, gy);
        _mm_storeu_ps(dp.add(x), _mm_add_ps(abs_gx, abs_gy));
        x += 4;
    }
    x
}
1250
1251#[allow(clippy::uninit_vec)]
1256pub fn threshold_binary_f32(input: &ImageF32, thresh: f32, max_val: f32) -> Option<ImageF32> {
1257 if input.channels() != 1 {
1258 return None;
1259 }
1260 let (h, w) = (input.height(), input.width());
1261 let src = input.data();
1262 let total = h * w;
1263
1264 let mut out = Vec::with_capacity(total);
1265 #[allow(unsafe_code)]
1266 unsafe {
1267 out.set_len(total);
1268 }
1269
1270 #[cfg(target_os = "macos")]
1272 if total >= RAYON_THRESHOLD && !cfg!(miri) {
1273 let n_chunks = 8usize.min(h);
1274 let chunk_h = h.div_ceil(n_chunks);
1275 let sp = src.as_ptr() as usize;
1276 let dp = out.as_mut_ptr() as usize;
1277 gcd::parallel_for(n_chunks, |chunk| {
1278 let sp = sp as *const f32;
1279 let dp = dp as *mut f32;
1280 let start = chunk * chunk_h * w;
1281 let end = (((chunk + 1) * chunk_h).min(h)) * w;
1282 let n = end - start;
1283 let chunk_src = unsafe { core::slice::from_raw_parts(sp.add(start), n) };
1284 let chunk_dst = unsafe { core::slice::from_raw_parts_mut(dp.add(start), n) };
1285 let done = threshold_f32_simd(chunk_src, chunk_dst, n, thresh, max_val);
1286 for i in done..n {
1287 chunk_dst[i] = if chunk_src[i] > thresh { max_val } else { 0.0 };
1288 }
1289 });
1290 return ImageF32::new(out, h, w, 1);
1291 }
1292
1293 let done = threshold_f32_simd(src, &mut out, total, thresh, max_val);
1294 for i in done..total {
1295 out[i] = if src[i] > thresh { max_val } else { 0.0 };
1296 }
1297
1298 ImageF32::new(out, h, w, 1)
1299}
1300
1301fn threshold_f32_simd(
1302 src: &[f32],
1303 dst: &mut [f32],
1304 total: usize,
1305 thresh: f32,
1306 max_val: f32,
1307) -> usize {
1308 if cfg!(miri) || total < 4 {
1309 return 0;
1310 }
1311
1312 #[cfg(target_arch = "aarch64")]
1313 {
1314 if std::arch::is_aarch64_feature_detected!("neon") {
1315 return unsafe { threshold_f32_neon(src, dst, total, thresh, max_val) };
1316 }
1317 }
1318 #[cfg(target_arch = "x86_64")]
1319 {
1320 if is_x86_feature_detected!("sse2") {
1321 return unsafe { threshold_f32_sse(src, dst, total, thresh, max_val) };
1322 }
1323 }
1324 0
1325}
1326
/// NEON threshold kernel: `dst[i] = if src[i] > thresh { max_val } else { 0.0 }`
/// for as many leading elements as fit into whole 4-lane vectors. Returns the
/// number of elements written; the caller must handle the scalar tail.
///
/// # Safety
/// NEON must be available and `src`/`dst` must each hold at least `total`
/// elements.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(unsafe_op_in_unsafe_fn)]
unsafe fn threshold_f32_neon(
    src: &[f32],
    dst: &mut [f32],
    total: usize,
    thresh: f32,
    max_val: f32,
) -> usize {
    use std::arch::aarch64::*;
    let limit = vdupq_n_f32(thresh);
    let on_value = vdupq_n_f32(max_val);
    let off_value = vdupq_n_f32(0.0);
    let src_ptr = src.as_ptr();
    let dst_ptr = dst.as_mut_ptr();
    let mut idx = 0usize;
    while idx + 4 <= total {
        let pixels = vld1q_f32(src_ptr.add(idx));
        // Lane-wise strict compare, then bit-select max_val where it hit.
        let above = vcgtq_f32(pixels, limit);
        vst1q_f32(dst_ptr.add(idx), vbslq_f32(above, on_value, off_value));
        idx += 4;
    }
    idx
}
1353
/// SSE2 threshold kernel: `dst[i] = if src[i] > thresh { max_val } else { 0.0 }`
/// for as many leading elements as fit into whole 4-lane vectors. Returns the
/// number of elements written; the caller must handle the scalar tail.
/// Because the "else" value is 0.0, the select collapses to a single AND with
/// the compare mask.
///
/// # Safety
/// SSE2 must be available and `src`/`dst` must each hold at least `total`
/// elements.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
#[allow(unsafe_op_in_unsafe_fn)]
unsafe fn threshold_f32_sse(
    src: &[f32],
    dst: &mut [f32],
    total: usize,
    thresh: f32,
    max_val: f32,
) -> usize {
    use std::arch::x86_64::*;
    let limit = _mm_set1_ps(thresh);
    let on_value = _mm_set1_ps(max_val);
    let src_ptr = src.as_ptr();
    let dst_ptr = dst.as_mut_ptr();
    let mut idx = 0usize;
    while idx + 4 <= total {
        let pixels = _mm_loadu_ps(src_ptr.add(idx));
        // cmpgt yields all-ones lanes where pixels > thresh; ANDing keeps
        // max_val in those lanes and zeroes the rest.
        let above = _mm_cmpgt_ps(pixels, limit);
        _mm_storeu_ps(dst_ptr.add(idx), _mm_and_ps(above, on_value));
        idx += 4;
    }
    idx
}