// yscv_imgproc/ops/f32_ops.rs
1//! f32 image operations — direct slice processing, zero Tensor overhead.
2#![allow(unsafe_code)]
3
4#[cfg(target_os = "macos")]
5use super::u8ops::gcd;
6use super::u8ops::{ImageF32, RAYON_THRESHOLD};
7use rayon::prelude::*;
8
9// ============================================================================
10// f32 image operations — direct slice processing, zero Tensor overhead
11// ============================================================================
12
13// ── Grayscale f32 ──────────────────────────────────────────────────────────
14
15/// Converts RGB f32 image `[H,W,3]` to grayscale `[H,W,1]`.
16/// Uses BT.601: gray = 0.299*R + 0.587*G + 0.114*B.
17#[allow(clippy::uninit_vec)]
18pub fn grayscale_f32(input: &ImageF32) -> Option<ImageF32> {
19    if input.channels() != 3 {
20        return None;
21    }
22    let (h, w) = (input.height(), input.width());
23    let src = input.data();
24    let total = h * w;
25
26    // SAFETY: every element written by SIMD + scalar tail + GCD chunks.
27    let mut out = Vec::with_capacity(total);
28    #[allow(unsafe_code)]
29    unsafe {
30        out.set_len(total);
31    }
32
33    // GCD parallel in chunks (bandwidth-bound — coarse chunks reduce dispatch overhead)
34    #[cfg(target_os = "macos")]
35    if total >= RAYON_THRESHOLD && !cfg!(miri) {
36        let n_chunks = 8usize.min(h);
37        let chunk_h = h.div_ceil(n_chunks);
38        let sp = src.as_ptr() as usize;
39        let dp = out.as_mut_ptr() as usize;
40        gcd::parallel_for(n_chunks, |chunk| {
41            let sp = sp as *const f32;
42            let dp = dp as *mut f32;
43            let y_start = chunk * chunk_h;
44            let y_end = ((chunk + 1) * chunk_h).min(h);
45            let n = (y_end - y_start) * w;
46            let src_off = y_start * w * 3;
47            let dst_off = y_start * w;
48            let chunk_src = unsafe { core::slice::from_raw_parts(sp.add(src_off), n * 3) };
49            let chunk_dst = unsafe { core::slice::from_raw_parts_mut(dp.add(dst_off), n) };
50            let done = grayscale_f32_simd(chunk_src, chunk_dst, n);
51            for i in done..n {
52                let base = i * 3;
53                chunk_dst[i] = 0.299 * chunk_src[base]
54                    + 0.587 * chunk_src[base + 1]
55                    + 0.114 * chunk_src[base + 2];
56            }
57        });
58        return ImageF32::new(out, h, w, 1);
59    }
60
61    // Flat single-threaded
62    let done = grayscale_f32_simd(src, &mut out, total);
63    for i in done..total {
64        let base = i * 3;
65        out[i] = 0.299 * src[base] + 0.587 * src[base + 1] + 0.114 * src[base + 2];
66    }
67
68    ImageF32::new(out, h, w, 1)
69}
70
71fn grayscale_f32_simd(src: &[f32], dst: &mut [f32], total: usize) -> usize {
72    if cfg!(miri) || total < 4 {
73        return 0;
74    }
75
76    #[cfg(target_arch = "aarch64")]
77    {
78        if std::arch::is_aarch64_feature_detected!("neon") {
79            return unsafe { grayscale_f32_neon(src, dst, total) };
80        }
81    }
82    #[cfg(target_arch = "x86_64")]
83    {
84        if is_x86_feature_detected!("sse2") {
85            return unsafe { grayscale_f32_sse(src, dst, total) };
86        }
87    }
88    0
89}
90
/// NEON inner kernel for BT.601 grayscale: 4 pixels per iteration.
///
/// Returns the number of pixels written (a multiple of 4, `<= total`); the
/// caller converts the remaining tail pixels with scalar code.
///
/// # Safety
/// Caller must ensure NEON is available, `src` holds at least `total * 3`
/// floats, and `dst` holds at least `total` floats.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(unsafe_op_in_unsafe_fn)]
unsafe fn grayscale_f32_neon(src: &[f32], dst: &mut [f32], total: usize) -> usize {
    use std::arch::aarch64::*;
    // BT.601 luma weights, splatted across all four lanes.
    let coeff_r = vdupq_n_f32(0.299);
    let coeff_g = vdupq_n_f32(0.587);
    let coeff_b = vdupq_n_f32(0.114);
    let sp = src.as_ptr();
    let dp = dst.as_mut_ptr();
    let mut i = 0usize;
    while i + 4 <= total {
        // vld3q deinterleaves 12 floats into separate R/G/B vectors.
        let rgb = vld3q_f32(sp.add(i * 3));
        // gray = R*0.299 + G*0.587 + B*0.114, via fused multiply-adds.
        let gray = vfmaq_f32(
            vfmaq_f32(vmulq_f32(rgb.0, coeff_r), rgb.1, coeff_g),
            rgb.2,
            coeff_b,
        );
        vst1q_f32(dp.add(i), gray);
        i += 4;
    }
    i
}
114
/// SSE2 inner kernel for BT.601 grayscale: 4 pixels per iteration.
///
/// SSE2 has no 3-way deinterleaving load, so three unaligned loads are
/// shuffled into separate R/G/B vectors by hand. (This rewrite removes the
/// dead `t0`/`_r` shuffles — abandoned scratch work that computed values
/// which were never used.)
///
/// Returns the number of pixels written (a multiple of 4, `<= total`); the
/// caller converts the remaining tail pixels with scalar code.
///
/// # Safety
/// Caller must ensure SSE2 is available, `src` holds at least `total * 3`
/// floats, and `dst` holds at least `total` floats.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
#[allow(unsafe_op_in_unsafe_fn)]
unsafe fn grayscale_f32_sse(src: &[f32], dst: &mut [f32], total: usize) -> usize {
    use std::arch::x86_64::*;
    let coeff_r = _mm_set1_ps(0.299);
    let coeff_g = _mm_set1_ps(0.587);
    let coeff_b = _mm_set1_ps(0.114);
    let sp = src.as_ptr();
    let dp = dst.as_mut_ptr();
    let mut i = 0usize;
    // Process 4 pixels = 12 interleaved floats per iteration.
    while i + 4 <= total {
        let v0 = _mm_loadu_ps(sp.add(i * 3)); // [R0, G0, B0, R1]
        let v1 = _mm_loadu_ps(sp.add(i * 3 + 4)); // [G1, B1, R2, G2]
        let v2 = _mm_loadu_ps(sp.add(i * 3 + 8)); // [B2, R3, G3, B3]

        // Deinterleave lanes:
        // R0=v0[0], R1=v0[3], R2=v1[2], R3=v2[1]
        // G0=v0[1], G1=v1[0], G2=v1[3], G3=v2[2]
        // B0=v0[2], B1=v1[1], B2=v2[0], B3=v2[3]
        let r0r1 = _mm_shuffle_ps(v0, v0, 0b11_11_00_00); // [R0,R0,R1,R1]
        let r2r3 = _mm_shuffle_ps(v1, v2, 0b01_01_10_10); // [R2,R2,R3,R3]
        let r = _mm_shuffle_ps(r0r1, r2r3, 0b10_00_10_00); // [R0,R1,R2,R3]

        let g0g1 = _mm_shuffle_ps(v0, v1, 0b00_00_01_01); // [G0,G0,G1,G1]
        let g2g3 = _mm_shuffle_ps(v1, v2, 0b10_10_11_11); // [G2,G2,G3,G3]
        let g = _mm_shuffle_ps(g0g1, g2g3, 0b10_00_10_00); // [G0,G1,G2,G3]

        let b0b1 = _mm_shuffle_ps(v0, v1, 0b01_01_10_10); // [B0,B0,B1,B1]
        let b2b3 = _mm_shuffle_ps(v2, v2, 0b11_11_00_00); // [B2,B2,B3,B3]
        let b = _mm_shuffle_ps(b0b1, b2b3, 0b10_00_10_00); // [B0,B1,B2,B3]

        // gray = 0.299*R + 0.587*G + 0.114*B
        let gray = _mm_add_ps(
            _mm_add_ps(_mm_mul_ps(r, coeff_r), _mm_mul_ps(g, coeff_g)),
            _mm_mul_ps(b, coeff_b),
        );
        _mm_storeu_ps(dp.add(i), gray);
        i += 4;
    }
    i
}
162
163// ── Gaussian blur 3x3 f32 ─────────────────────────────────────────────────
164
165/// 3x3 Gaussian blur on single-channel f32 image.
166/// Direct `[1,2,1]`x`[1,2,1]`/16 kernel. Replicate border.
167pub fn gaussian_blur_3x3_f32(input: &ImageF32) -> Option<ImageF32> {
168    if input.channels() != 1 {
169        return None;
170    }
171    let (h, w) = (input.height(), input.width());
172    if h < 3 || w < 3 {
173        return Some(input.clone());
174    }
175    let src = input.data();
176
177    let mut out = vec![0.0f32; h * w];
178
179    #[cfg(target_arch = "aarch64")]
180    if !cfg!(miri) && std::arch::is_aarch64_feature_detected!("neon") {
181        unsafe {
182            gauss_3x3_direct_f32_neon(src, &mut out, h, w);
183        }
184        return ImageF32::new(out, h, w, 1);
185    }
186
187    #[cfg(target_arch = "x86_64")]
188    if !cfg!(miri) && is_x86_feature_detected!("sse2") {
189        unsafe {
190            gauss_3x3_direct_f32_sse(src, &mut out, h, w);
191        }
192        return ImageF32::new(out, h, w, 1);
193    }
194
195    // Scalar fallback
196    for y in 0..h {
197        let ay = if y > 0 { y - 1 } else { 0 };
198        let by = if y + 1 < h { y + 1 } else { h - 1 };
199        let top = &src[ay * w..(ay + 1) * w];
200        let mid = &src[y * w..(y + 1) * w];
201        let bot = &src[by * w..(by + 1) * w];
202        let dst = &mut out[y * w..(y + 1) * w];
203        for x in 0..w {
204            let lx = if x > 0 { x - 1 } else { 0 };
205            let rx = if x + 1 < w { x + 1 } else { w - 1 };
206            let v = top[lx]
207                + 2.0 * top[x]
208                + top[rx]
209                + 2.0 * (mid[lx] + 2.0 * mid[x] + mid[rx])
210                + bot[lx]
211                + 2.0 * bot[x]
212                + bot[rx];
213            dst[x] = v / 16.0;
214        }
215    }
216    ImageF32::new(out, h, w, 1)
217}
218
/// NEON 3x3 Gaussian (`[1,2,1]` x `[1,2,1]` / 16) for one output row.
///
/// `top`/`mid`/`bot` are the three source rows (the caller passes duplicated
/// pointers at the image edges for the vertical replicate border); the
/// horizontal replicate border is produced here by splatting the row's first
/// and last elements.
///
/// Fix over the previous version: the main loop bounded by `x < w - 4` issued
/// a look-ahead load at `x+4 .. x+8` and a 4-wide tail store at `x`, both of
/// which ran past the end of the row whenever `w` was not a multiple of 4
/// (and the initial `vld1q` read out of bounds for `w < 4`). The loop now
/// stops while 8 elements remain, the vector tail runs only when exactly one
/// full vector is left, and everything else is finished with scalar code.
/// Results are bit-identical to the old code for `w % 4 == 0`.
///
/// # Safety
/// Caller must ensure NEON is available and all four row pointers are valid
/// for `w` floats.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(unsafe_op_in_unsafe_fn)]
unsafe fn gauss_row_f32_neon(
    top: *const f32,
    mid: *const f32,
    bot: *const f32,
    dst: *mut f32,
    w: usize,
) {
    use std::arch::aarch64::*;

    // Rows narrower than one vector: pure scalar with replicate border.
    if w < 4 {
        for xx in 0..w {
            let lx = if xx > 0 { xx - 1 } else { 0 };
            let rx = if xx + 1 < w { xx + 1 } else { w - 1 };
            let v = *top.add(lx)
                + 2.0 * *top.add(xx)
                + *top.add(rx)
                + 2.0 * (*mid.add(lx) + 2.0 * *mid.add(xx) + *mid.add(rx))
                + *bot.add(lx)
                + 2.0 * *bot.add(xx)
                + *bot.add(rx);
            *dst.add(xx) = v / 16.0;
        }
        return;
    }

    let inv16 = vdupq_n_f32(1.0 / 16.0);
    let two = vdupq_n_f32(2.0);

    // Left border: splat element 0 so vextq<3> yields replicate windows.
    let border_t = vdupq_n_f32(*top);
    let border_m = vdupq_n_f32(*mid);
    let border_b = vdupq_n_f32(*bot);
    let mut prev_t = border_t;
    let mut prev_m = border_m;
    let mut prev_b = border_b;
    let mut cur_t = vld1q_f32(top);
    let mut cur_m = vld1q_f32(mid);
    let mut cur_b = vld1q_f32(bot);

    let mut x = 0usize;

    // Main loop: the look-ahead load covers x+4 .. x+8, so stop while a full
    // 8 elements remain in bounds.
    while x + 8 <= w {
        let nxt_t = vld1q_f32(top.add(x + 4));
        let nxt_m = vld1q_f32(mid.add(x + 4));
        let nxt_b = vld1q_f32(bot.add(x + 4));

        // Horizontal: left + 2*center + right for each of the three rows.
        let left_t = vextq_f32::<3>(prev_t, cur_t);
        let right_t = vextq_f32::<1>(cur_t, nxt_t);
        let ht = vfmaq_f32(vaddq_f32(left_t, right_t), cur_t, two);

        let left_m = vextq_f32::<3>(prev_m, cur_m);
        let right_m = vextq_f32::<1>(cur_m, nxt_m);
        let hm = vfmaq_f32(vaddq_f32(left_m, right_m), cur_m, two);

        let left_b = vextq_f32::<3>(prev_b, cur_b);
        let right_b = vextq_f32::<1>(cur_b, nxt_b);
        let hb = vfmaq_f32(vaddq_f32(left_b, right_b), cur_b, two);

        // Vertical: top + 2*mid + bot, then /16.
        let v = vfmaq_f32(vaddq_f32(ht, hb), hm, two);
        vst1q_f32(dst.add(x), vmulq_f32(v, inv16));

        prev_t = cur_t;
        cur_t = nxt_t;
        prev_m = cur_m;
        cur_m = nxt_m;
        prev_b = cur_b;
        cur_b = nxt_b;
        x += 4;
    }

    if x + 4 == w {
        // Exactly one full vector left (w is a multiple of 4): splat the last
        // element as the right replicate border.
        let border_t_end = vdupq_n_f32(*top.add(w - 1));
        let border_m_end = vdupq_n_f32(*mid.add(w - 1));
        let border_b_end = vdupq_n_f32(*bot.add(w - 1));

        let left_t = vextq_f32::<3>(prev_t, cur_t);
        let right_t = vextq_f32::<1>(cur_t, border_t_end);
        let ht = vfmaq_f32(vaddq_f32(left_t, right_t), cur_t, two);

        let left_m = vextq_f32::<3>(prev_m, cur_m);
        let right_m = vextq_f32::<1>(cur_m, border_m_end);
        let hm = vfmaq_f32(vaddq_f32(left_m, right_m), cur_m, two);

        let left_b = vextq_f32::<3>(prev_b, cur_b);
        let right_b = vextq_f32::<1>(cur_b, border_b_end);
        let hb = vfmaq_f32(vaddq_f32(left_b, right_b), cur_b, two);

        let v = vfmaq_f32(vaddq_f32(ht, hb), hm, two);
        vst1q_f32(dst.add(x), vmulq_f32(v, inv16));
    } else {
        // w % 4 != 0: finish the remaining 1..=7 pixels scalar-wise.
        for xx in x..w {
            let lx = if xx > 0 { xx - 1 } else { 0 };
            let rx = if xx + 1 < w { xx + 1 } else { w - 1 };
            let v = *top.add(lx)
                + 2.0 * *top.add(xx)
                + *top.add(rx)
                + 2.0 * (*mid.add(lx) + 2.0 * *mid.add(xx) + *mid.add(rx))
                + *bot.add(lx)
                + 2.0 * *bot.add(xx)
                + *bot.add(rx);
            *dst.add(xx) = v / 16.0;
        }
    }
}
299
/// Applies the 3x3 Gaussian to every row of the image (aarch64/NEON).
///
/// Row `y` reads rows `y-1`, `y`, `y+1` (clamped at the edges — replicate
/// border) and writes only row `y` of `out`, so rows are independent and
/// safe to process in parallel.
///
/// # Safety
/// Caller must ensure NEON is available and that `src` and `out` each hold
/// `h * w` floats.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(unsafe_op_in_unsafe_fn)]
unsafe fn gauss_3x3_direct_f32_neon(src: &[f32], out: &mut [f32], h: usize, w: usize) {
    let sp = src.as_ptr();
    let dp = out.as_mut_ptr();

    // macOS large images: one GCD task per row. Pointers are passed through
    // usize because raw pointers are not Send; each row writes a disjoint
    // slice of `out`.
    #[cfg(target_os = "macos")]
    if h * w >= RAYON_THRESHOLD {
        let sp = sp as usize;
        let dp = dp as usize;
        gcd::parallel_for(h, |y| {
            let sp = sp as *const f32;
            let dp = dp as *mut f32;
            // Replicate border: clamp neighbor rows at the image edges.
            let top = if y > 0 { sp.add((y - 1) * w) } else { sp };
            let mid = sp.add(y * w);
            let bot = if y + 1 < h {
                sp.add((y + 1) * w)
            } else {
                sp.add((h - 1) * w)
            };
            gauss_row_f32_neon(top, mid, bot, dp.add(y * w), w);
        });
        return;
    }

    // Non-macOS large images: same row-parallel scheme via rayon. (On macOS
    // the GCD branch above already returned for this size.)
    if h * w >= RAYON_THRESHOLD {
        let sp = sp as usize;
        let dp = dp as usize;
        (0..h).into_par_iter().for_each(|y| {
            let sp = sp as *const f32;
            let dp = dp as *mut f32;
            let top = if y > 0 { sp.add((y - 1) * w) } else { sp };
            let mid = sp.add(y * w);
            let bot = if y + 1 < h {
                sp.add((y + 1) * w)
            } else {
                sp.add((h - 1) * w)
            };
            gauss_row_f32_neon(top, mid, bot, dp.add(y * w), w);
        });
        return;
    }

    // Small images: sequential — first row (top clamped to row 0), interior
    // rows, then last row (bottom clamped to row h-1).
    gauss_row_f32_neon(sp, sp, if h > 1 { sp.add(w) } else { sp }, dp, w);
    for y in 1..h.saturating_sub(1) {
        gauss_row_f32_neon(
            sp.add((y - 1) * w),
            sp.add(y * w),
            sp.add((y + 1) * w),
            dp.add(y * w),
            w,
        );
    }
    if h > 1 {
        gauss_row_f32_neon(
            sp.add((h - 2) * w),
            sp.add((h - 1) * w),
            sp.add((h - 1) * w),
            dp.add((h - 1) * w),
            w,
        );
    }
}
364
/// SSE2 3x3 Gaussian (`[1,2,1]` x `[1,2,1]` / 16) for one output row.
///
/// Interior pixels are done 4 at a time with unaligned loads at `x-1`, `x`,
/// `x+1`; border and leftover pixels are done scalar with replicate border.
///
/// Fix over the previous version: the SIMD loop starts at `x = 1`, but the
/// old left-border patch-up ran only `if border_start == 1`, which can never
/// hold once the loop has advanced `x` (for `w >= 6` the loop always runs and
/// leaves `x >= 5`) — so `dst[0]` was never written on the SIMD path. Pixel 0
/// is now handled unconditionally whenever the SIMD path skipped it.
///
/// # Safety
/// Caller must ensure SSE2 is available and all four row pointers are valid
/// for `w` floats.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
#[allow(unsafe_op_in_unsafe_fn)]
unsafe fn gauss_row_f32_sse(
    top: *const f32,
    mid: *const f32,
    bot: *const f32,
    dst: *mut f32,
    w: usize,
) {
    use std::arch::x86_64::*;
    let inv16 = _mm_set1_ps(1.0 / 16.0);
    let two = _mm_set1_ps(2.0);

    let mut x = 0usize;
    // SIMD over interior pixels; the x+1 load reaches index x+4, so x stays
    // within [1, w-5].
    if w >= 6 {
        x = 1;
        while x + 4 <= w.saturating_sub(1) {
            let t_l = _mm_loadu_ps(top.add(x - 1));
            let t_c = _mm_loadu_ps(top.add(x));
            let t_r = _mm_loadu_ps(top.add(x + 1));
            let ht = _mm_add_ps(_mm_add_ps(t_l, t_r), _mm_mul_ps(t_c, two));

            let m_l = _mm_loadu_ps(mid.add(x - 1));
            let m_c = _mm_loadu_ps(mid.add(x));
            let m_r = _mm_loadu_ps(mid.add(x + 1));
            let hm = _mm_add_ps(_mm_add_ps(m_l, m_r), _mm_mul_ps(m_c, two));

            let b_l = _mm_loadu_ps(bot.add(x - 1));
            let b_c = _mm_loadu_ps(bot.add(x));
            let b_r = _mm_loadu_ps(bot.add(x + 1));
            let hb = _mm_add_ps(_mm_add_ps(b_l, b_r), _mm_mul_ps(b_c, two));

            let v = _mm_add_ps(_mm_add_ps(ht, hb), _mm_mul_ps(hm, two));
            _mm_storeu_ps(dst.add(x), _mm_mul_ps(v, inv16));
            x += 4;
        }
    }
    // Scalar for the remaining right-side pixels (or the whole row if w < 6).
    for xx in x..w {
        let lx = if xx > 0 { xx - 1 } else { 0 };
        let rx = if xx + 1 < w { xx + 1 } else { w - 1 };
        let v = *top.add(lx)
            + 2.0 * *top.add(xx)
            + *top.add(rx)
            + 2.0 * (*mid.add(lx) + 2.0 * *mid.add(xx) + *mid.add(rx))
            + *bot.add(lx)
            + 2.0 * *bot.add(xx)
            + *bot.add(rx);
        *dst.add(xx) = v / 16.0;
    }
    // When the SIMD path was taken it started at x = 1, so compute the left
    // border pixel (lx clamped to 0) here.
    if x > 0 {
        let rx = 1usize.min(w - 1);
        let v = *top.add(0)
            + 2.0 * *top.add(0)
            + *top.add(rx)
            + 2.0 * (*mid.add(0) + 2.0 * *mid.add(0) + *mid.add(rx))
            + *bot.add(0)
            + 2.0 * *bot.add(0)
            + *bot.add(rx);
        *dst.add(0) = v / 16.0;
    }
}
433
/// Applies the 3x3 Gaussian to every row of the image (x86_64/SSE2).
///
/// Row `y` reads rows `y-1`, `y`, `y+1` (clamped at the edges — replicate
/// border) and writes only row `y` of `out`, so rows are independent and
/// safe to process in parallel.
///
/// # Safety
/// Caller must ensure SSE2 is available and that `src` and `out` each hold
/// `h * w` floats.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
#[allow(unsafe_op_in_unsafe_fn)]
unsafe fn gauss_3x3_direct_f32_sse(src: &[f32], out: &mut [f32], h: usize, w: usize) {
    let sp = src.as_ptr();
    let dp = out.as_mut_ptr();

    // Large images: row-parallel via rayon. Pointers cross the closure as
    // usize because raw pointers are not Send; each row writes a disjoint
    // slice of `out`.
    if h * w >= RAYON_THRESHOLD {
        let sp = sp as usize;
        let dp = dp as usize;
        (0..h).into_par_iter().for_each(|y| {
            let sp = sp as *const f32;
            let dp = dp as *mut f32;
            // Replicate border: clamp neighbor rows at the image edges.
            let top = if y > 0 { sp.add((y - 1) * w) } else { sp };
            let mid = sp.add(y * w);
            let bot = if y + 1 < h {
                sp.add((y + 1) * w)
            } else {
                sp.add((h - 1) * w)
            };
            gauss_row_f32_sse(top, mid, bot, dp.add(y * w), w);
        });
        return;
    }

    // Small images: sequential — first row (top clamped to row 0), interior
    // rows, then last row (bottom clamped to row h-1).
    gauss_row_f32_sse(sp, sp, if h > 1 { sp.add(w) } else { sp }, dp, w);
    for y in 1..h.saturating_sub(1) {
        gauss_row_f32_sse(
            sp.add((y - 1) * w),
            sp.add(y * w),
            sp.add((y + 1) * w),
            dp.add(y * w),
            w,
        );
    }
    if h > 1 {
        gauss_row_f32_sse(
            sp.add((h - 2) * w),
            sp.add((h - 1) * w),
            sp.add((h - 1) * w),
            dp.add((h - 1) * w),
            w,
        );
    }
}
479
480// ── Box blur 3x3 f32 ──────────────────────────────────────────────────────
481
482/// 3x3 box blur on single-channel f32 image.
483/// Direct `[1,1,1]`x`[1,1,1]`/9 kernel. Replicate border.
484pub fn box_blur_3x3_f32(input: &ImageF32) -> Option<ImageF32> {
485    if input.channels() != 1 {
486        return None;
487    }
488    let (h, w) = (input.height(), input.width());
489    if h < 3 || w < 3 {
490        return Some(input.clone());
491    }
492    let src = input.data();
493
494    let mut out = vec![0.0f32; h * w];
495
496    #[cfg(target_arch = "aarch64")]
497    if !cfg!(miri) && std::arch::is_aarch64_feature_detected!("neon") {
498        unsafe {
499            box_3x3_direct_f32_neon(src, &mut out, h, w);
500        }
501        return ImageF32::new(out, h, w, 1);
502    }
503
504    #[cfg(target_arch = "x86_64")]
505    if !cfg!(miri) && is_x86_feature_detected!("sse2") {
506        unsafe {
507            box_3x3_direct_f32_sse(src, &mut out, h, w);
508        }
509        return ImageF32::new(out, h, w, 1);
510    }
511
512    // Scalar fallback
513    let inv9 = 1.0f32 / 9.0;
514    for y in 0..h {
515        let ay = if y > 0 { y - 1 } else { 0 };
516        let by = if y + 1 < h { y + 1 } else { h - 1 };
517        let top = &src[ay * w..(ay + 1) * w];
518        let mid = &src[y * w..(y + 1) * w];
519        let bot = &src[by * w..(by + 1) * w];
520        let dst = &mut out[y * w..(y + 1) * w];
521        for x in 0..w {
522            let lx = if x > 0 { x - 1 } else { 0 };
523            let rx = if x + 1 < w { x + 1 } else { w - 1 };
524            let v = top[lx]
525                + top[x]
526                + top[rx]
527                + mid[lx]
528                + mid[x]
529                + mid[rx]
530                + bot[lx]
531                + bot[x]
532                + bot[rx];
533            dst[x] = v * inv9;
534        }
535    }
536    ImageF32::new(out, h, w, 1)
537}
538
/// NEON 3x3 box blur (`[1,1,1]` x `[1,1,1]` / 9) for one output row.
///
/// `top`/`mid`/`bot` are the three source rows (the caller duplicates edge
/// row pointers for the vertical replicate border); the horizontal replicate
/// border splats the row's first and last elements.
///
/// Fix over the previous version: the main loop bounded by `x < w - 4` issued
/// a look-ahead load at `x+4 .. x+8` and a 4-wide tail store at `x`, both of
/// which ran past the row end whenever `w` was not a multiple of 4 (and the
/// initial `vld1q` read out of bounds for `w < 4`). The loop now stops while
/// 8 elements remain, the vector tail runs only when exactly one full vector
/// is left, and the rest is scalar. Bit-identical for `w % 4 == 0`.
///
/// # Safety
/// Caller must ensure NEON is available and all four row pointers are valid
/// for `w` floats.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(unsafe_op_in_unsafe_fn)]
unsafe fn box_row_f32_neon(
    top: *const f32,
    mid: *const f32,
    bot: *const f32,
    dst: *mut f32,
    w: usize,
) {
    use std::arch::aarch64::*;

    // Rows narrower than one vector: pure scalar with replicate border.
    if w < 4 {
        for xx in 0..w {
            let lx = if xx > 0 { xx - 1 } else { 0 };
            let rx = if xx + 1 < w { xx + 1 } else { w - 1 };
            let v = *top.add(lx)
                + *top.add(xx)
                + *top.add(rx)
                + *mid.add(lx)
                + *mid.add(xx)
                + *mid.add(rx)
                + *bot.add(lx)
                + *bot.add(xx)
                + *bot.add(rx);
            *dst.add(xx) = v / 9.0;
        }
        return;
    }

    let inv9 = vdupq_n_f32(1.0 / 9.0);

    // Left border: splat element 0 so vextq<3> yields replicate windows.
    let border_t = vdupq_n_f32(*top);
    let border_m = vdupq_n_f32(*mid);
    let border_b = vdupq_n_f32(*bot);
    let mut prev_t = border_t;
    let mut prev_m = border_m;
    let mut prev_b = border_b;
    let mut cur_t = vld1q_f32(top);
    let mut cur_m = vld1q_f32(mid);
    let mut cur_b = vld1q_f32(bot);

    let mut x = 0usize;

    // Main loop: the look-ahead load covers x+4 .. x+8, so stop while a full
    // 8 elements remain in bounds.
    while x + 8 <= w {
        let nxt_t = vld1q_f32(top.add(x + 4));
        let nxt_m = vld1q_f32(mid.add(x + 4));
        let nxt_b = vld1q_f32(bot.add(x + 4));

        // Horizontal sums: left + center + right per row.
        let ht = vaddq_f32(
            vaddq_f32(vextq_f32::<3>(prev_t, cur_t), cur_t),
            vextq_f32::<1>(cur_t, nxt_t),
        );
        let hm = vaddq_f32(
            vaddq_f32(vextq_f32::<3>(prev_m, cur_m), cur_m),
            vextq_f32::<1>(cur_m, nxt_m),
        );
        let hb = vaddq_f32(
            vaddq_f32(vextq_f32::<3>(prev_b, cur_b), cur_b),
            vextq_f32::<1>(cur_b, nxt_b),
        );

        // Vertical sum, then /9.
        let v = vaddq_f32(vaddq_f32(ht, hm), hb);
        vst1q_f32(dst.add(x), vmulq_f32(v, inv9));

        prev_t = cur_t;
        cur_t = nxt_t;
        prev_m = cur_m;
        cur_m = nxt_m;
        prev_b = cur_b;
        cur_b = nxt_b;
        x += 4;
    }

    if x + 4 == w {
        // Exactly one full vector left (w is a multiple of 4): splat the last
        // element as the right replicate border.
        let border_t_end = vdupq_n_f32(*top.add(w - 1));
        let border_m_end = vdupq_n_f32(*mid.add(w - 1));
        let border_b_end = vdupq_n_f32(*bot.add(w - 1));

        let ht = vaddq_f32(
            vaddq_f32(vextq_f32::<3>(prev_t, cur_t), cur_t),
            vextq_f32::<1>(cur_t, border_t_end),
        );
        let hm = vaddq_f32(
            vaddq_f32(vextq_f32::<3>(prev_m, cur_m), cur_m),
            vextq_f32::<1>(cur_m, border_m_end),
        );
        let hb = vaddq_f32(
            vaddq_f32(vextq_f32::<3>(prev_b, cur_b), cur_b),
            vextq_f32::<1>(cur_b, border_b_end),
        );

        let v = vaddq_f32(vaddq_f32(ht, hm), hb);
        vst1q_f32(dst.add(x), vmulq_f32(v, inv9));
    } else {
        // w % 4 != 0: finish the remaining 1..=7 pixels scalar-wise.
        for xx in x..w {
            let lx = if xx > 0 { xx - 1 } else { 0 };
            let rx = if xx + 1 < w { xx + 1 } else { w - 1 };
            let v = *top.add(lx)
                + *top.add(xx)
                + *top.add(rx)
                + *mid.add(lx)
                + *mid.add(xx)
                + *mid.add(rx)
                + *bot.add(lx)
                + *bot.add(xx)
                + *bot.add(rx);
            *dst.add(xx) = v / 9.0;
        }
    }
}
616
/// Applies the 3x3 box blur to every row of the image (aarch64/NEON).
///
/// Row `y` reads rows `y-1`, `y`, `y+1` (clamped at the edges — replicate
/// border) and writes only row `y` of `out`, so rows are independent and
/// safe to process in parallel.
///
/// # Safety
/// Caller must ensure NEON is available and that `src` and `out` each hold
/// `h * w` floats.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(unsafe_op_in_unsafe_fn)]
unsafe fn box_3x3_direct_f32_neon(src: &[f32], out: &mut [f32], h: usize, w: usize) {
    let sp = src.as_ptr();
    let dp = out.as_mut_ptr();

    // macOS large images: one GCD task per row. Pointers cross the closure
    // as usize because raw pointers are not Send; each row writes a disjoint
    // slice of `out`.
    #[cfg(target_os = "macos")]
    if h * w >= RAYON_THRESHOLD {
        let sp = sp as usize;
        let dp = dp as usize;
        gcd::parallel_for(h, |y| {
            let sp = sp as *const f32;
            let dp = dp as *mut f32;
            // Replicate border: clamp neighbor rows at the image edges.
            let top = if y > 0 { sp.add((y - 1) * w) } else { sp };
            let mid = sp.add(y * w);
            let bot = if y + 1 < h {
                sp.add((y + 1) * w)
            } else {
                sp.add((h - 1) * w)
            };
            box_row_f32_neon(top, mid, bot, dp.add(y * w), w);
        });
        return;
    }

    // Non-macOS large images: same row-parallel scheme via rayon. (On macOS
    // the GCD branch above already returned for this size.)
    if h * w >= RAYON_THRESHOLD {
        let sp = sp as usize;
        let dp = dp as usize;
        (0..h).into_par_iter().for_each(|y| {
            let sp = sp as *const f32;
            let dp = dp as *mut f32;
            let top = if y > 0 { sp.add((y - 1) * w) } else { sp };
            let mid = sp.add(y * w);
            let bot = if y + 1 < h {
                sp.add((y + 1) * w)
            } else {
                sp.add((h - 1) * w)
            };
            box_row_f32_neon(top, mid, bot, dp.add(y * w), w);
        });
        return;
    }

    // Small images: sequential — first row (top clamped to row 0), interior
    // rows, then last row (bottom clamped to row h-1).
    box_row_f32_neon(sp, sp, if h > 1 { sp.add(w) } else { sp }, dp, w);
    for y in 1..h.saturating_sub(1) {
        box_row_f32_neon(
            sp.add((y - 1) * w),
            sp.add(y * w),
            sp.add((y + 1) * w),
            dp.add(y * w),
            w,
        );
    }
    if h > 1 {
        box_row_f32_neon(
            sp.add((h - 2) * w),
            sp.add((h - 1) * w),
            sp.add((h - 1) * w),
            dp.add((h - 1) * w),
            w,
        );
    }
}
681
/// SSE2 3x3 box blur (`[1,1,1]` x `[1,1,1]` / 9) for one output row.
///
/// Interior pixels are done 4 at a time with unaligned loads at `x-1`, `x`,
/// `x+1`; border and leftover pixels are done scalar with replicate border.
///
/// Fix over the previous version: the SIMD loop starts at `x = 1`, but the
/// old left-border patch-up ran only `if border_start == 1`, which can never
/// hold once the loop has advanced `x` (for `w >= 6` the loop always runs and
/// leaves `x >= 5`) — so `dst[0]` was never written on the SIMD path. Pixel 0
/// is now handled unconditionally whenever the SIMD path skipped it.
///
/// # Safety
/// Caller must ensure SSE2 is available and all four row pointers are valid
/// for `w` floats.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
#[allow(unsafe_op_in_unsafe_fn)]
unsafe fn box_row_f32_sse(
    top: *const f32,
    mid: *const f32,
    bot: *const f32,
    dst: *mut f32,
    w: usize,
) {
    use std::arch::x86_64::*;
    let inv9 = _mm_set1_ps(1.0 / 9.0);

    let mut x = 0usize;
    // SIMD over interior pixels; the x+1 load reaches index x+4, so x stays
    // within [1, w-5].
    if w >= 6 {
        x = 1;
        while x + 4 <= w.saturating_sub(1) {
            let ht = _mm_add_ps(
                _mm_add_ps(_mm_loadu_ps(top.add(x - 1)), _mm_loadu_ps(top.add(x))),
                _mm_loadu_ps(top.add(x + 1)),
            );
            let hm = _mm_add_ps(
                _mm_add_ps(_mm_loadu_ps(mid.add(x - 1)), _mm_loadu_ps(mid.add(x))),
                _mm_loadu_ps(mid.add(x + 1)),
            );
            let hb = _mm_add_ps(
                _mm_add_ps(_mm_loadu_ps(bot.add(x - 1)), _mm_loadu_ps(bot.add(x))),
                _mm_loadu_ps(bot.add(x + 1)),
            );
            let v = _mm_add_ps(_mm_add_ps(ht, hm), hb);
            _mm_storeu_ps(dst.add(x), _mm_mul_ps(v, inv9));
            x += 4;
        }
    }
    // Scalar for the remaining right-side pixels (or the whole row if w < 6).
    for xx in x..w {
        let lx = if xx > 0 { xx - 1 } else { 0 };
        let rx = if xx + 1 < w { xx + 1 } else { w - 1 };
        let v = *top.add(lx)
            + *top.add(xx)
            + *top.add(rx)
            + *mid.add(lx)
            + *mid.add(xx)
            + *mid.add(rx)
            + *bot.add(lx)
            + *bot.add(xx)
            + *bot.add(rx);
        *dst.add(xx) = v / 9.0;
    }
    // When the SIMD path was taken it started at x = 1, so compute the left
    // border pixel (lx clamped to 0) here.
    if x > 0 {
        let rx = 1usize.min(w - 1);
        let v = *top.add(0)
            + *top.add(0)
            + *top.add(rx)
            + *mid.add(0)
            + *mid.add(0)
            + *mid.add(rx)
            + *bot.add(0)
            + *bot.add(0)
            + *bot.add(rx);
        *dst.add(0) = v / 9.0;
    }
}
746
/// Applies the 3x3 box blur to every row of the image (x86_64/SSE2).
///
/// Row `y` reads rows `y-1`, `y`, `y+1` (clamped at the edges — replicate
/// border) and writes only row `y` of `out`, so rows are independent and
/// safe to process in parallel.
///
/// # Safety
/// Caller must ensure SSE2 is available and that `src` and `out` each hold
/// `h * w` floats.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
#[allow(unsafe_op_in_unsafe_fn)]
unsafe fn box_3x3_direct_f32_sse(src: &[f32], out: &mut [f32], h: usize, w: usize) {
    let sp = src.as_ptr();
    let dp = out.as_mut_ptr();

    // Large images: row-parallel via rayon. Pointers cross the closure as
    // usize because raw pointers are not Send; each row writes a disjoint
    // slice of `out`.
    if h * w >= RAYON_THRESHOLD {
        let sp = sp as usize;
        let dp = dp as usize;
        (0..h).into_par_iter().for_each(|y| {
            let sp = sp as *const f32;
            let dp = dp as *mut f32;
            // Replicate border: clamp neighbor rows at the image edges.
            let top = if y > 0 { sp.add((y - 1) * w) } else { sp };
            let mid = sp.add(y * w);
            let bot = if y + 1 < h {
                sp.add((y + 1) * w)
            } else {
                sp.add((h - 1) * w)
            };
            box_row_f32_sse(top, mid, bot, dp.add(y * w), w);
        });
        return;
    }

    // Small images: sequential — first row (top clamped to row 0), interior
    // rows, then last row (bottom clamped to row h-1).
    box_row_f32_sse(sp, sp, if h > 1 { sp.add(w) } else { sp }, dp, w);
    for y in 1..h.saturating_sub(1) {
        box_row_f32_sse(
            sp.add((y - 1) * w),
            sp.add(y * w),
            sp.add((y + 1) * w),
            dp.add(y * w),
            w,
        );
    }
    if h > 1 {
        box_row_f32_sse(
            sp.add((h - 2) * w),
            sp.add((h - 1) * w),
            sp.add((h - 1) * w),
            dp.add((h - 1) * w),
            w,
        );
    }
}
792
793// ── Dilate 3x3 f32 ────────────────────────────────────────────────────────
794
795/// 3x3 dilation (max) on single-channel f32 image.
796pub fn dilate_3x3_f32(input: &ImageF32) -> Option<ImageF32> {
797    if input.channels() != 1 {
798        return None;
799    }
800    let (h, w) = (input.height(), input.width());
801    if h < 3 || w < 3 {
802        return Some(input.clone());
803    }
804    let src = input.data();
805
806    let mut out = vec![0.0f32; h * w];
807
808    #[cfg(target_arch = "aarch64")]
809    if !cfg!(miri) && std::arch::is_aarch64_feature_detected!("neon") {
810        unsafe {
811            dilate_3x3_direct_f32_neon(src, &mut out, h, w);
812        }
813        return ImageF32::new(out, h, w, 1);
814    }
815
816    #[cfg(target_arch = "x86_64")]
817    if !cfg!(miri) && is_x86_feature_detected!("sse2") {
818        unsafe {
819            dilate_3x3_direct_f32_sse(src, &mut out, h, w);
820        }
821        return ImageF32::new(out, h, w, 1);
822    }
823
824    // Scalar fallback
825    let border = f32::NEG_INFINITY;
826    for y in 0..h {
827        let ay = if y > 0 { y - 1 } else { 0 };
828        let by = if y + 1 < h { y + 1 } else { h - 1 };
829        let top = &src[ay * w..(ay + 1) * w];
830        let mid = &src[y * w..(y + 1) * w];
831        let bot = &src[by * w..(by + 1) * w];
832        let dst = &mut out[y * w..(y + 1) * w];
833        for x in 0..w {
834            let lx = if x > 0 { x - 1 } else { 0 };
835            let rx = if x + 1 < w { x + 1 } else { w - 1 };
836            let mut mx = top[lx].max(top[x]).max(top[rx]);
837            mx = mx.max(mid[lx]).max(mid[x]).max(mid[rx]);
838            mx = mx.max(bot[lx]).max(bot[x]).max(bot[rx]);
839            dst[x] = mx;
840        }
841    }
842    let _ = border;
843    ImageF32::new(out, h, w, 1)
844}
845
/// NEON 3x3 dilation (max) for one output row.
///
/// Horizontal out-of-row neighbors use a -inf border; because the center
/// pixel always participates in the max, this yields the same result as a
/// replicate border.
///
/// Fix over the previous version: the main loop bounded by `x < w - 4` issued
/// a look-ahead load at `x+4 .. x+8` and a 4-wide tail store at `x`, both of
/// which ran past the row end whenever `w` was not a multiple of 4 (and the
/// initial `vld1q` read out of bounds for `w < 4`). The loop now stops while
/// 8 elements remain, the vector tail runs only when exactly one full vector
/// is left, and the rest is scalar. Identical results for `w % 4 == 0`.
///
/// # Safety
/// Caller must ensure NEON is available and all four row pointers are valid
/// for `w` floats.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(unsafe_op_in_unsafe_fn)]
unsafe fn dilate_row_f32_neon(
    top: *const f32,
    mid: *const f32,
    bot: *const f32,
    dst: *mut f32,
    w: usize,
) {
    use std::arch::aarch64::*;

    // Rows narrower than one vector: pure scalar with clamped (replicate)
    // neighbors — equivalent to -inf padding under max.
    if w < 4 {
        for xx in 0..w {
            let lx = if xx > 0 { xx - 1 } else { 0 };
            let rx = if xx + 1 < w { xx + 1 } else { w - 1 };
            let mut mx = (*top.add(lx)).max(*top.add(xx)).max(*top.add(rx));
            mx = mx.max(*mid.add(lx)).max(*mid.add(xx)).max(*mid.add(rx));
            mx = mx.max(*bot.add(lx)).max(*bot.add(xx)).max(*bot.add(rx));
            *dst.add(xx) = mx;
        }
        return;
    }

    // -inf is the identity element for max, so it serves as the border.
    let border_v = vdupq_n_f32(f32::NEG_INFINITY);
    let mut prev_t = border_v;
    let mut prev_m = border_v;
    let mut prev_b = border_v;
    let mut cur_t = vld1q_f32(top);
    let mut cur_m = vld1q_f32(mid);
    let mut cur_b = vld1q_f32(bot);

    let mut x = 0usize;
    // Main loop: the look-ahead load covers x+4 .. x+8, so stop while a full
    // 8 elements remain in bounds.
    while x + 8 <= w {
        let nxt_t = vld1q_f32(top.add(x + 4));
        let nxt_m = vld1q_f32(mid.add(x + 4));
        let nxt_b = vld1q_f32(bot.add(x + 4));

        // max(left, center, right) per row via vextq-shifted windows.
        let r0 = vmaxq_f32(
            vmaxq_f32(vextq_f32::<3>(prev_t, cur_t), cur_t),
            vextq_f32::<1>(cur_t, nxt_t),
        );
        let r1 = vmaxq_f32(
            vmaxq_f32(vextq_f32::<3>(prev_m, cur_m), cur_m),
            vextq_f32::<1>(cur_m, nxt_m),
        );
        let r2 = vmaxq_f32(
            vmaxq_f32(vextq_f32::<3>(prev_b, cur_b), cur_b),
            vextq_f32::<1>(cur_b, nxt_b),
        );
        vst1q_f32(dst.add(x), vmaxq_f32(vmaxq_f32(r0, r1), r2));

        prev_t = cur_t;
        cur_t = nxt_t;
        prev_m = cur_m;
        cur_m = nxt_m;
        prev_b = cur_b;
        cur_b = nxt_b;
        x += 4;
    }

    if x + 4 == w {
        // Exactly one full vector left (w is a multiple of 4): -inf right
        // border.
        let r0 = vmaxq_f32(
            vmaxq_f32(vextq_f32::<3>(prev_t, cur_t), cur_t),
            vextq_f32::<1>(cur_t, border_v),
        );
        let r1 = vmaxq_f32(
            vmaxq_f32(vextq_f32::<3>(prev_m, cur_m), cur_m),
            vextq_f32::<1>(cur_m, border_v),
        );
        let r2 = vmaxq_f32(
            vmaxq_f32(vextq_f32::<3>(prev_b, cur_b), cur_b),
            vextq_f32::<1>(cur_b, border_v),
        );
        vst1q_f32(dst.add(x), vmaxq_f32(vmaxq_f32(r0, r1), r2));
    } else {
        // w % 4 != 0: finish the remaining 1..=7 pixels scalar-wise.
        for xx in x..w {
            let lx = if xx > 0 { xx - 1 } else { 0 };
            let rx = if xx + 1 < w { xx + 1 } else { w - 1 };
            let mut mx = (*top.add(lx)).max(*top.add(xx)).max(*top.add(rx));
            mx = mx.max(*mid.add(lx)).max(*mid.add(xx)).max(*mid.add(rx));
            mx = mx.max(*bot.add(lx)).max(*bot.add(xx)).max(*bot.add(rx));
            *dst.add(xx) = mx;
        }
    }
}
911
/// Applies the 3x3 dilation to every row of the image (aarch64/NEON).
///
/// Row `y` reads rows `y-1`, `y`, `y+1` (clamped at the edges — replicate
/// border vertically) and writes only row `y` of `out`, so rows are
/// independent and safe to process in parallel. Horizontally the row kernel
/// uses a -inf border, which is equivalent to replicate under max since the
/// center pixel always participates.
///
/// # Safety
/// Caller must ensure NEON is available and that `src` and `out` each hold
/// `h * w` floats.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(unsafe_op_in_unsafe_fn)]
unsafe fn dilate_3x3_direct_f32_neon(src: &[f32], out: &mut [f32], h: usize, w: usize) {
    let sp = src.as_ptr();
    let dp = out.as_mut_ptr();

    // macOS large images: one GCD task per row. Pointers cross the closure
    // as usize because raw pointers are not Send; each row writes a disjoint
    // slice of `out`.
    #[cfg(target_os = "macos")]
    if h * w >= RAYON_THRESHOLD {
        let sp = sp as usize;
        let dp = dp as usize;
        gcd::parallel_for(h, |y| {
            let sp = sp as *const f32;
            let dp = dp as *mut f32;
            // Replicate border: clamp neighbor rows at the image edges.
            let top = if y > 0 { sp.add((y - 1) * w) } else { sp };
            let mid = sp.add(y * w);
            let bot = if y + 1 < h {
                sp.add((y + 1) * w)
            } else {
                sp.add((h - 1) * w)
            };
            dilate_row_f32_neon(top, mid, bot, dp.add(y * w), w);
        });
        return;
    }

    // Non-macOS large images: same row-parallel scheme via rayon. (On macOS
    // the GCD branch above already returned for this size.)
    if h * w >= RAYON_THRESHOLD {
        let sp = sp as usize;
        let dp = dp as usize;
        (0..h).into_par_iter().for_each(|y| {
            let sp = sp as *const f32;
            let dp = dp as *mut f32;
            let top = if y > 0 { sp.add((y - 1) * w) } else { sp };
            let mid = sp.add(y * w);
            let bot = if y + 1 < h {
                sp.add((y + 1) * w)
            } else {
                sp.add((h - 1) * w)
            };
            dilate_row_f32_neon(top, mid, bot, dp.add(y * w), w);
        });
        return;
    }

    // Small images: sequential — first row (top clamped to row 0), interior
    // rows, then last row (bottom clamped to row h-1).
    dilate_row_f32_neon(sp, sp, if h > 1 { sp.add(w) } else { sp }, dp, w);
    for y in 1..h.saturating_sub(1) {
        dilate_row_f32_neon(
            sp.add((y - 1) * w),
            sp.add(y * w),
            sp.add((y + 1) * w),
            dp.add(y * w),
            w,
        );
    }
    if h > 1 {
        dilate_row_f32_neon(
            sp.add((h - 2) * w),
            sp.add((h - 1) * w),
            sp.add((h - 1) * w),
            dp.add((h - 1) * w),
            w,
        );
    }
}
979
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
#[allow(unsafe_op_in_unsafe_fn)]
/// 3x3 max-filter (dilate) of one output row from three source rows, SSE2.
///
/// Interior columns are processed 4 at a time; the remaining columns use
/// scalar code with replicate borders.
///
/// Bug fix: column 0 was never written when the SIMD path ran. The old
/// guard (`border_start == 1`) could only fire if the SIMD loop executed
/// zero iterations — impossible for `w >= 6`, where it always runs at least
/// once and leaves `x >= 5`. Column 0 is now filled explicitly whenever the
/// SIMD path skipped it.
///
/// # Safety
/// Caller must ensure SSE2 is available and that `top`, `mid`, `bot` and
/// `dst` each point to at least `w` f32 elements.
unsafe fn dilate_row_f32_sse(
    top: *const f32,
    mid: *const f32,
    bot: *const f32,
    dst: *mut f32,
    w: usize,
) {
    use std::arch::x86_64::*;
    let mut x = 0usize;
    if w >= 6 {
        // Start at column 1 so the x-1 loads stay in bounds; column 0 is
        // produced by the dedicated border code at the end.
        x = 1;
        while x + 4 <= w.saturating_sub(1) {
            let r0 = _mm_max_ps(
                _mm_max_ps(_mm_loadu_ps(top.add(x - 1)), _mm_loadu_ps(top.add(x))),
                _mm_loadu_ps(top.add(x + 1)),
            );
            let r1 = _mm_max_ps(
                _mm_max_ps(_mm_loadu_ps(mid.add(x - 1)), _mm_loadu_ps(mid.add(x))),
                _mm_loadu_ps(mid.add(x + 1)),
            );
            let r2 = _mm_max_ps(
                _mm_max_ps(_mm_loadu_ps(bot.add(x - 1)), _mm_loadu_ps(bot.add(x))),
                _mm_loadu_ps(bot.add(x + 1)),
            );
            _mm_storeu_ps(dst.add(x), _mm_max_ps(_mm_max_ps(r0, r1), r2));
            x += 4;
        }
    }
    // Scalar tail: every column the SIMD loop did not reach (replicate border).
    for xx in x..w {
        let lx = if xx > 0 { xx - 1 } else { 0 };
        let rx = if xx + 1 < w { xx + 1 } else { w - 1 };
        let mut mx = (*top.add(lx)).max(*top.add(xx)).max(*top.add(rx));
        mx = mx.max(*mid.add(lx)).max(*mid.add(xx)).max(*mid.add(rx));
        mx = mx.max(*bot.add(lx)).max(*bot.add(xx)).max(*bot.add(rx));
        *dst.add(xx) = mx;
    }
    // Column 0 is skipped whenever the SIMD path ran (x was bumped to 1 and
    // the loop advanced past it), so fill it here with the replicated left
    // border (left neighbour of column 0 is column 0 itself).
    if x > 0 {
        let rx = 1usize.min(w - 1);
        let mut mx = (*top.add(0)).max(*top.add(rx));
        mx = mx.max(*mid.add(0)).max(*mid.add(rx));
        mx = mx.max(*bot.add(0)).max(*bot.add(rx));
        *dst.add(0) = mx;
    }
}
1029
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
#[allow(unsafe_op_in_unsafe_fn)]
/// 3x3 dilation (max filter) over a single-channel f32 image, SSE2 path.
///
/// Each output row is computed by `dilate_row_f32_sse` from the source row
/// plus its vertical neighbours (first/last row replicated at the borders).
/// Large images are processed one row per rayon task; the raw pointers are
/// passed through the closure as `usize` — sound because each task writes a
/// disjoint output row.
///
/// # Safety
/// Caller must ensure SSE2 is available and that `src` and `out` each hold
/// `h * w` elements.
unsafe fn dilate_3x3_direct_f32_sse(src: &[f32], out: &mut [f32], h: usize, w: usize) {
    let sp = src.as_ptr();
    let dp = out.as_mut_ptr();

    if h * w >= RAYON_THRESHOLD {
        let sp = sp as usize;
        let dp = dp as usize;
        (0..h).into_par_iter().for_each(|y| {
            let sp = sp as *const f32;
            let dp = dp as *mut f32;
            // Replicate the first/last row at the vertical borders.
            let top = if y > 0 { sp.add((y - 1) * w) } else { sp };
            let mid = sp.add(y * w);
            let bot = if y + 1 < h {
                sp.add((y + 1) * w)
            } else {
                sp.add((h - 1) * w)
            };
            dilate_row_f32_sse(top, mid, bot, dp.add(y * w), w);
        });
        return;
    }

    // Serial path: first row (top replicated), interior rows, last row
    // (bottom replicated). A 1-row image is fully handled by the first call.
    dilate_row_f32_sse(sp, sp, if h > 1 { sp.add(w) } else { sp }, dp, w);
    for y in 1..h.saturating_sub(1) {
        dilate_row_f32_sse(
            sp.add((y - 1) * w),
            sp.add(y * w),
            sp.add((y + 1) * w),
            dp.add(y * w),
            w,
        );
    }
    if h > 1 {
        dilate_row_f32_sse(
            sp.add((h - 2) * w),
            sp.add((h - 1) * w),
            sp.add((h - 1) * w),
            dp.add((h - 1) * w),
            w,
        );
    }
}
1075
1076// ── Sobel 3x3 f32 ─────────────────────────────────────────────────────────
1077
1078/// 3x3 Sobel gradient magnitude on single-channel f32 image.
1079/// Returns `|Gx| + |Gy|` approximation. Border pixels are zero.
1080pub fn sobel_3x3_f32(input: &ImageF32) -> Option<ImageF32> {
1081    if input.channels() != 1 {
1082        return None;
1083    }
1084    let (h, w) = (input.height(), input.width());
1085    if h < 3 || w < 3 {
1086        return Some(ImageF32::zeros(h, w, 1));
1087    }
1088    let src = input.data();
1089
1090    let mut out = vec![0.0f32; h * w];
1091
1092    let use_rayon = h * w >= RAYON_THRESHOLD && !cfg!(miri);
1093    let interior_h = h - 2;
1094
1095    let process_row = |row0: &[f32], row1: &[f32], row2: &[f32], dst: &mut [f32]| {
1096        let done = sobel_f32_simd_row(row0, row1, row2, dst, w);
1097        for x in done..w - 1 {
1098            if x == 0 {
1099                continue;
1100            }
1101            let gx = row0[x + 1] - row0[x - 1] + 2.0 * (row1[x + 1] - row1[x - 1]) + row2[x + 1]
1102                - row2[x - 1];
1103            let gy = row2[x - 1] + 2.0 * row2[x] + row2[x + 1]
1104                - row0[x - 1]
1105                - 2.0 * row0[x]
1106                - row0[x + 1];
1107            dst[x] = gx.abs() + gy.abs();
1108        }
1109    };
1110
1111    if use_rayon && interior_h > 1 {
1112        out[w..w + interior_h * w]
1113            .par_chunks_mut(w)
1114            .enumerate()
1115            .for_each(|(i, dst)| {
1116                let y = i + 1;
1117                let row0 = &src[(y - 1) * w..y * w];
1118                let row1 = &src[y * w..(y + 1) * w];
1119                let row2 = &src[(y + 1) * w..(y + 2) * w];
1120                process_row(row0, row1, row2, dst);
1121            });
1122    } else {
1123        for y in 1..h - 1 {
1124            let row0 = &src[(y - 1) * w..y * w];
1125            let row1 = &src[y * w..(y + 1) * w];
1126            let row2 = &src[(y + 1) * w..(y + 2) * w];
1127            let dst = &mut out[y * w..(y + 1) * w];
1128            process_row(row0, row1, row2, dst);
1129        }
1130    }
1131
1132    ImageF32::new(out, h, w, 1)
1133}
1134
1135fn sobel_f32_simd_row(
1136    row0: &[f32],
1137    row1: &[f32],
1138    row2: &[f32],
1139    dst: &mut [f32],
1140    w: usize,
1141) -> usize {
1142    if cfg!(miri) || w < 6 {
1143        return 1;
1144    }
1145
1146    #[cfg(target_arch = "aarch64")]
1147    {
1148        if std::arch::is_aarch64_feature_detected!("neon") {
1149            return unsafe { sobel_f32_neon_row(row0, row1, row2, dst, w) };
1150        }
1151    }
1152    #[cfg(target_arch = "x86_64")]
1153    {
1154        if is_x86_feature_detected!("sse2") {
1155            return unsafe { sobel_f32_sse_row(row0, row1, row2, dst, w) };
1156        }
1157    }
1158    1
1159}
1160
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(unsafe_op_in_unsafe_fn)]
/// NEON Sobel kernel for one output row: dst[x] = |Gx| + |Gy|.
///
/// Processes 4 columns per iteration starting at column 1 (so the x-1 loads
/// stay in bounds) and returns the first column it did NOT write; the caller
/// finishes the row (and borders) in scalar code. Expects `w >= 6`, which
/// the `sobel_f32_simd_row` dispatcher guarantees.
///
/// # Safety
/// Caller must ensure NEON is available and that all three rows and `dst`
/// hold at least `w` elements.
unsafe fn sobel_f32_neon_row(
    row0: &[f32],
    row1: &[f32],
    row2: &[f32],
    dst: &mut [f32],
    w: usize,
) -> usize {
    use std::arch::aarch64::*;
    let two = vdupq_n_f32(2.0);
    let p0 = row0.as_ptr();
    let p1 = row1.as_ptr();
    let p2 = row2.as_ptr();
    let dp = dst.as_mut_ptr();

    let mut x = 1usize;
    while x + 4 <= w - 1 {
        let t_l = vld1q_f32(p0.add(x - 1));
        let t_c = vld1q_f32(p0.add(x));
        let t_r = vld1q_f32(p0.add(x + 1));
        let m_l = vld1q_f32(p1.add(x - 1));
        let m_r = vld1q_f32(p1.add(x + 1));
        let b_l = vld1q_f32(p2.add(x - 1));
        let b_c = vld1q_f32(p2.add(x));
        let b_r = vld1q_f32(p2.add(x + 1));

        // Gx = t_r - t_l + 2*(m_r - m_l) + b_r - b_l
        let gx = vaddq_f32(
            vaddq_f32(vsubq_f32(t_r, t_l), vsubq_f32(b_r, b_l)),
            vmulq_f32(vsubq_f32(m_r, m_l), two),
        );
        // Gy = b_l + 2*b_c + b_r - t_l - 2*t_c - t_r
        // (vfmaq_f32(acc, a, b) computes acc + a*b)
        let gy = vsubq_f32(
            vfmaq_f32(vaddq_f32(b_l, b_r), b_c, two),
            vfmaq_f32(vaddq_f32(t_l, t_r), t_c, two),
        );
        let mag = vaddq_f32(vabsq_f32(gx), vabsq_f32(gy));
        vst1q_f32(dp.add(x), mag);
        x += 4;
    }
    x
}
1205
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
#[allow(unsafe_op_in_unsafe_fn)]
/// SSE2 Sobel kernel for one output row: dst[x] = |Gx| + |Gy|.
///
/// Covers 4 columns per iteration starting at column 1 (so the x-1 loads
/// stay in bounds) and returns the first column it did NOT write; the caller
/// completes the row in scalar code. Expects `w >= 6`, which the
/// `sobel_f32_simd_row` dispatcher guarantees.
///
/// # Safety
/// Caller must ensure SSE2 is available and that all three rows and `dst`
/// hold at least `w` elements.
unsafe fn sobel_f32_sse_row(
    row0: &[f32],
    row1: &[f32],
    row2: &[f32],
    dst: &mut [f32],
    w: usize,
) -> usize {
    use std::arch::x86_64::*;
    let k2 = _mm_set1_ps(2.0);
    // Clearing the sign bit yields |x| for f32.
    let abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fff_ffff));
    let t = row0.as_ptr();
    let m = row1.as_ptr();
    let b = row2.as_ptr();
    let o = dst.as_mut_ptr();

    let mut col = 1usize;
    while col + 4 <= w - 1 {
        let tl = _mm_loadu_ps(t.add(col - 1));
        let tc = _mm_loadu_ps(t.add(col));
        let tr = _mm_loadu_ps(t.add(col + 1));
        let ml = _mm_loadu_ps(m.add(col - 1));
        let mr = _mm_loadu_ps(m.add(col + 1));
        let bl = _mm_loadu_ps(b.add(col - 1));
        let bc = _mm_loadu_ps(b.add(col));
        let br = _mm_loadu_ps(b.add(col + 1));

        // Gx = (tr - tl) + (br - bl) + 2*(mr - ml)
        let gx = _mm_add_ps(
            _mm_add_ps(_mm_sub_ps(tr, tl), _mm_sub_ps(br, bl)),
            _mm_mul_ps(_mm_sub_ps(mr, ml), k2),
        );
        // Gy = (bl + br + 2*bc) - (tl + tr + 2*tc)
        let gy = _mm_sub_ps(
            _mm_add_ps(_mm_add_ps(bl, br), _mm_mul_ps(bc, k2)),
            _mm_add_ps(_mm_add_ps(tl, tr), _mm_mul_ps(tc, k2)),
        );
        let mag = _mm_add_ps(_mm_and_ps(gx, abs_mask), _mm_and_ps(gy, abs_mask));
        _mm_storeu_ps(o.add(col), mag);
        col += 4;
    }
    col
}
1250
1251// ── Threshold binary f32 ──────────────────────────────────────────────────
1252
1253/// Binary threshold on single-channel f32 image.
1254/// Pixels > thresh become max_val, otherwise 0.
1255#[allow(clippy::uninit_vec)]
1256pub fn threshold_binary_f32(input: &ImageF32, thresh: f32, max_val: f32) -> Option<ImageF32> {
1257    if input.channels() != 1 {
1258        return None;
1259    }
1260    let (h, w) = (input.height(), input.width());
1261    let src = input.data();
1262    let total = h * w;
1263
1264    let mut out = Vec::with_capacity(total);
1265    #[allow(unsafe_code)]
1266    unsafe {
1267        out.set_len(total);
1268    }
1269
1270    // GCD parallel in chunks (bandwidth-bound, trivial compute)
1271    #[cfg(target_os = "macos")]
1272    if total >= RAYON_THRESHOLD && !cfg!(miri) {
1273        let n_chunks = 8usize.min(h);
1274        let chunk_h = h.div_ceil(n_chunks);
1275        let sp = src.as_ptr() as usize;
1276        let dp = out.as_mut_ptr() as usize;
1277        gcd::parallel_for(n_chunks, |chunk| {
1278            let sp = sp as *const f32;
1279            let dp = dp as *mut f32;
1280            let start = chunk * chunk_h * w;
1281            let end = (((chunk + 1) * chunk_h).min(h)) * w;
1282            let n = end - start;
1283            let chunk_src = unsafe { core::slice::from_raw_parts(sp.add(start), n) };
1284            let chunk_dst = unsafe { core::slice::from_raw_parts_mut(dp.add(start), n) };
1285            let done = threshold_f32_simd(chunk_src, chunk_dst, n, thresh, max_val);
1286            for i in done..n {
1287                chunk_dst[i] = if chunk_src[i] > thresh { max_val } else { 0.0 };
1288            }
1289        });
1290        return ImageF32::new(out, h, w, 1);
1291    }
1292
1293    let done = threshold_f32_simd(src, &mut out, total, thresh, max_val);
1294    for i in done..total {
1295        out[i] = if src[i] > thresh { max_val } else { 0.0 };
1296    }
1297
1298    ImageF32::new(out, h, w, 1)
1299}
1300
1301fn threshold_f32_simd(
1302    src: &[f32],
1303    dst: &mut [f32],
1304    total: usize,
1305    thresh: f32,
1306    max_val: f32,
1307) -> usize {
1308    if cfg!(miri) || total < 4 {
1309        return 0;
1310    }
1311
1312    #[cfg(target_arch = "aarch64")]
1313    {
1314        if std::arch::is_aarch64_feature_detected!("neon") {
1315            return unsafe { threshold_f32_neon(src, dst, total, thresh, max_val) };
1316        }
1317    }
1318    #[cfg(target_arch = "x86_64")]
1319    {
1320        if is_x86_feature_detected!("sse2") {
1321            return unsafe { threshold_f32_sse(src, dst, total, thresh, max_val) };
1322        }
1323    }
1324    0
1325}
1326
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(unsafe_op_in_unsafe_fn)]
/// NEON binary-threshold kernel: dst[i] = max_val if src[i] > thresh else 0.0.
///
/// Processes 4 lanes per iteration and returns the number of elements
/// completed (the largest multiple of 4 <= total); the caller finishes the
/// scalar remainder.
///
/// # Safety
/// Caller must ensure NEON is available and that `src` and `dst` each hold
/// at least `total` elements.
unsafe fn threshold_f32_neon(
    src: &[f32],
    dst: &mut [f32],
    total: usize,
    thresh: f32,
    max_val: f32,
) -> usize {
    use std::arch::aarch64::*;
    let thresh_v = vdupq_n_f32(thresh);
    let max_v = vdupq_n_f32(max_val);
    let zero = vdupq_n_f32(0.0);
    let sp = src.as_ptr();
    let dp = dst.as_mut_ptr();
    let mut i = 0usize;
    while i + 4 <= total {
        let v = vld1q_f32(sp.add(i));
        let mask = vcgtq_f32(v, thresh_v); // v > thresh → all-ones
        // Bit-select: lanes where the mask is set take max_v, the rest zero.
        let result = vbslq_f32(mask, max_v, zero);
        vst1q_f32(dp.add(i), result);
        i += 4;
    }
    i
}
1353
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
#[allow(unsafe_op_in_unsafe_fn)]
/// SSE2 binary-threshold kernel: dst[i] = max_val if src[i] > thresh else 0.0.
///
/// Handles the largest multiple-of-4 prefix of the data and returns its
/// length; the caller thresholds the remaining tail in scalar code.
///
/// # Safety
/// Caller must ensure SSE2 is available and that `src` and `dst` each hold
/// at least `total` elements.
unsafe fn threshold_f32_sse(
    src: &[f32],
    dst: &mut [f32],
    total: usize,
    thresh: f32,
    max_val: f32,
) -> usize {
    use std::arch::x86_64::*;
    let limit = _mm_set1_ps(thresh);
    let hi = _mm_set1_ps(max_val);
    let src_p = src.as_ptr();
    let dst_p = dst.as_mut_ptr();
    let vec_len = total & !3; // largest multiple of the 4-lane width
    for off in (0..vec_len).step_by(4) {
        let pix = _mm_loadu_ps(src_p.add(off));
        // cmpgt yields all-ones lanes where pix > thresh; ANDing with the
        // broadcast max_val therefore selects max_val or +0.0 per lane.
        let sel = _mm_and_ps(_mm_cmpgt_ps(pix, limit), hi);
        _mm_storeu_ps(dst_p.add(off), sel);
    }
    vec_len
}