Skip to main content

locus_core/
threshold.rs

1//! Adaptive thresholding and image binarization.
2//!
3//! This module provides the first stage of the detection pipeline, converting grayscale
4//! input into binary images while adapting to local lighting conditions.
5//!
6//! It features two primary implementations:
7//! 1. **Tile-based Thresholding**: Blazing fast approach using local tile stats.
8//! 2. **Integral Image Thresholding**: Per-pixel adaptive thresholding for small features.
9
10#![allow(unsafe_code, clippy::cast_sign_loss)]
11use crate::config::DetectorConfig;
12use crate::image::ImageView;
13use bumpalo::Bump;
14use bumpalo::collections::Vec as BumpVec;
15use multiversion::multiversion;
16use rayon::prelude::*;
17
18/// Statistics for a single threshold tile.
19#[derive(Clone, Copy, Debug, Default)]
20pub struct TileStats {
21    /// Minimum pixel value in the tile.
22    pub min: u8,
23    /// Maximum pixel value in the tile.
24    pub max: u8,
25}
26
27/// Adaptive thresholding engine using tile-based stats.
28pub struct ThresholdEngine {
29    /// Size of the tiles used for local thresholding statistics.
30    pub tile_size: usize,
31    /// Minimum intensity range for a tile to be considered valid.
32    pub min_range: u8,
33}
34
35impl Default for ThresholdEngine {
36    fn default() -> Self {
37        Self::new()
38    }
39}
40
41impl ThresholdEngine {
42    /// Create a new ThresholdEngine with default settings.
43    #[must_use]
44    pub fn new() -> Self {
45        Self {
46            tile_size: 8,  // Standard 8x8 tiles
47            min_range: 10, // Match DetectorConfig::default()
48        }
49    }
50
51    /// Create a ThresholdEngine from detector configuration.
52    #[must_use]
53    pub fn from_config(config: &DetectorConfig) -> Self {
54        Self {
55            tile_size: config.threshold_tile_size,
56            min_range: config.threshold_min_range,
57        }
58    }
59
60    /// Compute min/max statistics for each tile in the image.
61    /// Optimized with SIMD-friendly memory access patterns and subsampling (stride 2).
62    #[must_use]
63    #[tracing::instrument(skip_all, name = "pipeline::threshold_compute_stats")]
64    pub fn compute_tile_stats<'a>(
65        &self,
66        arena: &'a Bump,
67        img: &ImageView,
68    ) -> BumpVec<'a, TileStats> {
69        let ts = self.tile_size;
70        let tiles_wide = img.width / ts;
71        let tiles_high = img.height / ts;
72        let mut stats = BumpVec::with_capacity_in(tiles_wide * tiles_high, arena);
73        stats.resize(tiles_wide * tiles_high, TileStats { min: 255, max: 0 });
74
75        stats
76            .par_chunks_mut(tiles_wide)
77            .enumerate()
78            .for_each(|(ty, stats_row)| {
79                // Subsampling: Only process every other row within a tile (stride 2)
80                // This statistically approximates the min/max sufficient for thresholding
81                for dy in 0..ts {
82                    let py = ty * ts + dy;
83                    let src_row = img.get_row(py);
84
85                    // Process all tiles in this row with SIMD-friendly min/max
86                    compute_row_tile_stats_simd(src_row, stats_row, ts);
87                }
88            });
89        stats
90    }
91
92    /// Apply adaptive thresholding to the image.
93    /// Optimized with pre-expanded threshold maps and vectorized row processing.
94    #[allow(clippy::too_many_lines, clippy::needless_range_loop)]
95    #[allow(dead_code)]
96    pub fn apply_threshold(
97        &self,
98        arena: &Bump,
99        img: &ImageView,
100        stats: &[TileStats],
101        output: &mut [u8],
102    ) {
103        let ts = self.tile_size;
104        let tiles_wide = img.width / ts;
105        let tiles_high = img.height / ts;
106
107        let mut tile_thresholds = BumpVec::with_capacity_in(tiles_wide * tiles_high, arena);
108        tile_thresholds.resize(tiles_wide * tiles_high, 0u8);
109        let mut tile_valid = BumpVec::with_capacity_in(tiles_wide * tiles_high, arena);
110        tile_valid.resize(tiles_wide * tiles_high, 0u8);
111
112        tile_thresholds
113            .par_chunks_mut(tiles_wide)
114            .enumerate()
115            .for_each(|(ty, t_row)| {
116                let y_start = ty.saturating_sub(1);
117                let y_end = (ty + 1).min(tiles_high - 1);
118
119                for tx in 0..tiles_wide {
120                    let mut nmin = 255u8;
121                    let mut nmax = 0u8;
122
123                    let x_start = tx.saturating_sub(1);
124                    let x_end = (tx + 1).min(tiles_wide - 1);
125
126                    for ny in y_start..=y_end {
127                        let row_off = ny * tiles_wide;
128                        for nx in x_start..=x_end {
129                            let s = stats[row_off + nx];
130                            if s.min < nmin {
131                                nmin = s.min;
132                            }
133                            if s.max > nmax {
134                                nmax = s.max;
135                            }
136                        }
137                    }
138
139                    let t_idx = tx;
140                    let res = ((u16::from(nmin) + u16::from(nmax)) >> 1) as u8;
141
142                    // Safety: Unique index per thread for tile_valid access via raw pointer to avoid multiple mutable borrows of the same slice
143                    // However, tile_valid is not yet parallelized here. Let's fix that.
144                    t_row[t_idx] = res;
145                }
146            });
147
148        // Compute tile_valid (can be done in same loop above or separate)
149        for ty in 0..tiles_high {
150            for tx in 0..tiles_wide {
151                let mut nmin = 255;
152                let mut nmax = 0;
153                let y_start = ty.saturating_sub(1);
154                let y_end = (ty + 1).min(tiles_high - 1);
155                let x_start = tx.saturating_sub(1);
156                let x_end = (tx + 1).min(tiles_wide - 1);
157
158                for ny in y_start..=y_end {
159                    let row_off = ny * tiles_wide;
160                    for nx in x_start..=x_end {
161                        let s = stats[row_off + nx];
162                        if s.min < nmin {
163                            nmin = s.min;
164                        }
165                        if s.max > nmax {
166                            nmax = s.max;
167                        }
168                    }
169                }
170                let idx = ty * tiles_wide + tx;
171                tile_valid[idx] = if nmax.saturating_sub(nmin) < self.min_range {
172                    0
173                } else {
174                    255
175                };
176            }
177        }
178
179        // --- Propagation Pass ---
180        // Fill thresholds for invalid tiles from their neighbors to stay
181        // consistent within large uniform regions.
182        for _ in 0..2 {
183            // 2 iterations are usually enough for local consistency
184            for ty in 0..tiles_high {
185                for tx in 0..tiles_wide {
186                    let idx = ty * tiles_wide + tx;
187                    if tile_valid[idx] == 0 {
188                        let mut sum_thresh = 0u32;
189                        let mut count = 0u32;
190
191                        let y_start = ty.saturating_sub(1);
192                        let y_end = (ty + 1).min(tiles_high - 1);
193                        let x_start = tx.saturating_sub(1);
194                        let x_end = (tx + 1).min(tiles_wide - 1);
195
196                        for ny in y_start..=y_end {
197                            let row_off = ny * tiles_wide;
198                            for nx in x_start..=x_end {
199                                let n_idx = row_off + nx;
200                                if tile_valid[n_idx] > 0 {
201                                    sum_thresh += u32::from(tile_thresholds[n_idx]);
202                                    count += 1;
203                                }
204                            }
205                        }
206
207                        if count > 0 {
208                            tile_thresholds[idx] = (sum_thresh / count) as u8;
209                            tile_valid[idx] = 128; // Partial valid (propagated)
210                        }
211                    }
212                }
213            }
214        }
215
216        let thresholds_slice = tile_thresholds.as_slice();
217        let valid_slice = tile_valid.as_slice();
218
219        output
220            .par_chunks_mut(ts * img.width)
221            .enumerate()
222            .for_each_init(
223                || (vec![0u8; img.width], vec![0u8; img.width]),
224                |(row_thresholds, row_valid), (ty, output_tile_rows)| {
225                    if ty >= tiles_high {
226                        return;
227                    }
228                    // Reset buffers? No, they are fully overwritten below by the loops over tiles and pixels.
229                    // Just ensure they are correct size if they were hypothetically resized (they are not).
230                    // Actually, the loop logic depends on writing to `row_thresholds` and `row_valid` at specific indices.
231                    // The loops:
232                    // for tx in 0..tiles_wide ... row_thresholds[tx*ts+i] = ...
233                    // This covers [0 .. tiles_wide*ts].
234                    // If tiles_wide * ts < img.width (e.g. edge cases), remaining pixels might be stale.
235                    // However, the original code `vec![0u8; img.width]` zero-inits.
236                    // The previous loop logic:
237                    // `for tx in 0..tiles_wide` covers up to `tiles_wide * ts`.
238                    // `img.width` might be slightly larger than `tiles_wide * ts`.
239                    // The original code zero-initialized the WHOLE vector.
240                    // To maintain correctness, we should zero-init (or fill with default) the buffers, or at least the tail.
241                    // Or better, just fill them entirely if cheap, or rely on logic correctness.
242                    // Let's look at `threshold_row_simd`. It reads `thresholds[i]` and `valid_mask[i]`.
243                    // It iterates `0..len` where len is `src.len()` (== img.width).
244                    // If `row_thresholds` has stale garbage at the end, `threshold_row_simd` will use it.
245                    // The original code produced 0s at the end (due to `vec![0u8; ...]`).
246                    // So we MUST zero-init or fill the reused buffers.
247                    // `row_thresholds.fill(0)` and `row_valid.fill(0)` is safe and fast enough (memset).
248
249                    row_thresholds.fill(0);
250                    row_valid.fill(0);
251
252                    // Expand tile stats to row buffers
253                    for tx in 0..tiles_wide {
254                        let idx = ty * tiles_wide + tx;
255                        let thresh = thresholds_slice[idx];
256                        let valid = valid_slice[idx];
257                        for i in 0..ts {
258                            row_thresholds[tx * ts + i] = thresh;
259                            row_valid[tx * ts + i] = valid;
260                        }
261                    }
262
263                    for dy in 0..ts {
264                        let py = ty * ts + dy;
265                        let src_row = img.get_row(py);
266                        let dst_row = &mut output_tile_rows[dy * img.width..(dy + 1) * img.width];
267
268                        threshold_row_simd(src_row, dst_row, row_thresholds, row_valid);
269                    }
270                },
271            );
272    }
273
274    /// Apply adaptive thresholding and return both binary and threshold maps.
275    ///
276    /// This is needed for threshold-model-aware segmentation, which uses the
277    /// per-pixel threshold values to connect pixels by their deviation sign.
278    #[allow(clippy::needless_range_loop)]
279    #[tracing::instrument(skip_all, name = "pipeline::threshold_apply_map")]
280    pub fn apply_threshold_with_map(
281        &self,
282        arena: &Bump,
283        img: &ImageView,
284        stats: &[TileStats],
285        binary_output: &mut [u8],
286        threshold_output: &mut [u8],
287    ) {
288        let ts = self.tile_size;
289        let tiles_wide = img.width / ts;
290        let tiles_high = img.height / ts;
291
292        let mut tile_thresholds = BumpVec::with_capacity_in(tiles_wide * tiles_high, arena);
293        tile_thresholds.resize(tiles_wide * tiles_high, 0u8);
294        let mut tile_valid = BumpVec::with_capacity_in(tiles_wide * tiles_high, arena);
295        tile_valid.resize(tiles_wide * tiles_high, 0u8);
296
297        tile_thresholds
298            .par_chunks_mut(tiles_wide)
299            .enumerate()
300            .for_each(|(ty, t_row)| {
301                let y_start = ty.saturating_sub(1);
302                let y_end = (ty + 1).min(tiles_high - 1);
303
304                for tx in 0..tiles_wide {
305                    let mut nmin = 255u8;
306                    let mut nmax = 0u8;
307
308                    let x_start = tx.saturating_sub(1);
309                    let x_end = (tx + 1).min(tiles_wide - 1);
310
311                    for ny in y_start..=y_end {
312                        let row_off = ny * tiles_wide;
313                        for nx in x_start..=x_end {
314                            let s = stats[row_off + nx];
315                            if s.min < nmin {
316                                nmin = s.min;
317                            }
318                            if s.max > nmax {
319                                nmax = s.max;
320                            }
321                        }
322                    }
323
324                    t_row[tx] = ((u16::from(nmin) + u16::from(nmax)) >> 1) as u8;
325                }
326            });
327
328        // Compute tile_valid
329        for ty in 0..tiles_high {
330            for tx in 0..tiles_wide {
331                let mut nmin = 255;
332                let mut nmax = 0;
333                let y_start = ty.saturating_sub(1);
334                let y_end = (ty + 1).min(tiles_high - 1);
335                let x_start = tx.saturating_sub(1);
336                let x_end = (tx + 1).min(tiles_wide - 1);
337
338                for ny in y_start..=y_end {
339                    let row_off = ny * tiles_wide;
340                    for nx in x_start..=x_end {
341                        let s = stats[row_off + nx];
342                        if s.min < nmin {
343                            nmin = s.min;
344                        }
345                        if s.max > nmax {
346                            nmax = s.max;
347                        }
348                    }
349                }
350                let idx = ty * tiles_wide + tx;
351                tile_valid[idx] = if nmax.saturating_sub(nmin) < self.min_range {
352                    0
353                } else {
354                    255
355                };
356            }
357        }
358
359        // Propagation pass
360        /*
361        for _ in 0..2 {
362            for ty in 0..tiles_high {
363                for tx in 0..tiles_wide {
364                    let idx = ty * tiles_wide + tx;
365                    if tile_valid[idx] == 0 {
366                        let mut sum_thresh = 0u32;
367                        let mut count = 0u32;
368
369                        let y_start = ty.saturating_sub(1);
370                        let y_end = (ty + 1).min(tiles_high - 1);
371                        let x_start = tx.saturating_sub(1);
372                        let x_end = (tx + 1).min(tiles_wide - 1);
373
374                        for ny in y_start..=y_end {
375                            let row_off = ny * tiles_wide;
376                            for nx in x_start..=x_end {
377                                let n_idx = row_off + nx;
378                                if tile_valid[n_idx] > 0 {
379                                    sum_thresh += u32::from(tile_thresholds[n_idx]);
380                                    count += 1;
381                                }
382                            }
383                        }
384
385                        if count > 0 {
386                            tile_thresholds[idx] = (sum_thresh / count) as u8;
387                            tile_valid[idx] = 128;
388                        }
389                    }
390                }
391            }
392        }
393        */
394
395        // Write thresholds and binary output in parallel
396        let thresholds_slice = tile_thresholds.as_slice();
397        let valid_slice = tile_valid.as_slice();
398
399        binary_output
400            .par_chunks_mut(ts * img.width)
401            .enumerate()
402            .for_each_init(
403                || (vec![0u8; img.width], vec![0u8; img.width]),
404                |(row_thresholds, row_valid), (ty, bin_tile_rows)| {
405                    if ty >= tiles_high {
406                        return;
407                    }
408                    // Safety: Each thread writes to unique portion of threshold_output
409                    let thresh_tile_rows = unsafe {
410                        let ptr = threshold_output.as_ptr().cast_mut();
411                        std::slice::from_raw_parts_mut(ptr.add(ty * ts * img.width), ts * img.width)
412                    };
413
414                    row_thresholds.fill(0);
415                    row_valid.fill(0);
416
417                    for tx in 0..tiles_wide {
418                        let idx = ty * tiles_wide + tx;
419                        let thresh = thresholds_slice[idx];
420                        let valid = valid_slice[idx];
421                        for i in 0..ts {
422                            row_thresholds[tx * ts + i] = thresh;
423                            row_valid[tx * ts + i] = valid;
424                        }
425                    }
426
427                    for dy in 0..ts {
428                        let py = ty * ts + dy;
429                        let src_row = img.get_row(py);
430
431                        // Write binary output
432                        let bin_row = &mut bin_tile_rows[dy * img.width..(dy + 1) * img.width];
433                        threshold_row_simd(src_row, bin_row, row_thresholds, row_valid);
434
435                        // Write threshold map
436                        thresh_tile_rows[dy * img.width..(dy + 1) * img.width]
437                            .copy_from_slice(row_thresholds);
438                    }
439                },
440            );
441    }
442}
443
444#[multiversion(targets(
445    "x86_64+avx2+bmi1+bmi2+popcnt+lzcnt",
446    "x86_64+avx512f+avx512bw+avx512dq+avx512vl",
447    "aarch64+neon"
448))]
449fn compute_row_tile_stats_simd(src_row: &[u8], stats: &mut [TileStats], tile_size: usize) {
450    let chunks = src_row.chunks_exact(tile_size);
451    for (chunk, stat) in chunks.zip(stats.iter_mut()) {
452        let mut rmin = stat.min;
453        let mut rmax = stat.max;
454
455        // Hint to compiler for vectorization
456        for &p in chunk {
457            rmin = rmin.min(p);
458            rmax = rmax.max(p);
459        }
460
461        stat.min = rmin;
462        stat.max = rmax;
463    }
464}
465
466#[multiversion(targets(
467    "x86_64+avx2+bmi1+bmi2+popcnt+lzcnt",
468    "x86_64+avx512f+avx512bw+avx512dq+avx512vl",
469    "aarch64+neon"
470))]
471fn threshold_row_simd(src: &[u8], dst: &mut [u8], thresholds: &[u8], valid_mask: &[u8]) {
472    let len = src.len();
473    // Use chunks_exact to help compiler vectorize (e.g. into 16 or 32-byte chunks)
474    let src_chunks = src.chunks_exact(16);
475    let dst_chunks = dst.chunks_exact_mut(16);
476    let thresh_chunks = thresholds.chunks_exact(16);
477    let mask_chunks = valid_mask.chunks_exact(16);
478
479    for (((s_c, d_c), t_c), m_c) in src_chunks
480        .zip(dst_chunks)
481        .zip(thresh_chunks)
482        .zip(mask_chunks)
483    {
484        for i in 0..16 {
485            let s = s_c[i];
486            let t = t_c[i];
487            let v = m_c[i];
488            // Nested if structure often helps compiler with CMV/Masks
489            d_c[i] = if v > 0 {
490                if s < t { 0 } else { 255 }
491            } else {
492                255
493            };
494        }
495    }
496
497    // Handle tail
498    let processed = (len / 16) * 16;
499    for i in processed..len {
500        let s = src[i];
501        let t = thresholds[i];
502        let v = valid_mask[i];
503        dst[i] = if v > 0 {
504            if s < t { 0 } else { 255 }
505        } else {
506            255
507        };
508    }
509}
510
511#[cfg(test)]
512#[allow(
513    clippy::unwrap_used,
514    clippy::naive_bytecount,
515    clippy::cast_possible_truncation,
516    clippy::cast_sign_loss
517)]
518mod tests {
519    use super::*;
520    use proptest::prelude::*;
521
522    proptest! {
523        #[test]
524        fn test_threshold_invariants(data in prop::collection::vec(0..=255u8, 16)) {
525            let mut min = 255u8;
526            let mut max = 0u8;
527            for &b in &data {
528                if b < min { min = b; }
529                if b > max { max = b; }
530            }
531            let (rmin, rmax) = compute_min_max_simd(&data);
532            assert_eq!(rmin, min);
533            assert_eq!(rmax, max);
534        }
535
536        #[test]
537        fn test_binarization_invariants(src in prop::collection::vec(0..=255u8, 16), thresh in 0..=255u8) {
538            let mut dst = vec![0u8; 16];
539            let valid = vec![255u8; 16];
540            threshold_row_simd(&src, &mut dst, &[thresh; 16], &valid);
541
542            for (i, &s) in src.iter().enumerate() {
543                if s >= thresh {
544                    assert_eq!(dst[i], 255);
545                } else {
546                    assert_eq!(dst[i], 0);
547                }
548            }
549
550            // Test invalid tile (mask=0) should be white (255)
551            let invalid = vec![0u8; 16];
552            threshold_row_simd(&src, &mut dst, &[thresh; 16], &invalid);
553            for d in dst {
554                assert_eq!(d, 255);
555            }
556        }
557    }
558
559    #[test]
560    fn test_threshold_engine_e2e() {
561        let width = 16;
562        let height = 16;
563        let mut data = vec![128u8; width * height];
564        // Draw a black square in a white background area to test adaptive threshold
565        for y in 4..12 {
566            for x in 4..12 {
567                data[y * width + x] = 50;
568            }
569        }
570        for y in 0..height {
571            for x in 0..width {
572                if !(2..=14).contains(&x) || !(2..=14).contains(&y) {
573                    data[y * width + x] = 200;
574                }
575            }
576        }
577
578        let img = ImageView::new(&data, width, height, width).unwrap();
579        let engine = ThresholdEngine::new();
580        let arena = Bump::new();
581        let stats = engine.compute_tile_stats(&arena, &img);
582        let mut output = vec![0u8; width * height];
583        engine.apply_threshold(&arena, &img, &stats, &mut output);
584
585        // At (8,8), it should be black (0) because it's 50 and thresh should be around (50+200)/2 = 125
586        assert_eq!(output[8 * width + 8], 0);
587        // At (1,1), it should be white (255)
588        assert_eq!(output[width + 1], 255);
589    }
590
591    #[test]
592    fn test_threshold_with_decimation() {
593        let width = 32;
594        let height = 32;
595        let mut data = vec![200u8; width * height];
596        // Draw a black square (16x16) in the center
597        for y in 8..24 {
598            for x in 8..24 {
599                data[y * width + x] = 50;
600            }
601        }
602
603        let img = ImageView::new(&data, width, height, width).unwrap();
604
605        // Decimate by 2 -> 16x16
606        let mut decimated_data = vec![0u8; 16 * 16];
607        let decimated_img = img
608            .decimate_to(2, &mut decimated_data)
609            .expect("decimation failed");
610
611        assert_eq!(decimated_img.width, 16);
612        assert_eq!(decimated_img.height, 16);
613        let arena = Bump::new();
614        let engine = ThresholdEngine::new();
615        let stats = engine.compute_tile_stats(&arena, &decimated_img);
616        let mut output = vec![0u8; 16 * 16];
617        engine.apply_threshold(&arena, &decimated_img, &stats, &mut output);
618
619        // At (4,4) in decimated image (which is 8,8 in original), it should be black (0)
620        assert_eq!(output[4 * 16 + 4], 0);
621        // At (0,0) in decimated image, it should be white (255)
622        assert_eq!(output[0], 255);
623    }
624
625    // ========================================================================
626    // THRESHOLD ROBUSTNESS TESTS
627    // ========================================================================
628
629    use crate::config::TagFamily;
630    use crate::test_utils::{
631        TestImageParams, generate_test_image_with_params, measure_border_integrity,
632    };
633
634    /// Test threshold preserves tag structure at varying sizes (distance proxy).
635    /// Note: AprilTag 36h11 has 8x8 cells, so 4px/bit = 32px minimum.
636    #[test]
637    fn test_threshold_preserves_tag_structure_at_varying_sizes() {
638        let canvas_size = 640;
639        // Minimum 32px for 4 pixels per bit (AprilTag 36h11 = 8x8 cells)
640        let tag_sizes = [32, 48, 64, 100, 150, 200, 300];
641
642        for tag_size in tag_sizes {
643            let params = TestImageParams {
644                family: TagFamily::AprilTag36h11,
645                id: 0,
646                tag_size,
647                canvas_size,
648                ..Default::default()
649            };
650
651            let (data, corners) = generate_test_image_with_params(&params);
652            let img = ImageView::new(&data, canvas_size, canvas_size, canvas_size).unwrap();
653
654            let engine = ThresholdEngine::new();
655            let arena = Bump::new();
656            let stats = engine.compute_tile_stats(&arena, &img);
657            let mut binary = vec![0u8; canvas_size * canvas_size];
658            engine.apply_threshold(&arena, &img, &stats, &mut binary);
659
660            let integrity = measure_border_integrity(&binary, canvas_size, &corners);
661
662            // For tags >= 32px (4px/bit), we expect good binarization (>50% border detected)
663            assert!(
664                integrity > 0.5,
665                "Tag size {} failed: border integrity = {:.2}% (expected >50%)",
666                tag_size,
667                integrity * 100.0
668            );
669
670            println!(
671                "Tag size {:>3}px: border integrity = {:.1}%",
672                tag_size,
673                integrity * 100.0
674            );
675        }
676    }
677
678    /// Test threshold robustness to brightness and contrast variations.
679    #[test]
680    fn test_threshold_robustness_brightness_contrast() {
681        let canvas_size = 320;
682        let tag_size = 120;
683        let brightness_offsets = [-50, -25, 0, 25, 50];
684        let contrast_scales = [0.50, 0.75, 1.0, 1.25, 1.50];
685
686        for &brightness in &brightness_offsets {
687            for &contrast in &contrast_scales {
688                let params = TestImageParams {
689                    family: TagFamily::AprilTag36h11,
690                    id: 0,
691                    tag_size,
692                    canvas_size,
693                    brightness_offset: brightness,
694                    contrast_scale: contrast,
695                    ..Default::default()
696                };
697
698                let (data, corners) = generate_test_image_with_params(&params);
699                let img = ImageView::new(&data, canvas_size, canvas_size, canvas_size).unwrap();
700
701                let engine = ThresholdEngine::new();
702                let arena = Bump::new();
703                let stats = engine.compute_tile_stats(&arena, &img);
704                let mut binary = vec![0u8; canvas_size * canvas_size];
705                engine.apply_threshold(&arena, &img, &stats, &mut binary);
706
707                let integrity = measure_border_integrity(&binary, canvas_size, &corners);
708
709                // For moderate conditions, expect good integrity
710                let is_moderate = brightness.abs() <= 25 && contrast >= 0.75;
711                if is_moderate {
712                    assert!(
713                        integrity > 0.4,
714                        "Brightness {}, Contrast {:.2}: integrity {:.1}% too low",
715                        brightness,
716                        contrast,
717                        integrity * 100.0
718                    );
719                }
720
721                println!(
722                    "Brightness {:>3}, Contrast {:.2}: integrity = {:.1}%",
723                    brightness,
724                    contrast,
725                    integrity * 100.0
726                );
727            }
728        }
729    }
730
731    /// Test threshold robustness to varying noise levels.
732    #[test]
733    fn test_threshold_robustness_noise() {
734        let canvas_size = 320;
735        let tag_size = 120;
736        let noise_levels = [0.0, 5.0, 10.0, 15.0, 20.0, 30.0];
737
738        for &noise_sigma in &noise_levels {
739            let params = TestImageParams {
740                family: TagFamily::AprilTag36h11,
741                id: 0,
742                tag_size,
743                canvas_size,
744                noise_sigma,
745                ..Default::default()
746            };
747
748            let (data, corners) = generate_test_image_with_params(&params);
749            let img = ImageView::new(&data, canvas_size, canvas_size, canvas_size).unwrap();
750
751            let engine = ThresholdEngine::new();
752            let arena = Bump::new();
753            let stats = engine.compute_tile_stats(&arena, &img);
754            let mut binary = vec![0u8; canvas_size * canvas_size];
755            engine.apply_threshold(&arena, &img, &stats, &mut binary);
756
757            let integrity = measure_border_integrity(&binary, canvas_size, &corners);
758
759            // For noise <= 15, expect reasonable integrity
760            if noise_sigma <= 15.0 {
761                assert!(
762                    integrity > 0.45,
763                    "Noise σ={:.1}: integrity {:.1}% too low",
764                    noise_sigma,
765                    integrity * 100.0
766                );
767            }
768
769            println!(
770                "Noise σ={:>4.1}: integrity = {:.1}%",
771                noise_sigma,
772                integrity * 100.0
773            );
774        }
775    }
776
777    proptest! {
778        /// Fuzz test threshold with random combinations of parameters.
779        /// Minimum tag size 32px to ensure 4 pixels per bit.
780        #[test]
781        fn test_threshold_combined_conditions_no_panic(
782            tag_size in 32_usize..200,  // 32px = 4px/bit for AprilTag 36h11
783            brightness in -40_i16..40,
784            contrast in 0.6_f32..1.4,
785            noise in 0.0_f32..20.0
786        ) {
787            let canvas_size = 320;
788
789            // Skip invalid combinations (tag too big for canvas)
790            if tag_size >= canvas_size - 40 {
791                return Ok(());
792            }
793
794            let params = TestImageParams {
795                family: TagFamily::AprilTag36h11,
796                id: 0,
797                tag_size,
798                canvas_size,
799                noise_sigma: noise,
800                brightness_offset: brightness,
801                contrast_scale: contrast,
802            };
803
804            let (data, _corners) = generate_test_image_with_params(&params);
805            let img = ImageView::new(&data, canvas_size, canvas_size, canvas_size).unwrap();
806
807            let engine = ThresholdEngine::new();
808            let arena = Bump::new();
809            let stats = engine.compute_tile_stats(&arena, &img);
810            let mut binary = vec![0u8; canvas_size * canvas_size];
811
812            // Should not panic
813            engine.apply_threshold(&arena, &img, &stats, &mut binary);
814
815            // Basic sanity: output should have both black and white pixels
816            let black_count = binary.iter().filter(|&&p| p == 0).count();
817            let white_count = binary.iter().filter(|&&p| p == 255).count();
818
819            // Valid binary image should have both colors (only 0 and 255)
820            prop_assert!(black_count + white_count == binary.len(),
821                "Binary output contains non-binary values");
822
823            // With a tag present, we expect some black and white pixels
824            prop_assert!(black_count > 0, "No black pixels in output");
825            prop_assert!(white_count > 0, "No white pixels in output");
826        }
827    }
828}
829
830#[multiversion(targets = "simd")]
831fn compute_min_max_simd(data: &[u8]) -> (u8, u8) {
832    let mut min = 255u8;
833    let mut max = 0u8;
834    for &b in data {
835        min = min.min(b);
836        max = max.max(b);
837    }
838    (min, max)
839}
840
841// =============================================================================
842// INTEGRAL IMAGE-BASED ADAPTIVE THRESHOLD
843// =============================================================================
844//
845// This implements OpenCV-style ADAPTIVE_THRESH_MEAN_C using integral images:
846// 1. Compute integral image in O(W*H)
847// 2. For each pixel, compute local mean in O(1) using integral image
848// 3. Threshold: pixel < (local_mean - C) ? black : white
849//
850// This produces per-pixel adaptive thresholds for small tag detection.
851
852/// Compute integral image (cumulative sum) for fast box filter computation.
853///
854/// Uses a 2-pass parallel implementation for maximum throughput on modern multicore CPUs.
855/// The `integral` buffer must have size `(img.width + 1) * (img.height + 1)`.
856#[allow(clippy::needless_range_loop, clippy::items_after_statements)]
857#[allow(dead_code)]
858pub fn compute_integral_image(img: &ImageView, integral: &mut [u64]) {
859    let w = img.width;
860    let h = img.height;
861    let stride = w + 1;
862
863    // Zero the first row efficiently
864    for x in 0..stride {
865        integral[x] = 0;
866    }
867
868    // use rayon::prelude::*;
869
870    // 1st Pass: Compute horizontal cumulative sums (prefix sum per row)
871    // This part is perfectly parallel.
872    integral
873        .par_chunks_exact_mut(stride)
874        .enumerate()
875        .skip(1)
876        .for_each(|(y_idx, row)| {
877            let y = y_idx - 1;
878            let src_row = img.get_row(y);
879            let mut sum = 0u64;
880            // row[0] is already 0
881            for x in 0..w {
882                sum += u64::from(src_row[x]);
883                row[x + 1] = sum;
884            }
885        });
886
887    // 2nd Pass: Vertical cumulative sums
888    // For large images, we process in vertical blocks to stay in cache.
889    const BLOCK_SIZE: usize = 128;
890    let num_blocks = stride.div_ceil(BLOCK_SIZE);
891
892    (0..num_blocks).into_par_iter().for_each(|b| {
893        let start_x = b * BLOCK_SIZE;
894        let end_x = (start_x + BLOCK_SIZE).min(stride);
895
896        // Initialize cumulative sum for this column block
897        // We use a small on-stack or small-vec if needed, but since BLOCK_SIZE is small (128),
898        // we can just use a fixed-size array if we want to avoid allocation entirely.
899        let mut col_sums = [0u64; BLOCK_SIZE];
900
901        // Safety: We are writing to unique columns in each parallel task.
902        unsafe {
903            let base_ptr = integral.as_ptr().cast_mut();
904            for y in 1..=h {
905                let row_ptr = base_ptr.add(y * stride + start_x);
906                for (i, val) in col_sums.iter_mut().enumerate().take(end_x - start_x) {
907                    let old_val = *row_ptr.add(i);
908                    let new_sum = old_val + *val;
909                    *row_ptr.add(i) = new_sum;
910                    *val = new_sum;
911                }
912            }
913        }
914    });
915}
916
917/// Apply per-pixel adaptive threshold using integral image.
918///
919/// Optimized with parallel processing and branchless thresholding.
920#[multiversion(targets = "simd")]
921/// Apply per-pixel adaptive threshold using integral image.
922///
923/// Optimized with parallel processing, interior-loop vectorization, and fixed-point arithmetic.
924#[multiversion(targets(
925    "x86_64+avx2+bmi1+bmi2+popcnt+lzcnt",
926    "x86_64+avx512f+avx512bw+avx512dq+avx512vl",
927    "aarch64+neon"
928))]
929#[tracing::instrument(skip_all, name = "pipeline::threshold_integral")]
930pub fn adaptive_threshold_integral(
931    img: &ImageView,
932    integral: &[u64],
933    output: &mut [u8],
934    radius: usize,
935    c: i16,
936) {
937    let w = img.width;
938    let h = img.height;
939    let stride = w + 1;
940
941    // use rayon::prelude::*;
942
943    // Precompute interior area inverse (fixed-point 1.31)
944    let side = (2 * radius + 1) as u32;
945    let area = side * side;
946    let inv_area_fixed = ((1u64 << 31) / u64::from(area)) as u32;
947
948    (0..h).into_par_iter().for_each(|y| {
949        let y_offset = y * w;
950        let src_row = img.get_row(y);
951
952        // Safety: Unique row per thread
953        let dst_row = unsafe {
954            let ptr = output.as_ptr().cast_mut();
955            std::slice::from_raw_parts_mut(ptr.add(y_offset), w)
956        };
957
958        let y0 = y.saturating_sub(radius);
959        let y1 = (y + radius + 1).min(h);
960
961        // Define interior region for this row
962        let x_start = radius;
963        let x_end = w.saturating_sub(radius + 1);
964
965        // 1. Process Left Border
966        for x in 0..x_start.min(w) {
967            let x0 = 0; // saturating_sub(radius) is 0
968            let x1 = (x + radius + 1).min(w);
969            let actual_area = (x1 - x0) * (y1 - y0);
970
971            let i00 = integral[y0 * stride + x0];
972            let i01 = integral[y0 * stride + x1];
973            let i10 = integral[y1 * stride + x0];
974            let i11 = integral[y1 * stride + x1];
975
976            let sum = (i11 + i00) - (i01 + i10);
977            let mean = (sum / actual_area as u64) as i16;
978            let threshold = (mean - c).max(0) as u8;
979            dst_row[x] = if src_row[x] < threshold { 0 } else { 255 };
980        }
981
982        // 2. Process Interior (Vectorizable)
983        if x_end > x_start && y >= radius && y + radius < h {
984            let row00 = &integral[y0 * stride + (x_start - radius)..];
985            let row01 = &integral[y0 * stride + (x_start + radius + 1)..];
986            let row10 = &integral[y1 * stride + (x_start - radius)..];
987            let row11 = &integral[y1 * stride + (x_start + radius + 1)..];
988
989            let interior_src = &src_row[x_start..x_end];
990            let interior_dst = &mut dst_row[x_start..x_end];
991
992            for i in 0..(x_end - x_start) {
993                let sum = (row11[i] + row00[i]) - (row01[i] + row10[i]);
994                // Fixed-point division: (sum * inv_area) >> 31
995                let mean = ((sum * u64::from(inv_area_fixed)) >> 31) as i16;
996                let threshold = (mean - c).max(0) as u8;
997                interior_dst[i] = if interior_src[i] < threshold { 0 } else { 255 };
998            }
999        } else if x_end > x_start {
1000            // Interior X but border Y
1001            for x in x_start..x_end {
1002                let x0 = x - radius;
1003                let x1 = x + radius + 1;
1004                let actual_area = (x1 - x0) * (y1 - y0);
1005
1006                let i00 = integral[y0 * stride + x0];
1007                let i01 = integral[y0 * stride + x1];
1008                let i10 = integral[y1 * stride + x0];
1009                let i11 = integral[y1 * stride + x1];
1010
1011                let sum = (i11 + i00) - (i01 + i10);
1012                let mean = (sum / actual_area as u64) as i16;
1013                let threshold = (mean - c).max(0) as u8;
1014                dst_row[x] = if src_row[x] < threshold { 0 } else { 255 };
1015            }
1016        }
1017
1018        // 3. Process Right Border
1019        for x in x_end.max(x_start)..w {
1020            let x0 = x.saturating_sub(radius);
1021            let x1 = w; // (x + radius + 1).min(w)
1022            let actual_area = (x1 - x0) * (y1 - y0);
1023
1024            let i00 = integral[y0 * stride + x0];
1025            let i01 = integral[y0 * stride + x1];
1026            let i10 = integral[y1 * stride + x0];
1027            let i11 = integral[y1 * stride + x1];
1028
1029            let sum = (i11 + i00) - (i01 + i10);
1030            let mean = (sum / actual_area as u64) as i16;
1031            let threshold = (mean - c).max(0) as u8;
1032            dst_row[x] = if src_row[x] < threshold { 0 } else { 255 };
1033        }
1034    });
1035}
1036
1037/// Fast adaptive threshold combining integral image approach with SIMD.
1038///
1039/// This is the main entry point for performance-oriented adaptive thresholding:
1040/// - Computes integral image once
1041/// - Applies per-pixel adaptive threshold with local mean
1042/// - Uses default parameters tuned for AprilTag detection
1043#[allow(dead_code)]
1044pub(crate) fn apply_adaptive_threshold_fast(img: &ImageView, output: &mut [u8]) {
1045    // OpenCV uses blockSize=13 (radius=6) and C=3 as good defaults
1046    apply_adaptive_threshold_with_params(img, output, 6, 3);
1047}
1048
1049/// Adaptive threshold with custom parameters.
1050#[allow(dead_code)]
1051pub(crate) fn apply_adaptive_threshold_with_params(
1052    img: &ImageView,
1053    output: &mut [u8],
1054    radius: usize,
1055    c: i16,
1056) {
1057    let mut integral = vec![0u64; (img.width + 1) * (img.height + 1)];
1058    compute_integral_image(img, &mut integral);
1059    adaptive_threshold_integral(img, &integral, output, radius, c);
1060}
1061
1062/// Apply per-pixel adaptive threshold with gradient-based window sizing.
1063///
1064/// Highly optimized using Parallel processing, precomputed LUTs, and branchless logic.
1065#[allow(clippy::too_many_arguments)]
1066#[multiversion(targets(
1067    "x86_64+avx2+bmi1+bmi2+popcnt+lzcnt",
1068    "x86_64+avx512f+avx512bw+avx512dq+avx512vl",
1069    "aarch64+neon"
1070))]
1071#[tracing::instrument(skip_all, name = "pipeline::threshold_gradient_window")]
1072pub fn adaptive_threshold_gradient_window(
1073    img: &ImageView,
1074    gradient_map: &[u8],
1075    integral: &[u64],
1076    output: &mut [u8],
1077    min_radius: usize,
1078    max_radius: usize,
1079    gradient_threshold: u8,
1080    c: i16,
1081) {
1082    let w = img.width;
1083    let h = img.height;
1084    let stride = w + 1;
1085
1086    // Precompute radius and area reciprocal LUTs (fixed-point 1.31)
1087    let mut radius_lut = [0usize; 256];
1088    let mut inv_area_lut = [0u32; 256];
1089    let grad_thresh_f32 = f32::from(gradient_threshold);
1090
1091    for g in 0..256 {
1092        let r = if g as u8 >= gradient_threshold {
1093            min_radius
1094        } else {
1095            let t = g as f32 / grad_thresh_f32;
1096            let r = max_radius as f32 * (1.0 - t) + min_radius as f32 * t;
1097            r as usize
1098        };
1099        radius_lut[g] = r;
1100        let side = (2 * r + 1) as u32;
1101        let area = side * side;
1102        inv_area_lut[g] = ((1u64 << 31) / u64::from(area)) as u32;
1103    }
1104
1105    // use rayon::prelude::*;
1106
1107    (0..h).into_par_iter().for_each(|y| {
1108        let y_offset = y * w;
1109        let src_row = img.get_row(y);
1110
1111        // Safety: Unique row per thread
1112        let dst_row = unsafe {
1113            let ptr = output.as_ptr().cast_mut();
1114            std::slice::from_raw_parts_mut(ptr.add(y_offset), w)
1115        };
1116
1117        for x in 0..w {
1118            let grad = gradient_map[y_offset + x];
1119            let radius = radius_lut[grad as usize];
1120
1121            let y0 = y.saturating_sub(radius);
1122            let y1 = (y + radius + 1).min(h);
1123            let x0 = x.saturating_sub(radius);
1124            let x1 = (x + radius + 1).min(w);
1125
1126            let i00 = integral[y0 * stride + x0];
1127            let i01 = integral[y0 * stride + x1];
1128            let i10 = integral[y1 * stride + x0];
1129            let i11 = integral[y1 * stride + x1];
1130
1131            let sum = (i11 + i00) - (i01 + i10);
1132
1133            // Fixed-point mean computation
1134            let mean = if x >= radius && x + radius < w && y >= radius && y + radius < h {
1135                ((sum * u64::from(inv_area_lut[grad as usize])) >> 31) as i16
1136            } else {
1137                let actual_area = (x1 - x0) * (y1 - y0);
1138                (sum / actual_area as u64) as i16
1139            };
1140
1141            let threshold = (mean - c).max(0) as u8;
1142            dst_row[x] = if src_row[x] < threshold { 0 } else { 255 };
1143        }
1144    });
1145}
1146
1147/// Compute a map of local mean values.
1148///
1149/// Optimized with parallelism and vectorization.
1150#[multiversion(targets(
1151    "x86_64+avx2+bmi1+bmi2+popcnt+lzcnt",
1152    "x86_64+avx512f+avx512bw+avx512dq+avx512vl",
1153    "aarch64+neon"
1154))]
1155#[allow(clippy::cast_sign_loss, clippy::needless_range_loop)]
1156pub(crate) fn compute_threshold_map(
1157    img: &ImageView,
1158    integral: &[u64],
1159    output: &mut [u8],
1160    radius: usize,
1161    c: i16,
1162) {
1163    let w = img.width;
1164    let h = img.height;
1165    let stride = w + 1;
1166
1167    // use rayon::prelude::*; // Already imported at module level
1168
1169    // Precompute interior area inverse
1170    let side = (2 * radius + 1) as u32;
1171    let area = side * side;
1172    let inv_area_fixed = ((1u64 << 31) / u64::from(area)) as u32;
1173
1174    (0..h).into_par_iter().for_each(|y| {
1175        let y_offset = y * w;
1176
1177        // Safety: Unique row per thread
1178        let dst_row = unsafe {
1179            let ptr = output.as_ptr().cast_mut();
1180            std::slice::from_raw_parts_mut(ptr.add(y_offset), w)
1181        };
1182
1183        let y0 = y.saturating_sub(radius);
1184        let y1 = (y + radius + 1).min(h);
1185
1186        let x_start = radius;
1187        let x_end = w.saturating_sub(radius + 1);
1188
1189        // 1. Process Left Border
1190        for x in 0..x_start.min(w) {
1191            let x0 = 0;
1192            let x1 = (x + radius + 1).min(w);
1193            let actual_area = (x1 - x0) * (y1 - y0);
1194            let sum = (integral[y1 * stride + x1] + integral[y0 * stride + x0])
1195                - (integral[y0 * stride + x1] + integral[y1 * stride + x0]);
1196            let mean = (sum / actual_area as u64) as i16;
1197            dst_row[x] = (mean - c).clamp(0, 255) as u8;
1198        }
1199
1200        // 2. Process Interior (Vectorizable)
1201        if x_end > x_start && y >= radius && y + radius < h {
1202            let row00 = &integral[y0 * stride + (x_start - radius)..];
1203            let row01 = &integral[y0 * stride + (x_start + radius + 1)..];
1204            let row10 = &integral[y1 * stride + (x_start - radius)..];
1205            let row11 = &integral[y1 * stride + (x_start + radius + 1)..];
1206
1207            let interior_dst = &mut dst_row[x_start..x_end];
1208
1209            for i in 0..(x_end - x_start) {
1210                let sum = (row11[i] + row00[i]) - (row01[i] + row10[i]);
1211                let mean = ((sum * u64::from(inv_area_fixed)) >> 31) as i16;
1212                interior_dst[i] = (mean - c).clamp(0, 255) as u8;
1213            }
1214        }
1215
1216        // 3. Process Right Border
1217        for x in x_end.max(x_start)..w {
1218            let x0 = x.saturating_sub(radius);
1219            let x1 = w;
1220            let actual_area = (x1 - x0) * (y1 - y0);
1221            let sum = (integral[y1 * stride + x1] + integral[y0 * stride + x0])
1222                - (integral[y0 * stride + x1] + integral[y1 * stride + x0]);
1223            let mean = (sum / actual_area as u64) as i16;
1224            dst_row[x] = (mean - c).clamp(0, 255) as u8;
1225        }
1226    });
1227}