Skip to main content

locus_core/
threshold.rs

1//! Adaptive thresholding and image binarization.
2//!
3//! This module provides the first stage of the detection pipeline, converting grayscale
4//! input into binary images while adapting to local lighting conditions.
5//!
6//! It features two primary implementations:
7//! 1. **Tile-based Thresholding**: Blazing fast approach using local tile stats.
8//! 2. **Integral Image Thresholding**: Per-pixel adaptive thresholding for small features.
9
10#![allow(unsafe_code, clippy::cast_sign_loss)]
11use crate::config::DetectorConfig;
12use crate::image::ImageView;
13use bumpalo::Bump;
14use bumpalo::collections::Vec as BumpVec;
15use multiversion::multiversion;
16use rayon::prelude::*;
17
18/// Statistics for a single tile.
19#[derive(Clone, Copy, Debug, Default)]
20pub struct TileStats {
21    /// Minimum pixel value in the tile.
22    pub min: u8,
23    /// Maximum pixel value in the tile.
24    pub max: u8,
25}
26
27/// Adaptive thresholding engine using tile-based stats.
28pub struct ThresholdEngine {
29    /// Size of the tiles used for local thresholding statistics.
30    pub tile_size: usize,
31    /// Minimum intensity range for a tile to be considered valid.
32    pub min_range: u8,
33}
34
35impl Default for ThresholdEngine {
36    fn default() -> Self {
37        Self::new()
38    }
39}
40
41impl ThresholdEngine {
42    /// Create a new ThresholdEngine with default settings.
43    #[must_use]
44    pub fn new() -> Self {
45        Self {
46            tile_size: 4, // Smaller tiles for sub-10px tag support // Standard 8x8 tiles
47            min_range: 5, // Lower for low-contrast edge detection
48        }
49    }
50
51    /// Create a ThresholdEngine from detector configuration.
52    #[must_use]
53    pub fn from_config(config: &DetectorConfig) -> Self {
54        Self {
55            tile_size: config.threshold_tile_size,
56            min_range: config.threshold_min_range,
57        }
58    }
59
60    /// Compute min/max statistics for each tile in the image.
61    /// Optimized with SIMD-friendly memory access patterns and subsampling (stride 2).
62    #[must_use]
63    pub fn compute_tile_stats<'a>(
64        &self,
65        arena: &'a Bump,
66        img: &ImageView,
67    ) -> BumpVec<'a, TileStats> {
68        let ts = self.tile_size;
69        let tiles_wide = img.width / ts;
70        let tiles_high = img.height / ts;
71        let mut stats = BumpVec::with_capacity_in(tiles_wide * tiles_high, arena);
72        stats.resize(tiles_wide * tiles_high, TileStats { min: 255, max: 0 });
73
74        stats
75            .par_chunks_mut(tiles_wide)
76            .enumerate()
77            .for_each(|(ty, stats_row)| {
78                // Subsampling: Only process every other row within a tile (stride 2)
79                // This statistically approximates the min/max sufficient for thresholding
80                for dy in 0..ts {
81                    let py = ty * ts + dy;
82                    let src_row = img.get_row(py);
83
84                    // Process all tiles in this row with SIMD-friendly min/max
85                    compute_row_tile_stats_simd(src_row, stats_row, ts);
86                }
87            });
88        stats
89    }
90
91    /// Apply adaptive thresholding to the image.
92    /// Optimized with pre-expanded threshold maps and vectorized row processing.
93    #[allow(clippy::too_many_lines, clippy::needless_range_loop)]
94    pub fn apply_threshold(
95        &self,
96        arena: &Bump,
97        img: &ImageView,
98        stats: &[TileStats],
99        output: &mut [u8],
100    ) {
101        let ts = self.tile_size;
102        let tiles_wide = img.width / ts;
103        let tiles_high = img.height / ts;
104
105        let mut tile_thresholds = BumpVec::with_capacity_in(tiles_wide * tiles_high, arena);
106        tile_thresholds.resize(tiles_wide * tiles_high, 0u8);
107        let mut tile_valid = BumpVec::with_capacity_in(tiles_wide * tiles_high, arena);
108        tile_valid.resize(tiles_wide * tiles_high, 0u8);
109
110        tile_thresholds
111            .par_chunks_mut(tiles_wide)
112            .enumerate()
113            .for_each(|(ty, t_row)| {
114                let y_start = ty.saturating_sub(1);
115                let y_end = (ty + 1).min(tiles_high - 1);
116
117                for tx in 0..tiles_wide {
118                    let mut nmin = 255u8;
119                    let mut nmax = 0u8;
120
121                    let x_start = tx.saturating_sub(1);
122                    let x_end = (tx + 1).min(tiles_wide - 1);
123
124                    for ny in y_start..=y_end {
125                        let row_off = ny * tiles_wide;
126                        for nx in x_start..=x_end {
127                            let s = stats[row_off + nx];
128                            if s.min < nmin {
129                                nmin = s.min;
130                            }
131                            if s.max > nmax {
132                                nmax = s.max;
133                            }
134                        }
135                    }
136
137                    let t_idx = tx;
138                    let res = ((u16::from(nmin) + u16::from(nmax)) >> 1) as u8;
139
140                    // Safety: Unique index per thread for tile_valid access via raw pointer to avoid multiple mutable borrows of the same slice
141                    // However, tile_valid is not yet parallelized here. Let's fix that.
142                    t_row[t_idx] = res;
143                }
144            });
145
146        // Compute tile_valid (can be done in same loop above or separate)
147        for ty in 0..tiles_high {
148            for tx in 0..tiles_wide {
149                let mut nmin = 255;
150                let mut nmax = 0;
151                let y_start = ty.saturating_sub(1);
152                let y_end = (ty + 1).min(tiles_high - 1);
153                let x_start = tx.saturating_sub(1);
154                let x_end = (tx + 1).min(tiles_wide - 1);
155
156                for ny in y_start..=y_end {
157                    let row_off = ny * tiles_wide;
158                    for nx in x_start..=x_end {
159                        let s = stats[row_off + nx];
160                        if s.min < nmin {
161                            nmin = s.min;
162                        }
163                        if s.max > nmax {
164                            nmax = s.max;
165                        }
166                    }
167                }
168                let idx = ty * tiles_wide + tx;
169                tile_valid[idx] = if nmax.saturating_sub(nmin) < self.min_range {
170                    0
171                } else {
172                    255
173                };
174            }
175        }
176
177        // --- Propagation Pass ---
178        // Fill thresholds for invalid tiles from their neighbors to stay
179        // consistent within large uniform regions.
180        for _ in 0..2 {
181            // 2 iterations are usually enough for local consistency
182            for ty in 0..tiles_high {
183                for tx in 0..tiles_wide {
184                    let idx = ty * tiles_wide + tx;
185                    if tile_valid[idx] == 0 {
186                        let mut sum_thresh = 0u32;
187                        let mut count = 0u32;
188
189                        let y_start = ty.saturating_sub(1);
190                        let y_end = (ty + 1).min(tiles_high - 1);
191                        let x_start = tx.saturating_sub(1);
192                        let x_end = (tx + 1).min(tiles_wide - 1);
193
194                        for ny in y_start..=y_end {
195                            let row_off = ny * tiles_wide;
196                            for nx in x_start..=x_end {
197                                let n_idx = row_off + nx;
198                                if tile_valid[n_idx] > 0 {
199                                    sum_thresh += u32::from(tile_thresholds[n_idx]);
200                                    count += 1;
201                                }
202                            }
203                        }
204
205                        if count > 0 {
206                            tile_thresholds[idx] = (sum_thresh / count) as u8;
207                            tile_valid[idx] = 128; // Partial valid (propagated)
208                        }
209                    }
210                }
211            }
212        }
213
214        let thresholds_slice = tile_thresholds.as_slice();
215        let valid_slice = tile_valid.as_slice();
216
217        output
218            .par_chunks_mut(ts * img.width)
219            .enumerate()
220            .for_each_init(
221                || (vec![0u8; img.width], vec![0u8; img.width]),
222                |(row_thresholds, row_valid), (ty, output_tile_rows)| {
223                    if ty >= tiles_high {
224                        return;
225                    }
226                    // Reset buffers? No, they are fully overwritten below by the loops over tiles and pixels.
227                    // Just ensure they are correct size if they were hypothetically resized (they are not).
228                    // Actually, the loop logic depends on writing to `row_thresholds` and `row_valid` at specific indices.
229                    // The loops:
230                    // for tx in 0..tiles_wide ... row_thresholds[tx*ts+i] = ...
231                    // This covers [0 .. tiles_wide*ts].
232                    // If tiles_wide * ts < img.width (e.g. edge cases), remaining pixels might be stale.
233                    // However, the original code `vec![0u8; img.width]` zero-inits.
234                    // The previous loop logic:
235                    // `for tx in 0..tiles_wide` covers up to `tiles_wide * ts`.
236                    // `img.width` might be slightly larger than `tiles_wide * ts`.
237                    // The original code zero-initialized the WHOLE vector.
238                    // To maintain correctness, we should zero-init (or fill with default) the buffers, or at least the tail.
239                    // Or better, just fill them entirely if cheap, or rely on logic correctness.
240                    // Let's look at `threshold_row_simd`. It reads `thresholds[i]` and `valid_mask[i]`.
241                    // It iterates `0..len` where len is `src.len()` (== img.width).
242                    // If `row_thresholds` has stale garbage at the end, `threshold_row_simd` will use it.
243                    // The original code produced 0s at the end (due to `vec![0u8; ...]`).
244                    // So we MUST zero-init or fill the reused buffers.
245                    // `row_thresholds.fill(0)` and `row_valid.fill(0)` is safe and fast enough (memset).
246
247                    row_thresholds.fill(0);
248                    row_valid.fill(0);
249
250                    // Expand tile stats to row buffers
251                    for tx in 0..tiles_wide {
252                        let idx = ty * tiles_wide + tx;
253                        let thresh = thresholds_slice[idx];
254                        let valid = valid_slice[idx];
255                        for i in 0..ts {
256                            row_thresholds[tx * ts + i] = thresh;
257                            row_valid[tx * ts + i] = valid;
258                        }
259                    }
260
261                    for dy in 0..ts {
262                        let py = ty * ts + dy;
263                        let src_row = img.get_row(py);
264                        let dst_row = &mut output_tile_rows[dy * img.width..(dy + 1) * img.width];
265
266                        threshold_row_simd(src_row, dst_row, row_thresholds, row_valid);
267                    }
268                },
269            );
270    }
271
272    /// Apply adaptive thresholding and return both binary and threshold maps.
273    ///
274    /// This is needed for threshold-model-aware segmentation, which uses the
275    /// per-pixel threshold values to connect pixels by their deviation sign.
276    #[allow(clippy::needless_range_loop)]
277    pub fn apply_threshold_with_map(
278        &self,
279        arena: &Bump,
280        img: &ImageView,
281        stats: &[TileStats],
282        binary_output: &mut [u8],
283        threshold_output: &mut [u8],
284    ) {
285        let ts = self.tile_size;
286        let tiles_wide = img.width / ts;
287        let tiles_high = img.height / ts;
288
289        let mut tile_thresholds = BumpVec::with_capacity_in(tiles_wide * tiles_high, arena);
290        tile_thresholds.resize(tiles_wide * tiles_high, 0u8);
291        let mut tile_valid = BumpVec::with_capacity_in(tiles_wide * tiles_high, arena);
292        tile_valid.resize(tiles_wide * tiles_high, 0u8);
293
294        tile_thresholds
295            .par_chunks_mut(tiles_wide)
296            .enumerate()
297            .for_each(|(ty, t_row)| {
298                let y_start = ty.saturating_sub(1);
299                let y_end = (ty + 1).min(tiles_high - 1);
300
301                for tx in 0..tiles_wide {
302                    let mut nmin = 255u8;
303                    let mut nmax = 0u8;
304
305                    let x_start = tx.saturating_sub(1);
306                    let x_end = (tx + 1).min(tiles_wide - 1);
307
308                    for ny in y_start..=y_end {
309                        let row_off = ny * tiles_wide;
310                        for nx in x_start..=x_end {
311                            let s = stats[row_off + nx];
312                            if s.min < nmin {
313                                nmin = s.min;
314                            }
315                            if s.max > nmax {
316                                nmax = s.max;
317                            }
318                        }
319                    }
320
321                    t_row[tx] = ((u16::from(nmin) + u16::from(nmax)) >> 1) as u8;
322                }
323            });
324
325        // Compute tile_valid
326        for ty in 0..tiles_high {
327            for tx in 0..tiles_wide {
328                let mut nmin = 255;
329                let mut nmax = 0;
330                let y_start = ty.saturating_sub(1);
331                let y_end = (ty + 1).min(tiles_high - 1);
332                let x_start = tx.saturating_sub(1);
333                let x_end = (tx + 1).min(tiles_wide - 1);
334
335                for ny in y_start..=y_end {
336                    let row_off = ny * tiles_wide;
337                    for nx in x_start..=x_end {
338                        let s = stats[row_off + nx];
339                        if s.min < nmin {
340                            nmin = s.min;
341                        }
342                        if s.max > nmax {
343                            nmax = s.max;
344                        }
345                    }
346                }
347                let idx = ty * tiles_wide + tx;
348                tile_valid[idx] = if nmax.saturating_sub(nmin) < self.min_range {
349                    0
350                } else {
351                    255
352                };
353            }
354        }
355
356        // Propagation pass
357        /*
358        for _ in 0..2 {
359            for ty in 0..tiles_high {
360                for tx in 0..tiles_wide {
361                    let idx = ty * tiles_wide + tx;
362                    if tile_valid[idx] == 0 {
363                        let mut sum_thresh = 0u32;
364                        let mut count = 0u32;
365
366                        let y_start = ty.saturating_sub(1);
367                        let y_end = (ty + 1).min(tiles_high - 1);
368                        let x_start = tx.saturating_sub(1);
369                        let x_end = (tx + 1).min(tiles_wide - 1);
370
371                        for ny in y_start..=y_end {
372                            let row_off = ny * tiles_wide;
373                            for nx in x_start..=x_end {
374                                let n_idx = row_off + nx;
375                                if tile_valid[n_idx] > 0 {
376                                    sum_thresh += u32::from(tile_thresholds[n_idx]);
377                                    count += 1;
378                                }
379                            }
380                        }
381
382                        if count > 0 {
383                            tile_thresholds[idx] = (sum_thresh / count) as u8;
384                            tile_valid[idx] = 128;
385                        }
386                    }
387                }
388            }
389        }
390        */
391
392        // Write thresholds and binary output in parallel
393        let thresholds_slice = tile_thresholds.as_slice();
394        let valid_slice = tile_valid.as_slice();
395
396        binary_output
397            .par_chunks_mut(ts * img.width)
398            .enumerate()
399            .for_each_init(
400                || (vec![0u8; img.width], vec![0u8; img.width]),
401                |(row_thresholds, row_valid), (ty, bin_tile_rows)| {
402                    if ty >= tiles_high {
403                        return;
404                    }
405                    // Safety: Each thread writes to unique portion of threshold_output
406                    let thresh_tile_rows = unsafe {
407                        let ptr = threshold_output.as_ptr().cast_mut();
408                        std::slice::from_raw_parts_mut(ptr.add(ty * ts * img.width), ts * img.width)
409                    };
410
411                    row_thresholds.fill(0);
412                    row_valid.fill(0);
413
414                    for tx in 0..tiles_wide {
415                        let idx = ty * tiles_wide + tx;
416                        let thresh = thresholds_slice[idx];
417                        let valid = valid_slice[idx];
418                        for i in 0..ts {
419                            row_thresholds[tx * ts + i] = thresh;
420                            row_valid[tx * ts + i] = valid;
421                        }
422                    }
423
424                    for dy in 0..ts {
425                        let py = ty * ts + dy;
426                        let src_row = img.get_row(py);
427
428                        // Write binary output
429                        let bin_row = &mut bin_tile_rows[dy * img.width..(dy + 1) * img.width];
430                        threshold_row_simd(src_row, bin_row, row_thresholds, row_valid);
431
432                        // Write threshold map
433                        thresh_tile_rows[dy * img.width..(dy + 1) * img.width]
434                            .copy_from_slice(row_thresholds);
435                    }
436                },
437            );
438    }
439}
440
441#[multiversion(targets(
442    "x86_64+avx2+bmi1+bmi2+popcnt+lzcnt",
443    "x86_64+avx512f+avx512bw+avx512dq+avx512vl",
444    "aarch64+neon"
445))]
446fn compute_row_tile_stats_simd(src_row: &[u8], stats: &mut [TileStats], tile_size: usize) {
447    let chunks = src_row.chunks_exact(tile_size);
448    for (chunk, stat) in chunks.zip(stats.iter_mut()) {
449        let mut rmin = stat.min;
450        let mut rmax = stat.max;
451
452        // Hint to compiler for vectorization
453        for &p in chunk {
454            rmin = rmin.min(p);
455            rmax = rmax.max(p);
456        }
457
458        stat.min = rmin;
459        stat.max = rmax;
460    }
461}
462
463#[multiversion(targets(
464    "x86_64+avx2+bmi1+bmi2+popcnt+lzcnt",
465    "x86_64+avx512f+avx512bw+avx512dq+avx512vl",
466    "aarch64+neon"
467))]
468fn threshold_row_simd(src: &[u8], dst: &mut [u8], thresholds: &[u8], valid_mask: &[u8]) {
469    let len = src.len();
470    // Use chunks_exact to help compiler vectorize (e.g. into 16 or 32-byte chunks)
471    let src_chunks = src.chunks_exact(16);
472    let dst_chunks = dst.chunks_exact_mut(16);
473    let thresh_chunks = thresholds.chunks_exact(16);
474    let mask_chunks = valid_mask.chunks_exact(16);
475
476    for (((s_c, d_c), t_c), m_c) in src_chunks
477        .zip(dst_chunks)
478        .zip(thresh_chunks)
479        .zip(mask_chunks)
480    {
481        for i in 0..16 {
482            let s = s_c[i];
483            let t = t_c[i];
484            let v = m_c[i];
485            // Nested if structure often helps compiler with CMV/Masks
486            d_c[i] = if v > 0 {
487                if s < t { 0 } else { 255 }
488            } else {
489                255
490            };
491        }
492    }
493
494    // Handle tail
495    let processed = (len / 16) * 16;
496    for i in processed..len {
497        let s = src[i];
498        let t = thresholds[i];
499        let v = valid_mask[i];
500        dst[i] = if v > 0 {
501            if s < t { 0 } else { 255 }
502        } else {
503            255
504        };
505    }
506}
507
508#[cfg(test)]
509#[allow(
510    clippy::unwrap_used,
511    clippy::naive_bytecount,
512    clippy::cast_possible_truncation,
513    clippy::cast_sign_loss
514)]
515mod tests {
516    use super::*;
517    use proptest::prelude::*;
518
519    proptest! {
520        #[test]
521        fn test_threshold_invariants(data in prop::collection::vec(0..=255u8, 16)) {
522            let mut min = 255u8;
523            let mut max = 0u8;
524            for &b in &data {
525                if b < min { min = b; }
526                if b > max { max = b; }
527            }
528            let (rmin, rmax) = compute_min_max_simd(&data);
529            assert_eq!(rmin, min);
530            assert_eq!(rmax, max);
531        }
532
533        #[test]
534        fn test_binarization_invariants(src in prop::collection::vec(0..=255u8, 16), thresh in 0..=255u8) {
535            let mut dst = vec![0u8; 16];
536            let valid = vec![255u8; 16];
537            threshold_row_simd(&src, &mut dst, &[thresh; 16], &valid);
538
539            for (i, &s) in src.iter().enumerate() {
540                if s >= thresh {
541                    assert_eq!(dst[i], 255);
542                } else {
543                    assert_eq!(dst[i], 0);
544                }
545            }
546
547            // Test invalid tile (mask=0) should be white (255)
548            let invalid = vec![0u8; 16];
549            threshold_row_simd(&src, &mut dst, &[thresh; 16], &invalid);
550            for d in dst {
551                assert_eq!(d, 255);
552            }
553        }
554    }
555
556    #[test]
557    fn test_threshold_engine_e2e() {
558        let width = 16;
559        let height = 16;
560        let mut data = vec![128u8; width * height];
561        // Draw a black square in a white background area to test adaptive threshold
562        for y in 4..12 {
563            for x in 4..12 {
564                data[y * width + x] = 50;
565            }
566        }
567        for y in 0..height {
568            for x in 0..width {
569                if !(2..=14).contains(&x) || !(2..=14).contains(&y) {
570                    data[y * width + x] = 200;
571                }
572            }
573        }
574
575        let img = ImageView::new(&data, width, height, width).unwrap();
576        let engine = ThresholdEngine::new();
577        let arena = Bump::new();
578        let stats = engine.compute_tile_stats(&arena, &img);
579        let mut output = vec![0u8; width * height];
580        engine.apply_threshold(&arena, &img, &stats, &mut output);
581
582        // At (8,8), it should be black (0) because it's 50 and thresh should be around (50+200)/2 = 125
583        assert_eq!(output[8 * width + 8], 0);
584        // At (1,1), it should be white (255)
585        assert_eq!(output[width + 1], 255);
586    }
587
588    #[test]
589    fn test_threshold_with_decimation() {
590        let width = 32;
591        let height = 32;
592        let mut data = vec![200u8; width * height];
593        // Draw a black square (16x16) in the center
594        for y in 8..24 {
595            for x in 8..24 {
596                data[y * width + x] = 50;
597            }
598        }
599
600        let img = ImageView::new(&data, width, height, width).unwrap();
601
602        // Decimate by 2 -> 16x16
603        let mut decimated_data = vec![0u8; 16 * 16];
604        let decimated_img = img
605            .decimate_to(2, &mut decimated_data)
606            .expect("decimation failed");
607
608        assert_eq!(decimated_img.width, 16);
609        assert_eq!(decimated_img.height, 16);
610        let arena = Bump::new();
611        let engine = ThresholdEngine::new();
612        let stats = engine.compute_tile_stats(&arena, &decimated_img);
613        let mut output = vec![0u8; 16 * 16];
614        engine.apply_threshold(&arena, &decimated_img, &stats, &mut output);
615
616        // At (4,4) in decimated image (which is 8,8 in original), it should be black (0)
617        assert_eq!(output[4 * 16 + 4], 0);
618        // At (0,0) in decimated image, it should be white (255)
619        assert_eq!(output[0], 255);
620    }
621
622    // ========================================================================
623    // THRESHOLD ROBUSTNESS TESTS
624    // ========================================================================
625
626    use crate::config::TagFamily;
627    use crate::test_utils::{
628        TestImageParams, generate_test_image_with_params, measure_border_integrity,
629    };
630
631    /// Test threshold preserves tag structure at varying sizes (distance proxy).
632    /// Note: AprilTag 36h11 has 8x8 cells, so 4px/bit = 32px minimum.
633    #[test]
634    fn test_threshold_preserves_tag_structure_at_varying_sizes() {
635        let canvas_size = 640;
636        // Minimum 32px for 4 pixels per bit (AprilTag 36h11 = 8x8 cells)
637        let tag_sizes = [32, 48, 64, 100, 150, 200, 300];
638
639        for tag_size in tag_sizes {
640            let params = TestImageParams {
641                family: TagFamily::AprilTag36h11,
642                id: 0,
643                tag_size,
644                canvas_size,
645                ..Default::default()
646            };
647
648            let (data, corners) = generate_test_image_with_params(&params);
649            let img = ImageView::new(&data, canvas_size, canvas_size, canvas_size).unwrap();
650
651            let engine = ThresholdEngine::new();
652            let arena = Bump::new();
653            let stats = engine.compute_tile_stats(&arena, &img);
654            let mut binary = vec![0u8; canvas_size * canvas_size];
655            engine.apply_threshold(&arena, &img, &stats, &mut binary);
656
657            let integrity = measure_border_integrity(&binary, canvas_size, &corners);
658
659            // For tags >= 32px (4px/bit), we expect good binarization (>50% border detected)
660            assert!(
661                integrity > 0.5,
662                "Tag size {} failed: border integrity = {:.2}% (expected >50%)",
663                tag_size,
664                integrity * 100.0
665            );
666
667            println!(
668                "Tag size {:>3}px: border integrity = {:.1}%",
669                tag_size,
670                integrity * 100.0
671            );
672        }
673    }
674
675    /// Test threshold robustness to brightness and contrast variations.
676    #[test]
677    fn test_threshold_robustness_brightness_contrast() {
678        let canvas_size = 320;
679        let tag_size = 120;
680        let brightness_offsets = [-50, -25, 0, 25, 50];
681        let contrast_scales = [0.50, 0.75, 1.0, 1.25, 1.50];
682
683        for &brightness in &brightness_offsets {
684            for &contrast in &contrast_scales {
685                let params = TestImageParams {
686                    family: TagFamily::AprilTag36h11,
687                    id: 0,
688                    tag_size,
689                    canvas_size,
690                    brightness_offset: brightness,
691                    contrast_scale: contrast,
692                    ..Default::default()
693                };
694
695                let (data, corners) = generate_test_image_with_params(&params);
696                let img = ImageView::new(&data, canvas_size, canvas_size, canvas_size).unwrap();
697
698                let engine = ThresholdEngine::new();
699                let arena = Bump::new();
700                let stats = engine.compute_tile_stats(&arena, &img);
701                let mut binary = vec![0u8; canvas_size * canvas_size];
702                engine.apply_threshold(&arena, &img, &stats, &mut binary);
703
704                let integrity = measure_border_integrity(&binary, canvas_size, &corners);
705
706                // For moderate conditions, expect good integrity
707                let is_moderate = brightness.abs() <= 25 && contrast >= 0.75;
708                if is_moderate {
709                    assert!(
710                        integrity > 0.4,
711                        "Brightness {}, Contrast {:.2}: integrity {:.1}% too low",
712                        brightness,
713                        contrast,
714                        integrity * 100.0
715                    );
716                }
717
718                println!(
719                    "Brightness {:>3}, Contrast {:.2}: integrity = {:.1}%",
720                    brightness,
721                    contrast,
722                    integrity * 100.0
723                );
724            }
725        }
726    }
727
728    /// Test threshold robustness to varying noise levels.
729    #[test]
730    fn test_threshold_robustness_noise() {
731        let canvas_size = 320;
732        let tag_size = 120;
733        let noise_levels = [0.0, 5.0, 10.0, 15.0, 20.0, 30.0];
734
735        for &noise_sigma in &noise_levels {
736            let params = TestImageParams {
737                family: TagFamily::AprilTag36h11,
738                id: 0,
739                tag_size,
740                canvas_size,
741                noise_sigma,
742                ..Default::default()
743            };
744
745            let (data, corners) = generate_test_image_with_params(&params);
746            let img = ImageView::new(&data, canvas_size, canvas_size, canvas_size).unwrap();
747
748            let engine = ThresholdEngine::new();
749            let arena = Bump::new();
750            let stats = engine.compute_tile_stats(&arena, &img);
751            let mut binary = vec![0u8; canvas_size * canvas_size];
752            engine.apply_threshold(&arena, &img, &stats, &mut binary);
753
754            let integrity = measure_border_integrity(&binary, canvas_size, &corners);
755
756            // For noise <= 15, expect reasonable integrity
757            if noise_sigma <= 15.0 {
758                assert!(
759                    integrity > 0.45,
760                    "Noise σ={:.1}: integrity {:.1}% too low",
761                    noise_sigma,
762                    integrity * 100.0
763                );
764            }
765
766            println!(
767                "Noise σ={:>4.1}: integrity = {:.1}%",
768                noise_sigma,
769                integrity * 100.0
770            );
771        }
772    }
773
774    proptest! {
775        /// Fuzz test threshold with random combinations of parameters.
776        /// Minimum tag size 32px to ensure 4 pixels per bit.
777        #[test]
778        fn test_threshold_combined_conditions_no_panic(
779            tag_size in 32_usize..200,  // 32px = 4px/bit for AprilTag 36h11
780            brightness in -40_i16..40,
781            contrast in 0.6_f32..1.4,
782            noise in 0.0_f32..20.0
783        ) {
784            let canvas_size = 320;
785
786            // Skip invalid combinations (tag too big for canvas)
787            if tag_size >= canvas_size - 40 {
788                return Ok(());
789            }
790
791            let params = TestImageParams {
792                family: TagFamily::AprilTag36h11,
793                id: 0,
794                tag_size,
795                canvas_size,
796                noise_sigma: noise,
797                brightness_offset: brightness,
798                contrast_scale: contrast,
799            };
800
801            let (data, _corners) = generate_test_image_with_params(&params);
802            let img = ImageView::new(&data, canvas_size, canvas_size, canvas_size).unwrap();
803
804            let engine = ThresholdEngine::new();
805            let arena = Bump::new();
806            let stats = engine.compute_tile_stats(&arena, &img);
807            let mut binary = vec![0u8; canvas_size * canvas_size];
808
809            // Should not panic
810            engine.apply_threshold(&arena, &img, &stats, &mut binary);
811
812            // Basic sanity: output should have both black and white pixels
813            let black_count = binary.iter().filter(|&&p| p == 0).count();
814            let white_count = binary.iter().filter(|&&p| p == 255).count();
815
816            // Valid binary image should have both colors (only 0 and 255)
817            prop_assert!(black_count + white_count == binary.len(),
818                "Binary output contains non-binary values");
819
820            // With a tag present, we expect some black and white pixels
821            prop_assert!(black_count > 0, "No black pixels in output");
822            prop_assert!(white_count > 0, "No white pixels in output");
823        }
824    }
825}
826
827#[multiversion(targets = "simd")]
828fn compute_min_max_simd(data: &[u8]) -> (u8, u8) {
829    let mut min = 255u8;
830    let mut max = 0u8;
831    for &b in data {
832        min = min.min(b);
833        max = max.max(b);
834    }
835    (min, max)
836}
837
838// =============================================================================
839// INTEGRAL IMAGE-BASED ADAPTIVE THRESHOLD
840// =============================================================================
841//
842// This implements OpenCV-style ADAPTIVE_THRESH_MEAN_C using integral images:
843// 1. Compute integral image in O(W*H)
844// 2. For each pixel, compute local mean in O(1) using integral image
845// 3. Threshold: pixel < (local_mean - C) ? black : white
846//
847// This produces per-pixel adaptive thresholds for small tag detection.
848
849/// Compute integral image (cumulative sum) for fast box filter computation.
850///
851/// Uses a 2-pass parallel implementation for maximum throughput on modern multicore CPUs.
852/// The `integral` buffer must have size `(img.width + 1) * (img.height + 1)`.
853#[allow(clippy::needless_range_loop, clippy::items_after_statements)]
854pub fn compute_integral_image(img: &ImageView, integral: &mut [u64]) {
855    let w = img.width;
856    let h = img.height;
857    let stride = w + 1;
858
859    // Zero the first row efficiently
860    for x in 0..stride {
861        integral[x] = 0;
862    }
863
864    // use rayon::prelude::*;
865
866    // 1st Pass: Compute horizontal cumulative sums (prefix sum per row)
867    // This part is perfectly parallel.
868    integral
869        .par_chunks_exact_mut(stride)
870        .enumerate()
871        .skip(1)
872        .for_each(|(y_idx, row)| {
873            let y = y_idx - 1;
874            let src_row = img.get_row(y);
875            let mut sum = 0u64;
876            // row[0] is already 0
877            for x in 0..w {
878                sum += u64::from(src_row[x]);
879                row[x + 1] = sum;
880            }
881        });
882
883    // 2nd Pass: Vertical cumulative sums
884    // For large images, we process in vertical blocks to stay in cache.
885    const BLOCK_SIZE: usize = 128;
886    let num_blocks = stride.div_ceil(BLOCK_SIZE);
887
888    (0..num_blocks).into_par_iter().for_each(|b| {
889        let start_x = b * BLOCK_SIZE;
890        let end_x = (start_x + BLOCK_SIZE).min(stride);
891
892        // Initialize cumulative sum for this column block
893        // We use a small on-stack or small-vec if needed, but since BLOCK_SIZE is small (128),
894        // we can just use a fixed-size array if we want to avoid allocation entirely.
895        let mut col_sums = [0u64; BLOCK_SIZE];
896
897        // Safety: We are writing to unique columns in each parallel task.
898        unsafe {
899            let base_ptr = integral.as_ptr().cast_mut();
900            for y in 1..=h {
901                let row_ptr = base_ptr.add(y * stride + start_x);
902                for (i, val) in col_sums.iter_mut().enumerate().take(end_x - start_x) {
903                    let old_val = *row_ptr.add(i);
904                    let new_sum = old_val + *val;
905                    *row_ptr.add(i) = new_sum;
906                    *val = new_sum;
907                }
908            }
909        }
910    });
911}
912
913/// Apply per-pixel adaptive threshold using integral image.
914///
915/// Optimized with parallel processing and branchless thresholding.
916#[multiversion(targets = "simd")]
917/// Apply per-pixel adaptive threshold using integral image.
918///
919/// Optimized with parallel processing, interior-loop vectorization, and fixed-point arithmetic.
920#[multiversion(targets(
921    "x86_64+avx2+bmi1+bmi2+popcnt+lzcnt",
922    "x86_64+avx512f+avx512bw+avx512dq+avx512vl",
923    "aarch64+neon"
924))]
925pub fn adaptive_threshold_integral(
926    img: &ImageView,
927    integral: &[u64],
928    output: &mut [u8],
929    radius: usize,
930    c: i16,
931) {
932    let w = img.width;
933    let h = img.height;
934    let stride = w + 1;
935
936    // use rayon::prelude::*;
937
938    // Precompute interior area inverse (fixed-point 1.31)
939    let side = (2 * radius + 1) as u32;
940    let area = side * side;
941    let inv_area_fixed = ((1u64 << 31) / u64::from(area)) as u32;
942
943    (0..h).into_par_iter().for_each(|y| {
944        let y_offset = y * w;
945        let src_row = img.get_row(y);
946
947        // Safety: Unique row per thread
948        let dst_row = unsafe {
949            let ptr = output.as_ptr().cast_mut();
950            std::slice::from_raw_parts_mut(ptr.add(y_offset), w)
951        };
952
953        let y0 = y.saturating_sub(radius);
954        let y1 = (y + radius + 1).min(h);
955
956        // Define interior region for this row
957        let x_start = radius;
958        let x_end = w.saturating_sub(radius + 1);
959
960        // 1. Process Left Border
961        for x in 0..x_start.min(w) {
962            let x0 = 0; // saturating_sub(radius) is 0
963            let x1 = (x + radius + 1).min(w);
964            let actual_area = (x1 - x0) * (y1 - y0);
965
966            let i00 = integral[y0 * stride + x0];
967            let i01 = integral[y0 * stride + x1];
968            let i10 = integral[y1 * stride + x0];
969            let i11 = integral[y1 * stride + x1];
970
971            let sum = (i11 + i00) - (i01 + i10);
972            let mean = (sum / actual_area as u64) as i16;
973            let threshold = (mean - c).max(0) as u8;
974            dst_row[x] = if src_row[x] < threshold { 0 } else { 255 };
975        }
976
977        // 2. Process Interior (Vectorizable)
978        if x_end > x_start && y >= radius && y + radius < h {
979            let row00 = &integral[y0 * stride + (x_start - radius)..];
980            let row01 = &integral[y0 * stride + (x_start + radius + 1)..];
981            let row10 = &integral[y1 * stride + (x_start - radius)..];
982            let row11 = &integral[y1 * stride + (x_start + radius + 1)..];
983
984            let interior_src = &src_row[x_start..x_end];
985            let interior_dst = &mut dst_row[x_start..x_end];
986
987            for i in 0..(x_end - x_start) {
988                let sum = (row11[i] + row00[i]) - (row01[i] + row10[i]);
989                // Fixed-point division: (sum * inv_area) >> 31
990                let mean = ((sum * u64::from(inv_area_fixed)) >> 31) as i16;
991                let threshold = (mean - c).max(0) as u8;
992                interior_dst[i] = if interior_src[i] < threshold { 0 } else { 255 };
993            }
994        } else if x_end > x_start {
995            // Interior X but border Y
996            for x in x_start..x_end {
997                let x0 = x - radius;
998                let x1 = x + radius + 1;
999                let actual_area = (x1 - x0) * (y1 - y0);
1000
1001                let i00 = integral[y0 * stride + x0];
1002                let i01 = integral[y0 * stride + x1];
1003                let i10 = integral[y1 * stride + x0];
1004                let i11 = integral[y1 * stride + x1];
1005
1006                let sum = (i11 + i00) - (i01 + i10);
1007                let mean = (sum / actual_area as u64) as i16;
1008                let threshold = (mean - c).max(0) as u8;
1009                dst_row[x] = if src_row[x] < threshold { 0 } else { 255 };
1010            }
1011        }
1012
1013        // 3. Process Right Border
1014        for x in x_end.max(x_start)..w {
1015            let x0 = x.saturating_sub(radius);
1016            let x1 = w; // (x + radius + 1).min(w)
1017            let actual_area = (x1 - x0) * (y1 - y0);
1018
1019            let i00 = integral[y0 * stride + x0];
1020            let i01 = integral[y0 * stride + x1];
1021            let i10 = integral[y1 * stride + x0];
1022            let i11 = integral[y1 * stride + x1];
1023
1024            let sum = (i11 + i00) - (i01 + i10);
1025            let mean = (sum / actual_area as u64) as i16;
1026            let threshold = (mean - c).max(0) as u8;
1027            dst_row[x] = if src_row[x] < threshold { 0 } else { 255 };
1028        }
1029    });
1030}
1031
1032/// Fast adaptive threshold combining integral image approach with SIMD.
1033///
1034/// This is the main entry point for performance-oriented adaptive thresholding:
1035/// - Computes integral image once
1036/// - Applies per-pixel adaptive threshold with local mean
1037/// - Uses default parameters tuned for AprilTag detection
1038pub fn apply_adaptive_threshold_fast(img: &ImageView, output: &mut [u8]) {
1039    // OpenCV uses blockSize=13 (radius=6) and C=3 as good defaults
1040    apply_adaptive_threshold_with_params(img, output, 6, 3);
1041}
1042
1043/// Adaptive threshold with custom parameters.
1044pub fn apply_adaptive_threshold_with_params(
1045    img: &ImageView,
1046    output: &mut [u8],
1047    radius: usize,
1048    c: i16,
1049) {
1050    let mut integral = vec![0u64; (img.width + 1) * (img.height + 1)];
1051    compute_integral_image(img, &mut integral);
1052    adaptive_threshold_integral(img, &integral, output, radius, c);
1053}
1054
1055/// Apply per-pixel adaptive threshold with gradient-based window sizing.
1056///
1057/// Highly optimized using Parallel processing, precomputed LUTs, and branchless logic.
1058#[allow(clippy::too_many_arguments)]
1059#[multiversion(targets(
1060    "x86_64+avx2+bmi1+bmi2+popcnt+lzcnt",
1061    "x86_64+avx512f+avx512bw+avx512dq+avx512vl",
1062    "aarch64+neon"
1063))]
1064pub fn adaptive_threshold_gradient_window(
1065    img: &ImageView,
1066    gradient_map: &[u8],
1067    integral: &[u64],
1068    output: &mut [u8],
1069    min_radius: usize,
1070    max_radius: usize,
1071    gradient_threshold: u8,
1072    c: i16,
1073) {
1074    let w = img.width;
1075    let h = img.height;
1076    let stride = w + 1;
1077
1078    // Precompute radius and area reciprocal LUTs (fixed-point 1.31)
1079    let mut radius_lut = [0usize; 256];
1080    let mut inv_area_lut = [0u32; 256];
1081    let grad_thresh_f32 = f32::from(gradient_threshold);
1082
1083    for g in 0..256 {
1084        let r = if g as u8 >= gradient_threshold {
1085            min_radius
1086        } else {
1087            let t = g as f32 / grad_thresh_f32;
1088            let r = max_radius as f32 * (1.0 - t) + min_radius as f32 * t;
1089            r as usize
1090        };
1091        radius_lut[g] = r;
1092        let side = (2 * r + 1) as u32;
1093        let area = side * side;
1094        inv_area_lut[g] = ((1u64 << 31) / u64::from(area)) as u32;
1095    }
1096
1097    // use rayon::prelude::*;
1098
1099    (0..h).into_par_iter().for_each(|y| {
1100        let y_offset = y * w;
1101        let src_row = img.get_row(y);
1102
1103        // Safety: Unique row per thread
1104        let dst_row = unsafe {
1105            let ptr = output.as_ptr().cast_mut();
1106            std::slice::from_raw_parts_mut(ptr.add(y_offset), w)
1107        };
1108
1109        for x in 0..w {
1110            let grad = gradient_map[y_offset + x];
1111            let radius = radius_lut[grad as usize];
1112
1113            let y0 = y.saturating_sub(radius);
1114            let y1 = (y + radius + 1).min(h);
1115            let x0 = x.saturating_sub(radius);
1116            let x1 = (x + radius + 1).min(w);
1117
1118            let i00 = integral[y0 * stride + x0];
1119            let i01 = integral[y0 * stride + x1];
1120            let i10 = integral[y1 * stride + x0];
1121            let i11 = integral[y1 * stride + x1];
1122
1123            let sum = (i11 + i00) - (i01 + i10);
1124
1125            // Fixed-point mean computation
1126            let mean = if x >= radius && x + radius < w && y >= radius && y + radius < h {
1127                ((sum * u64::from(inv_area_lut[grad as usize])) >> 31) as i16
1128            } else {
1129                let actual_area = (x1 - x0) * (y1 - y0);
1130                (sum / actual_area as u64) as i16
1131            };
1132
1133            let threshold = (mean - c).max(0) as u8;
1134            dst_row[x] = if src_row[x] < threshold { 0 } else { 255 };
1135        }
1136    });
1137}
1138
1139/// Compute a map of local mean values.
1140///
1141/// Optimized with parallelism and vectorization.
1142#[multiversion(targets(
1143    "x86_64+avx2+bmi1+bmi2+popcnt+lzcnt",
1144    "x86_64+avx512f+avx512bw+avx512dq+avx512vl",
1145    "aarch64+neon"
1146))]
1147#[allow(clippy::cast_sign_loss, clippy::needless_range_loop)]
1148pub fn compute_threshold_map(
1149    img: &ImageView,
1150    integral: &[u64],
1151    output: &mut [u8],
1152    radius: usize,
1153    c: i16,
1154) {
1155    let w = img.width;
1156    let h = img.height;
1157    let stride = w + 1;
1158
1159    // use rayon::prelude::*; // Already imported at module level
1160
1161    // Precompute interior area inverse
1162    let side = (2 * radius + 1) as u32;
1163    let area = side * side;
1164    let inv_area_fixed = ((1u64 << 31) / u64::from(area)) as u32;
1165
1166    (0..h).into_par_iter().for_each(|y| {
1167        let y_offset = y * w;
1168
1169        // Safety: Unique row per thread
1170        let dst_row = unsafe {
1171            let ptr = output.as_ptr().cast_mut();
1172            std::slice::from_raw_parts_mut(ptr.add(y_offset), w)
1173        };
1174
1175        let y0 = y.saturating_sub(radius);
1176        let y1 = (y + radius + 1).min(h);
1177
1178        let x_start = radius;
1179        let x_end = w.saturating_sub(radius + 1);
1180
1181        // 1. Process Left Border
1182        for x in 0..x_start.min(w) {
1183            let x0 = 0;
1184            let x1 = (x + radius + 1).min(w);
1185            let actual_area = (x1 - x0) * (y1 - y0);
1186            let sum = (integral[y1 * stride + x1] + integral[y0 * stride + x0])
1187                - (integral[y0 * stride + x1] + integral[y1 * stride + x0]);
1188            let mean = (sum / actual_area as u64) as i16;
1189            dst_row[x] = (mean - c).clamp(0, 255) as u8;
1190        }
1191
1192        // 2. Process Interior (Vectorizable)
1193        if x_end > x_start && y >= radius && y + radius < h {
1194            let row00 = &integral[y0 * stride + (x_start - radius)..];
1195            let row01 = &integral[y0 * stride + (x_start + radius + 1)..];
1196            let row10 = &integral[y1 * stride + (x_start - radius)..];
1197            let row11 = &integral[y1 * stride + (x_start + radius + 1)..];
1198
1199            let interior_dst = &mut dst_row[x_start..x_end];
1200
1201            for i in 0..(x_end - x_start) {
1202                let sum = (row11[i] + row00[i]) - (row01[i] + row10[i]);
1203                let mean = ((sum * u64::from(inv_area_fixed)) >> 31) as i16;
1204                interior_dst[i] = (mean - c).clamp(0, 255) as u8;
1205            }
1206        }
1207
1208        // 3. Process Right Border
1209        for x in x_end.max(x_start)..w {
1210            let x0 = x.saturating_sub(radius);
1211            let x1 = w;
1212            let actual_area = (x1 - x0) * (y1 - y0);
1213            let sum = (integral[y1 * stride + x1] + integral[y0 * stride + x0])
1214                - (integral[y0 * stride + x1] + integral[y1 * stride + x0]);
1215            let mean = (sum / actual_area as u64) as i16;
1216            dst_row[x] = (mean - c).clamp(0, 255) as u8;
1217        }
1218    });
1219}