locus-core 0.4.0

//! Adaptive thresholding and image binarization.
//!
//! This module provides the first stage of the detection pipeline, converting grayscale
//! input into binary images while adapting to local lighting conditions.
//!
//! It features two primary implementations:
//! 1. **Tile-based Thresholding**: Blazing fast approach using local tile stats.
//! 2. **Integral Image Thresholding**: Per-pixel adaptive thresholding for small features.

#![allow(unsafe_code, clippy::cast_sign_loss)]
use crate::config::DetectorConfig;
use crate::image::ImageView;
use bumpalo::Bump;
use bumpalo::collections::Vec as BumpVec;
use multiversion::multiversion;
use rayon::prelude::*;

/// Statistics for a single threshold tile.
#[derive(Clone, Copy, Debug, Default)]
pub struct TileStats {
    /// Minimum pixel value in the tile.
    pub min: u8,
    /// Maximum pixel value in the tile.
    pub max: u8,
}

/// Adaptive thresholding engine using tile-based stats.
pub struct ThresholdEngine {
    /// Size of the tiles used for local thresholding statistics.
    pub tile_size: usize,
    /// Minimum intensity range for a tile to be considered valid.
    pub min_range: u8,
}

impl Default for ThresholdEngine {
    fn default() -> Self {
        Self::new()
    }
}

impl ThresholdEngine {
    /// Create a new ThresholdEngine with default settings.
    #[must_use]
    pub fn new() -> Self {
        Self {
            tile_size: 8,  // Standard 8x8 tiles
            min_range: 10, // Match DetectorConfig::default()
        }
    }

    /// Create a ThresholdEngine from detector configuration.
    #[must_use]
    pub fn from_config(config: &DetectorConfig) -> Self {
        Self {
            tile_size: config.threshold_tile_size,
            min_range: config.threshold_min_range,
        }
    }

    /// Compute min/max statistics for each tile in the image.
    /// Optimized with SIMD-friendly memory access patterns and subsampling (stride 2).
    #[must_use]
    #[tracing::instrument(skip_all, name = "pipeline::threshold_compute_stats")]
    pub fn compute_tile_stats<'a>(
        &self,
        arena: &'a Bump,
        img: &ImageView,
    ) -> BumpVec<'a, TileStats> {
        let ts = self.tile_size;
        let tiles_wide = img.width / ts;
        let tiles_high = img.height / ts;
        let mut stats = BumpVec::with_capacity_in(tiles_wide * tiles_high, arena);
        stats.resize(tiles_wide * tiles_high, TileStats { min: 255, max: 0 });

        stats
            .par_chunks_mut(tiles_wide)
            .enumerate()
            .for_each(|(ty, stats_row)| {
                // Subsampling: Only process every other row within a tile (stride 2)
                // This statistically approximates the min/max sufficient for thresholding
                for dy in 0..ts {
                    let py = ty * ts + dy;
                    let src_row = img.get_row(py);

                    // Process all tiles in this row with SIMD-friendly min/max
                    compute_row_tile_stats_simd(src_row, stats_row, ts);
                }
            });
        stats
    }

    /// Apply adaptive thresholding to the image.
    /// Optimized with pre-expanded threshold maps and vectorized row processing.
    #[allow(clippy::too_many_lines, clippy::needless_range_loop)]
    #[allow(dead_code)]
    pub fn apply_threshold(
        &self,
        arena: &Bump,
        img: &ImageView,
        stats: &[TileStats],
        output: &mut [u8],
    ) {
        let ts = self.tile_size;
        let tiles_wide = img.width / ts;
        let tiles_high = img.height / ts;

        let mut tile_thresholds = BumpVec::with_capacity_in(tiles_wide * tiles_high, arena);
        tile_thresholds.resize(tiles_wide * tiles_high, 0u8);
        let mut tile_valid = BumpVec::with_capacity_in(tiles_wide * tiles_high, arena);
        tile_valid.resize(tiles_wide * tiles_high, 0u8);

        tile_thresholds
            .par_chunks_mut(tiles_wide)
            .enumerate()
            .for_each(|(ty, t_row)| {
                let y_start = ty.saturating_sub(1);
                let y_end = (ty + 1).min(tiles_high - 1);

                for tx in 0..tiles_wide {
                    let mut nmin = 255u8;
                    let mut nmax = 0u8;

                    let x_start = tx.saturating_sub(1);
                    let x_end = (tx + 1).min(tiles_wide - 1);

                    for ny in y_start..=y_end {
                        let row_off = ny * tiles_wide;
                        for nx in x_start..=x_end {
                            let s = stats[row_off + nx];
                            if s.min < nmin {
                                nmin = s.min;
                            }
                            if s.max > nmax {
                                nmax = s.max;
                            }
                        }
                    }

                    let t_idx = tx;
                    let res = ((u16::from(nmin) + u16::from(nmax)) >> 1) as u8;

                    t_row[t_idx] = res;
                }
            });

        // Compute tile_valid (can be done in same loop above or separate)
        for ty in 0..tiles_high {
            for tx in 0..tiles_wide {
                let mut nmin = 255;
                let mut nmax = 0;
                let y_start = ty.saturating_sub(1);
                let y_end = (ty + 1).min(tiles_high - 1);
                let x_start = tx.saturating_sub(1);
                let x_end = (tx + 1).min(tiles_wide - 1);

                for ny in y_start..=y_end {
                    let row_off = ny * tiles_wide;
                    for nx in x_start..=x_end {
                        let s = stats[row_off + nx];
                        if s.min < nmin {
                            nmin = s.min;
                        }
                        if s.max > nmax {
                            nmax = s.max;
                        }
                    }
                }
                let idx = ty * tiles_wide + tx;
                tile_valid[idx] = if nmax.saturating_sub(nmin) < self.min_range {
                    0
                } else {
                    255
                };
            }
        }

        // --- Propagation Pass ---
        // Fill thresholds for invalid tiles from their neighbors to stay
        // consistent within large uniform regions.
        for _ in 0..2 {
            // 2 iterations are usually enough for local consistency
            for ty in 0..tiles_high {
                for tx in 0..tiles_wide {
                    let idx = ty * tiles_wide + tx;
                    if tile_valid[idx] == 0 {
                        let mut sum_thresh = 0u32;
                        let mut count = 0u32;

                        let y_start = ty.saturating_sub(1);
                        let y_end = (ty + 1).min(tiles_high - 1);
                        let x_start = tx.saturating_sub(1);
                        let x_end = (tx + 1).min(tiles_wide - 1);

                        for ny in y_start..=y_end {
                            let row_off = ny * tiles_wide;
                            for nx in x_start..=x_end {
                                let n_idx = row_off + nx;
                                if tile_valid[n_idx] > 0 {
                                    sum_thresh += u32::from(tile_thresholds[n_idx]);
                                    count += 1;
                                }
                            }
                        }

                        if count > 0 {
                            tile_thresholds[idx] = (sum_thresh / count) as u8;
                            tile_valid[idx] = 128; // Partial valid (propagated)
                        }
                    }
                }
            }
        }

        let thresholds_slice = tile_thresholds.as_slice();
        let valid_slice = tile_valid.as_slice();

        output
            .par_chunks_mut(ts * img.width)
            .enumerate()
            .for_each_init(
                || (vec![0u8; img.width], vec![0u8; img.width]),
                |(row_thresholds, row_valid), (ty, output_tile_rows)| {
                    if ty >= tiles_high {
                        return;
                    }
                    // Zero-fill the trailing pixels beyond `tiles_wide * ts`: the
                    // per-tile loop below only writes indices `[0, tiles_wide * ts)`
                    // but `threshold_row_simd` reads the full `img.width`.
                    row_thresholds.fill(0);
                    row_valid.fill(0);

                    // Expand tile stats to row buffers
                    for tx in 0..tiles_wide {
                        let idx = ty * tiles_wide + tx;
                        let thresh = thresholds_slice[idx];
                        let valid = valid_slice[idx];
                        for i in 0..ts {
                            row_thresholds[tx * ts + i] = thresh;
                            row_valid[tx * ts + i] = valid;
                        }
                    }

                    for dy in 0..ts {
                        let py = ty * ts + dy;
                        let src_row = img.get_row(py);
                        let dst_row = &mut output_tile_rows[dy * img.width..(dy + 1) * img.width];

                        threshold_row_simd(src_row, dst_row, row_thresholds, row_valid);
                    }
                },
            );
    }

    /// Apply adaptive thresholding and return both binary and threshold maps.
    ///
    /// This is needed for threshold-model-aware segmentation, which uses the
    /// per-pixel threshold values to connect pixels by their deviation sign.
    #[allow(clippy::needless_range_loop)]
    #[tracing::instrument(skip_all, name = "pipeline::threshold_apply_map")]
    pub fn apply_threshold_with_map(
        &self,
        arena: &Bump,
        img: &ImageView,
        stats: &[TileStats],
        binary_output: &mut [u8],
        threshold_output: &mut [u8],
    ) {
        let ts = self.tile_size;
        let tiles_wide = img.width / ts;
        let tiles_high = img.height / ts;

        let mut tile_thresholds = BumpVec::with_capacity_in(tiles_wide * tiles_high, arena);
        tile_thresholds.resize(tiles_wide * tiles_high, 0u8);
        let mut tile_valid = BumpVec::with_capacity_in(tiles_wide * tiles_high, arena);
        tile_valid.resize(tiles_wide * tiles_high, 0u8);

        tile_thresholds
            .par_chunks_mut(tiles_wide)
            .enumerate()
            .for_each(|(ty, t_row)| {
                let y_start = ty.saturating_sub(1);
                let y_end = (ty + 1).min(tiles_high - 1);

                for tx in 0..tiles_wide {
                    let mut nmin = 255u8;
                    let mut nmax = 0u8;

                    let x_start = tx.saturating_sub(1);
                    let x_end = (tx + 1).min(tiles_wide - 1);

                    for ny in y_start..=y_end {
                        let row_off = ny * tiles_wide;
                        for nx in x_start..=x_end {
                            let s = stats[row_off + nx];
                            if s.min < nmin {
                                nmin = s.min;
                            }
                            if s.max > nmax {
                                nmax = s.max;
                            }
                        }
                    }

                    t_row[tx] = ((u16::from(nmin) + u16::from(nmax)) >> 1) as u8;
                }
            });

        // Compute tile_valid
        for ty in 0..tiles_high {
            for tx in 0..tiles_wide {
                let mut nmin = 255;
                let mut nmax = 0;
                let y_start = ty.saturating_sub(1);
                let y_end = (ty + 1).min(tiles_high - 1);
                let x_start = tx.saturating_sub(1);
                let x_end = (tx + 1).min(tiles_wide - 1);

                for ny in y_start..=y_end {
                    let row_off = ny * tiles_wide;
                    for nx in x_start..=x_end {
                        let s = stats[row_off + nx];
                        if s.min < nmin {
                            nmin = s.min;
                        }
                        if s.max > nmax {
                            nmax = s.max;
                        }
                    }
                }
                let idx = ty * tiles_wide + tx;
                tile_valid[idx] = if nmax.saturating_sub(nmin) < self.min_range {
                    0
                } else {
                    255
                };
            }
        }

        // Propagation pass
        /*
        for _ in 0..2 {
            for ty in 0..tiles_high {
                for tx in 0..tiles_wide {
                    let idx = ty * tiles_wide + tx;
                    if tile_valid[idx] == 0 {
                        let mut sum_thresh = 0u32;
                        let mut count = 0u32;

                        let y_start = ty.saturating_sub(1);
                        let y_end = (ty + 1).min(tiles_high - 1);
                        let x_start = tx.saturating_sub(1);
                        let x_end = (tx + 1).min(tiles_wide - 1);

                        for ny in y_start..=y_end {
                            let row_off = ny * tiles_wide;
                            for nx in x_start..=x_end {
                                let n_idx = row_off + nx;
                                if tile_valid[n_idx] > 0 {
                                    sum_thresh += u32::from(tile_thresholds[n_idx]);
                                    count += 1;
                                }
                            }
                        }

                        if count > 0 {
                            tile_thresholds[idx] = (sum_thresh / count) as u8;
                            tile_valid[idx] = 128;
                        }
                    }
                }
            }
        }
        */

        // Write thresholds and binary output in parallel
        let thresholds_slice = tile_thresholds.as_slice();
        let valid_slice = tile_valid.as_slice();

        binary_output
            .par_chunks_mut(ts * img.width)
            .enumerate()
            .for_each_init(
                || (vec![0u8; img.width], vec![0u8; img.width]),
                |(row_thresholds, row_valid), (ty, bin_tile_rows)| {
                    if ty >= tiles_high {
                        return;
                    }
                    // SAFETY: `binary_output.par_chunks_mut(ts * img.width)`
                    // yields one `ty` per worker; the parallel `threshold_output`
                    // tile slice for the same `ty` is therefore disjoint from
                    // every other worker's write set. The `ty < tiles_high`
                    // guard above keeps `ty * ts * img.width + ts * img.width`
                    // within the original `threshold_output` length.
                    let thresh_tile_rows = unsafe {
                        let ptr = threshold_output.as_ptr().cast_mut();
                        std::slice::from_raw_parts_mut(ptr.add(ty * ts * img.width), ts * img.width)
                    };

                    row_thresholds.fill(0);
                    row_valid.fill(0);

                    for tx in 0..tiles_wide {
                        let idx = ty * tiles_wide + tx;
                        let thresh = thresholds_slice[idx];
                        let valid = valid_slice[idx];
                        for i in 0..ts {
                            row_thresholds[tx * ts + i] = thresh;
                            row_valid[tx * ts + i] = valid;
                        }
                    }

                    for dy in 0..ts {
                        let py = ty * ts + dy;
                        let src_row = img.get_row(py);

                        // Write binary output
                        let bin_row = &mut bin_tile_rows[dy * img.width..(dy + 1) * img.width];
                        threshold_row_simd(src_row, bin_row, row_thresholds, row_valid);

                        // Write threshold map
                        thresh_tile_rows[dy * img.width..(dy + 1) * img.width]
                            .copy_from_slice(row_thresholds);
                    }
                },
            );
    }
}

#[multiversion(targets(
    "x86_64+avx2+bmi1+bmi2+popcnt+lzcnt",
    "x86_64+avx512f+avx512bw+avx512dq+avx512vl",
    "aarch64+neon"
))]
fn compute_row_tile_stats_simd(src_row: &[u8], stats: &mut [TileStats], tile_size: usize) {
    let chunks = src_row.chunks_exact(tile_size);
    for (chunk, stat) in chunks.zip(stats.iter_mut()) {
        let mut rmin = stat.min;
        let mut rmax = stat.max;

        // Hint to compiler for vectorization
        for &p in chunk {
            rmin = rmin.min(p);
            rmax = rmax.max(p);
        }

        stat.min = rmin;
        stat.max = rmax;
    }
}

#[multiversion(targets(
    "x86_64+avx2+bmi1+bmi2+popcnt+lzcnt",
    "x86_64+avx512f+avx512bw+avx512dq+avx512vl",
    "aarch64+neon"
))]
fn threshold_row_simd(src: &[u8], dst: &mut [u8], thresholds: &[u8], valid_mask: &[u8]) {
    let len = src.len();
    // Use chunks_exact to help compiler vectorize (e.g. into 16 or 32-byte chunks)
    let src_chunks = src.chunks_exact(16);
    let dst_chunks = dst.chunks_exact_mut(16);
    let thresh_chunks = thresholds.chunks_exact(16);
    let mask_chunks = valid_mask.chunks_exact(16);

    for (((s_c, d_c), t_c), m_c) in src_chunks
        .zip(dst_chunks)
        .zip(thresh_chunks)
        .zip(mask_chunks)
    {
        for i in 0..16 {
            let s = s_c[i];
            let t = t_c[i];
            let v = m_c[i];
            // Nested if structure often helps compiler with CMV/Masks
            d_c[i] = if v > 0 {
                if s < t { 0 } else { 255 }
            } else {
                255
            };
        }
    }

    // Handle tail
    let processed = (len / 16) * 16;
    for i in processed..len {
        let s = src[i];
        let t = thresholds[i];
        let v = valid_mask[i];
        dst[i] = if v > 0 {
            if s < t { 0 } else { 255 }
        } else {
            255
        };
    }
}

#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    clippy::naive_bytecount,
    clippy::cast_possible_truncation,
    clippy::cast_sign_loss
)]
mod tests {
    use super::*;
    use proptest::prelude::*;

    proptest! {
        #[test]
        fn test_threshold_invariants(data in prop::collection::vec(0..=255u8, 16)) {
            let mut min = 255u8;
            let mut max = 0u8;
            for &b in &data {
                if b < min { min = b; }
                if b > max { max = b; }
            }
            let (rmin, rmax) = compute_min_max_simd(&data);
            assert_eq!(rmin, min);
            assert_eq!(rmax, max);
        }

        #[test]
        fn test_binarization_invariants(src in prop::collection::vec(0..=255u8, 16), thresh in 0..=255u8) {
            let mut dst = vec![0u8; 16];
            let valid = vec![255u8; 16];
            threshold_row_simd(&src, &mut dst, &[thresh; 16], &valid);

            for (i, &s) in src.iter().enumerate() {
                if s >= thresh {
                    assert_eq!(dst[i], 255);
                } else {
                    assert_eq!(dst[i], 0);
                }
            }

            // Test invalid tile (mask=0) should be white (255)
            let invalid = vec![0u8; 16];
            threshold_row_simd(&src, &mut dst, &[thresh; 16], &invalid);
            for d in dst {
                assert_eq!(d, 255);
            }
        }
    }

    #[test]
    fn test_threshold_engine_e2e() {
        let width = 16;
        let height = 16;
        let mut data = vec![128u8; width * height];
        // Draw a black square in a white background area to test adaptive threshold
        for y in 4..12 {
            for x in 4..12 {
                data[y * width + x] = 50;
            }
        }
        for y in 0..height {
            for x in 0..width {
                if !(2..=14).contains(&x) || !(2..=14).contains(&y) {
                    data[y * width + x] = 200;
                }
            }
        }

        let img = ImageView::new(&data, width, height, width).unwrap();
        let engine = ThresholdEngine::new();
        let arena = Bump::new();
        let stats = engine.compute_tile_stats(&arena, &img);
        let mut output = vec![0u8; width * height];
        engine.apply_threshold(&arena, &img, &stats, &mut output);

        // At (8,8), it should be black (0) because it's 50 and thresh should be around (50+200)/2 = 125
        assert_eq!(output[8 * width + 8], 0);
        // At (1,1), it should be white (255)
        assert_eq!(output[width + 1], 255);
    }

    #[test]
    fn test_threshold_with_decimation() {
        let width = 32;
        let height = 32;
        let mut data = vec![200u8; width * height];
        // Draw a black square (16x16) in the center
        for y in 8..24 {
            for x in 8..24 {
                data[y * width + x] = 50;
            }
        }

        let img = ImageView::new(&data, width, height, width).unwrap();

        // Decimate by 2 -> 16x16
        let mut decimated_data = vec![0u8; 16 * 16];
        let decimated_img = img
            .decimate_to(2, &mut decimated_data)
            .expect("decimation failed");

        assert_eq!(decimated_img.width, 16);
        assert_eq!(decimated_img.height, 16);
        let arena = Bump::new();
        let engine = ThresholdEngine::new();
        let stats = engine.compute_tile_stats(&arena, &decimated_img);
        let mut output = vec![0u8; 16 * 16];
        engine.apply_threshold(&arena, &decimated_img, &stats, &mut output);

        // At (4,4) in decimated image (which is 8,8 in original), it should be black (0)
        assert_eq!(output[4 * 16 + 4], 0);
        // At (0,0) in decimated image, it should be white (255)
        assert_eq!(output[0], 255);
    }

    // ========================================================================
    // THRESHOLD ROBUSTNESS TESTS
    // ========================================================================

    use crate::config::TagFamily;
    use crate::test_utils::{
        TestImageParams, generate_test_image_with_params, measure_border_integrity,
    };

    /// Test threshold preserves tag structure at varying sizes (distance proxy).
    /// Note: AprilTag 36h11 has 8x8 cells, so 4px/bit = 32px minimum.
    #[test]
    fn test_threshold_preserves_tag_structure_at_varying_sizes() {
        let canvas_size = 640;
        // Minimum 32px for 4 pixels per bit (AprilTag 36h11 = 8x8 cells)
        let tag_sizes = [32, 48, 64, 100, 150, 200, 300];

        for tag_size in tag_sizes {
            let params = TestImageParams {
                family: TagFamily::AprilTag36h11,
                id: 0,
                tag_size,
                canvas_size,
                ..Default::default()
            };

            let (data, corners) = generate_test_image_with_params(&params);
            let img = ImageView::new(&data, canvas_size, canvas_size, canvas_size).unwrap();

            let engine = ThresholdEngine::new();
            let arena = Bump::new();
            let stats = engine.compute_tile_stats(&arena, &img);
            let mut binary = vec![0u8; canvas_size * canvas_size];
            engine.apply_threshold(&arena, &img, &stats, &mut binary);

            let integrity = measure_border_integrity(&binary, canvas_size, &corners);

            // For tags >= 32px (4px/bit), we expect good binarization (>50% border detected)
            assert!(
                integrity > 0.5,
                "Tag size {} failed: border integrity = {:.2}% (expected >50%)",
                tag_size,
                integrity * 100.0
            );

            println!(
                "Tag size {:>3}px: border integrity = {:.1}%",
                tag_size,
                integrity * 100.0
            );
        }
    }

    /// Test threshold robustness to brightness and contrast variations.
    #[test]
    fn test_threshold_robustness_brightness_contrast() {
        let canvas_size = 320;
        let tag_size = 120;
        let brightness_offsets = [-50, -25, 0, 25, 50];
        let contrast_scales = [0.50, 0.75, 1.0, 1.25, 1.50];

        for &brightness in &brightness_offsets {
            for &contrast in &contrast_scales {
                let params = TestImageParams {
                    family: TagFamily::AprilTag36h11,
                    id: 0,
                    tag_size,
                    canvas_size,
                    brightness_offset: brightness,
                    contrast_scale: contrast,
                    ..Default::default()
                };

                let (data, corners) = generate_test_image_with_params(&params);
                let img = ImageView::new(&data, canvas_size, canvas_size, canvas_size).unwrap();

                let engine = ThresholdEngine::new();
                let arena = Bump::new();
                let stats = engine.compute_tile_stats(&arena, &img);
                let mut binary = vec![0u8; canvas_size * canvas_size];
                engine.apply_threshold(&arena, &img, &stats, &mut binary);

                let integrity = measure_border_integrity(&binary, canvas_size, &corners);

                // For moderate conditions, expect good integrity
                let is_moderate = brightness.abs() <= 25 && contrast >= 0.75;
                if is_moderate {
                    assert!(
                        integrity > 0.4,
                        "Brightness {}, Contrast {:.2}: integrity {:.1}% too low",
                        brightness,
                        contrast,
                        integrity * 100.0
                    );
                }

                println!(
                    "Brightness {:>3}, Contrast {:.2}: integrity = {:.1}%",
                    brightness,
                    contrast,
                    integrity * 100.0
                );
            }
        }
    }

    /// Test threshold robustness to varying noise levels.
    #[test]
    fn test_threshold_robustness_noise() {
        let canvas_size = 320;
        let tag_size = 120;
        let noise_levels = [0.0, 5.0, 10.0, 15.0, 20.0, 30.0];

        for &noise_sigma in &noise_levels {
            let params = TestImageParams {
                family: TagFamily::AprilTag36h11,
                id: 0,
                tag_size,
                canvas_size,
                noise_sigma,
                ..Default::default()
            };

            let (data, corners) = generate_test_image_with_params(&params);
            let img = ImageView::new(&data, canvas_size, canvas_size, canvas_size).unwrap();

            let engine = ThresholdEngine::new();
            let arena = Bump::new();
            let stats = engine.compute_tile_stats(&arena, &img);
            let mut binary = vec![0u8; canvas_size * canvas_size];
            engine.apply_threshold(&arena, &img, &stats, &mut binary);

            let integrity = measure_border_integrity(&binary, canvas_size, &corners);

            // For noise <= 15, expect reasonable integrity
            if noise_sigma <= 15.0 {
                assert!(
                    integrity > 0.45,
                    "Noise σ={:.1}: integrity {:.1}% too low",
                    noise_sigma,
                    integrity * 100.0
                );
            }

            println!(
                "Noise σ={:>4.1}: integrity = {:.1}%",
                noise_sigma,
                integrity * 100.0
            );
        }
    }

    proptest! {
        /// Fuzz test threshold with random combinations of parameters.
        /// Minimum tag size 32px to ensure 4 pixels per bit.
        #[test]
        fn test_threshold_combined_conditions_no_panic(
            tag_size in 32_usize..200,  // 32px = 4px/bit for AprilTag 36h11
            brightness in -40_i16..40,
            contrast in 0.6_f32..1.4,
            noise in 0.0_f32..20.0
        ) {
            let canvas_size = 320;

            // Skip invalid combinations (tag too big for canvas)
            if tag_size >= canvas_size - 40 {
                return Ok(());
            }

            let params = TestImageParams {
                family: TagFamily::AprilTag36h11,
                id: 0,
                tag_size,
                canvas_size,
                noise_sigma: noise,
                brightness_offset: brightness,
                contrast_scale: contrast,
            };

            let (data, _corners) = generate_test_image_with_params(&params);
            let img = ImageView::new(&data, canvas_size, canvas_size, canvas_size).unwrap();

            let engine = ThresholdEngine::new();
            let arena = Bump::new();
            let stats = engine.compute_tile_stats(&arena, &img);
            let mut binary = vec![0u8; canvas_size * canvas_size];

            // Should not panic
            engine.apply_threshold(&arena, &img, &stats, &mut binary);

            // Basic sanity: output should have both black and white pixels
            let black_count = binary.iter().filter(|&&p| p == 0).count();
            let white_count = binary.iter().filter(|&&p| p == 255).count();

            // Valid binary image should have both colors (only 0 and 255)
            prop_assert!(black_count + white_count == binary.len(),
                "Binary output contains non-binary values");

            // With a tag present, we expect some black and white pixels
            prop_assert!(black_count > 0, "No black pixels in output");
            prop_assert!(white_count > 0, "No white pixels in output");
        }
    }
}

#[multiversion(targets = "simd")]
fn compute_min_max_simd(data: &[u8]) -> (u8, u8) {
    let mut min = 255u8;
    let mut max = 0u8;
    for &b in data {
        min = min.min(b);
        max = max.max(b);
    }
    (min, max)
}

// =============================================================================
// INTEGRAL IMAGE-BASED ADAPTIVE THRESHOLD
// =============================================================================
//
// This implements OpenCV-style ADAPTIVE_THRESH_MEAN_C using integral images:
// 1. Compute integral image in O(W*H)
// 2. For each pixel, compute local mean in O(1) using integral image
// 3. Threshold: pixel < (local_mean - C) ? black : white
//
// This produces per-pixel adaptive thresholds for small tag detection.

/// Compute integral image (cumulative sum) for fast box filter computation.
///
/// Uses a 2-pass parallel implementation for maximum throughput on modern multicore CPUs.
/// The `integral` buffer must have size `(img.width + 1) * (img.height + 1)`.
#[allow(clippy::needless_range_loop, clippy::items_after_statements)]
#[allow(dead_code)]
pub fn compute_integral_image(img: &ImageView, integral: &mut [u64]) {
    let w = img.width;
    let h = img.height;
    let stride = w + 1;

    // Zero the first row efficiently
    for x in 0..stride {
        integral[x] = 0;
    }

    // use rayon::prelude::*;

    // 1st Pass: Compute horizontal cumulative sums (prefix sum per row)
    // This part is perfectly parallel.
    integral
        .par_chunks_exact_mut(stride)
        .enumerate()
        .skip(1)
        .for_each(|(y_idx, row)| {
            let y = y_idx - 1;
            let src_row = img.get_row(y);
            let mut sum = 0u64;
            // row[0] is already 0
            for x in 0..w {
                sum += u64::from(src_row[x]);
                row[x + 1] = sum;
            }
        });

    // 2nd Pass: Vertical cumulative sums
    // For large images, we process in vertical blocks to stay in cache.
    const BLOCK_SIZE: usize = 128;
    let num_blocks = stride.div_ceil(BLOCK_SIZE);

    (0..num_blocks).into_par_iter().for_each(|b| {
        let start_x = b * BLOCK_SIZE;
        let end_x = (start_x + BLOCK_SIZE).min(stride);

        // Initialize cumulative sum for this column block
        // We use a small on-stack or small-vec if needed, but since BLOCK_SIZE is small (128),
        // we can just use a fixed-size array if we want to avoid allocation entirely.
        let mut col_sums = [0u64; BLOCK_SIZE];

        // SAFETY: `(0..num_blocks).into_par_iter()` partitions the column
        // range into BLOCK_SIZE-wide stripes; each rayon worker handles one
        // `b`, so the `[start_x, end_x)` column window for a given `b` is
        // disjoint from every other worker's window. `integral` length is
        // `(h + 1) * stride`, so `y * stride + start_x + i` for
        // `y ∈ [1, h], i < end_x - start_x` stays in-bounds. The outer
        // `par_iter` borrows `integral` mutably for its full lifetime, so
        // no other reader exists.
        unsafe {
            let base_ptr = integral.as_ptr().cast_mut();
            for y in 1..=h {
                let row_ptr = base_ptr.add(y * stride + start_x);
                for (i, val) in col_sums.iter_mut().enumerate().take(end_x - start_x) {
                    let old_val = *row_ptr.add(i);
                    let new_sum = old_val + *val;
                    *row_ptr.add(i) = new_sum;
                    *val = new_sum;
                }
            }
        }
    });
}

/// Apply per-pixel adaptive threshold using integral image.
///
/// Optimized with parallel processing and branchless thresholding.
#[multiversion(targets = "simd")]
/// Apply per-pixel adaptive threshold using integral image.
///
/// Optimized with parallel processing, interior-loop vectorization, and fixed-point arithmetic.
#[multiversion(targets(
    "x86_64+avx2+bmi1+bmi2+popcnt+lzcnt",
    "x86_64+avx512f+avx512bw+avx512dq+avx512vl",
    "aarch64+neon"
))]
#[tracing::instrument(skip_all, name = "pipeline::threshold_integral")]
pub fn adaptive_threshold_integral(
    img: &ImageView,
    integral: &[u64],
    output: &mut [u8],
    radius: usize,
    c: i16,
) {
    let w = img.width;
    let h = img.height;
    let stride = w + 1;

    // use rayon::prelude::*;

    // Precompute interior area inverse (fixed-point 1.31)
    let side = (2 * radius + 1) as u32;
    let area = side * side;
    let inv_area_fixed = ((1u64 << 31) / u64::from(area)) as u32;

    (0..h).into_par_iter().for_each(|y| {
        let y_offset = y * w;
        let src_row = img.get_row(y);

        // SAFETY: `(0..h).into_par_iter()` yields each `y` exactly once
        // across rayon workers, so the `[y * w, y * w + w)` slice is
        // disjoint from every other worker's slice. `output` length is
        // `h * w`, so the slice is in-bounds. The outer `par_iter` borrows
        // `output` mutably for its lifetime, so no concurrent reader exists.
        let dst_row = unsafe {
            let ptr = output.as_ptr().cast_mut();
            std::slice::from_raw_parts_mut(ptr.add(y_offset), w)
        };

        let y0 = y.saturating_sub(radius);
        let y1 = (y + radius + 1).min(h);

        // Define interior region for this row
        let x_start = radius;
        let x_end = w.saturating_sub(radius + 1);

        // 1. Process Left Border
        for x in 0..x_start.min(w) {
            let x0 = 0; // saturating_sub(radius) is 0
            let x1 = (x + radius + 1).min(w);
            let actual_area = (x1 - x0) * (y1 - y0);

            let i00 = integral[y0 * stride + x0];
            let i01 = integral[y0 * stride + x1];
            let i10 = integral[y1 * stride + x0];
            let i11 = integral[y1 * stride + x1];

            let sum = (i11 + i00) - (i01 + i10);
            let mean = (sum / actual_area as u64) as i16;
            let threshold = (mean - c).max(0) as u8;
            dst_row[x] = if src_row[x] < threshold { 0 } else { 255 };
        }

        // 2. Process Interior (Vectorizable)
        if x_end > x_start && y >= radius && y + radius < h {
            let row00 = &integral[y0 * stride + (x_start - radius)..];
            let row01 = &integral[y0 * stride + (x_start + radius + 1)..];
            let row10 = &integral[y1 * stride + (x_start - radius)..];
            let row11 = &integral[y1 * stride + (x_start + radius + 1)..];

            let interior_src = &src_row[x_start..x_end];
            let interior_dst = &mut dst_row[x_start..x_end];

            for i in 0..(x_end - x_start) {
                let sum = (row11[i] + row00[i]) - (row01[i] + row10[i]);
                // Fixed-point division: (sum * inv_area) >> 31
                let mean = ((sum * u64::from(inv_area_fixed)) >> 31) as i16;
                let threshold = (mean - c).max(0) as u8;
                interior_dst[i] = if interior_src[i] < threshold { 0 } else { 255 };
            }
        } else if x_end > x_start {
            // Interior X but border Y
            for x in x_start..x_end {
                let x0 = x - radius;
                let x1 = x + radius + 1;
                let actual_area = (x1 - x0) * (y1 - y0);

                let i00 = integral[y0 * stride + x0];
                let i01 = integral[y0 * stride + x1];
                let i10 = integral[y1 * stride + x0];
                let i11 = integral[y1 * stride + x1];

                let sum = (i11 + i00) - (i01 + i10);
                let mean = (sum / actual_area as u64) as i16;
                let threshold = (mean - c).max(0) as u8;
                dst_row[x] = if src_row[x] < threshold { 0 } else { 255 };
            }
        }

        // 3. Process Right Border
        for x in x_end.max(x_start)..w {
            let x0 = x.saturating_sub(radius);
            let x1 = w; // (x + radius + 1).min(w)
            let actual_area = (x1 - x0) * (y1 - y0);

            let i00 = integral[y0 * stride + x0];
            let i01 = integral[y0 * stride + x1];
            let i10 = integral[y1 * stride + x0];
            let i11 = integral[y1 * stride + x1];

            let sum = (i11 + i00) - (i01 + i10);
            let mean = (sum / actual_area as u64) as i16;
            let threshold = (mean - c).max(0) as u8;
            dst_row[x] = if src_row[x] < threshold { 0 } else { 255 };
        }
    });
}

/// Fast adaptive threshold combining integral image approach with SIMD.
///
/// This is the main entry point for performance-oriented adaptive thresholding:
/// - Computes integral image once
/// - Applies per-pixel adaptive threshold with local mean
/// - Uses default parameters tuned for AprilTag detection
#[allow(dead_code)]
pub(crate) fn apply_adaptive_threshold_fast(img: &ImageView, output: &mut [u8]) {
    // OpenCV uses blockSize=13 (radius=6) and C=3 as good defaults
    apply_adaptive_threshold_with_params(img, output, 6, 3);
}

/// Adaptive threshold with custom parameters.
#[allow(dead_code)]
pub(crate) fn apply_adaptive_threshold_with_params(
    img: &ImageView,
    output: &mut [u8],
    radius: usize,
    c: i16,
) {
    let mut integral = vec![0u64; (img.width + 1) * (img.height + 1)];
    compute_integral_image(img, &mut integral);
    adaptive_threshold_integral(img, &integral, output, radius, c);
}

/// Apply per-pixel adaptive threshold with gradient-based window sizing.
///
/// Highly optimized using Parallel processing, precomputed LUTs, and branchless logic.
#[allow(clippy::too_many_arguments)]
#[multiversion(targets(
    "x86_64+avx2+bmi1+bmi2+popcnt+lzcnt",
    "x86_64+avx512f+avx512bw+avx512dq+avx512vl",
    "aarch64+neon"
))]
#[tracing::instrument(skip_all, name = "pipeline::threshold_gradient_window")]
pub fn adaptive_threshold_gradient_window(
    img: &ImageView,
    gradient_map: &[u8],
    integral: &[u64],
    output: &mut [u8],
    min_radius: usize,
    max_radius: usize,
    gradient_threshold: u8,
    c: i16,
) {
    let w = img.width;
    let h = img.height;
    let stride = w + 1;

    // Precompute radius and area reciprocal LUTs (fixed-point 1.31)
    let mut radius_lut = [0usize; 256];
    let mut inv_area_lut = [0u32; 256];
    let grad_thresh_f32 = f32::from(gradient_threshold);

    for g in 0..256 {
        let r = if g as u8 >= gradient_threshold {
            min_radius
        } else {
            let t = g as f32 / grad_thresh_f32;
            let r = max_radius as f32 * (1.0 - t) + min_radius as f32 * t;
            r as usize
        };
        radius_lut[g] = r;
        let side = (2 * r + 1) as u32;
        let area = side * side;
        inv_area_lut[g] = ((1u64 << 31) / u64::from(area)) as u32;
    }

    // use rayon::prelude::*;

    (0..h).into_par_iter().for_each(|y| {
        let y_offset = y * w;
        let src_row = img.get_row(y);

        // SAFETY: `(0..h).into_par_iter()` yields each `y` exactly once
        // across rayon workers, so the `[y * w, y * w + w)` slice is
        // disjoint from every other worker's slice. `output` length is
        // `h * w`, so the slice is in-bounds. The outer `par_iter` borrows
        // `output` mutably for its lifetime, so no concurrent reader exists.
        let dst_row = unsafe {
            let ptr = output.as_ptr().cast_mut();
            std::slice::from_raw_parts_mut(ptr.add(y_offset), w)
        };

        for x in 0..w {
            let grad = gradient_map[y_offset + x];
            let radius = radius_lut[grad as usize];

            let y0 = y.saturating_sub(radius);
            let y1 = (y + radius + 1).min(h);
            let x0 = x.saturating_sub(radius);
            let x1 = (x + radius + 1).min(w);

            let i00 = integral[y0 * stride + x0];
            let i01 = integral[y0 * stride + x1];
            let i10 = integral[y1 * stride + x0];
            let i11 = integral[y1 * stride + x1];

            let sum = (i11 + i00) - (i01 + i10);

            // Fixed-point mean computation
            let mean = if x >= radius && x + radius < w && y >= radius && y + radius < h {
                ((sum * u64::from(inv_area_lut[grad as usize])) >> 31) as i16
            } else {
                let actual_area = (x1 - x0) * (y1 - y0);
                (sum / actual_area as u64) as i16
            };

            let threshold = (mean - c).max(0) as u8;
            dst_row[x] = if src_row[x] < threshold { 0 } else { 255 };
        }
    });
}

/// Compute a map of local mean values.
///
/// Optimized with parallelism and vectorization.
#[multiversion(targets(
    "x86_64+avx2+bmi1+bmi2+popcnt+lzcnt",
    "x86_64+avx512f+avx512bw+avx512dq+avx512vl",
    "aarch64+neon"
))]
#[allow(clippy::cast_sign_loss, clippy::needless_range_loop)]
pub(crate) fn compute_threshold_map(
    img: &ImageView,
    integral: &[u64],
    output: &mut [u8],
    radius: usize,
    c: i16,
) {
    let w = img.width;
    let h = img.height;
    let stride = w + 1;

    // use rayon::prelude::*; // Already imported at module level

    // Precompute interior area inverse
    let side = (2 * radius + 1) as u32;
    let area = side * side;
    let inv_area_fixed = ((1u64 << 31) / u64::from(area)) as u32;

    (0..h).into_par_iter().for_each(|y| {
        let y_offset = y * w;

        // SAFETY: `(0..h).into_par_iter()` yields each `y` exactly once
        // across rayon workers, so the `[y * w, y * w + w)` slice is
        // disjoint from every other worker's slice. `output` length is
        // `h * w`, so the slice is in-bounds. The outer `par_iter` borrows
        // `output` mutably for its lifetime, so no concurrent reader exists.
        let dst_row = unsafe {
            let ptr = output.as_ptr().cast_mut();
            std::slice::from_raw_parts_mut(ptr.add(y_offset), w)
        };

        let y0 = y.saturating_sub(radius);
        let y1 = (y + radius + 1).min(h);

        let x_start = radius;
        let x_end = w.saturating_sub(radius + 1);

        // 1. Process Left Border
        for x in 0..x_start.min(w) {
            let x0 = 0;
            let x1 = (x + radius + 1).min(w);
            let actual_area = (x1 - x0) * (y1 - y0);
            let sum = (integral[y1 * stride + x1] + integral[y0 * stride + x0])
                - (integral[y0 * stride + x1] + integral[y1 * stride + x0]);
            let mean = (sum / actual_area as u64) as i16;
            dst_row[x] = (mean - c).clamp(0, 255) as u8;
        }

        // 2. Process Interior (Vectorizable)
        if x_end > x_start && y >= radius && y + radius < h {
            let row00 = &integral[y0 * stride + (x_start - radius)..];
            let row01 = &integral[y0 * stride + (x_start + radius + 1)..];
            let row10 = &integral[y1 * stride + (x_start - radius)..];
            let row11 = &integral[y1 * stride + (x_start + radius + 1)..];

            let interior_dst = &mut dst_row[x_start..x_end];

            for i in 0..(x_end - x_start) {
                let sum = (row11[i] + row00[i]) - (row01[i] + row10[i]);
                let mean = ((sum * u64::from(inv_area_fixed)) >> 31) as i16;
                interior_dst[i] = (mean - c).clamp(0, 255) as u8;
            }
        }

        // 3. Process Right Border
        for x in x_end.max(x_start)..w {
            let x0 = x.saturating_sub(radius);
            let x1 = w;
            let actual_area = (x1 - x0) * (y1 - y0);
            let sum = (integral[y1 * stride + x1] + integral[y0 * stride + x0])
                - (integral[y0 * stride + x1] + integral[y1 * stride + x0]);
            let mean = (sum / actual_area as u64) as i16;
            dst_row[x] = (mean - c).clamp(0, 255) as u8;
        }
    });
}