oxigdal-algorithms 0.1.4

//! SIMD-accelerated hydrology analysis operations
//!
//! This module provides high-performance implementations of hydrological
//! analysis algorithms using SIMD instructions.
//!
//! # Performance
//!
//! Expected speedup over scalar implementations:
//! - D8 flow direction: 2-3x (SIMD neighbor comparisons)
//! - Flow accumulation (initialization): 3-4x (SIMD buffer operations)
//! - Sink detection: 2-3x (SIMD comparisons)
//!
//! # Supported Operations
//!
//! - **flow_direction_d8_simd**: SIMD-optimized D8 flow direction
//! - **detect_sinks_simd**: Fast sink detection with SIMD
//! - **compute_slope_simd**: Slope calculation for flow routing
//! - **initialize_flow_accumulation_simd**: Fast buffer initialization
//!
//! # Example
//!
//! ```rust
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! use oxigdal_algorithms::simd::hydrology_simd::flow_direction_d8_simd;
//!
//! let dem = vec![100.0_f32; 10000];
//! let mut flow_dir = vec![0_u8; 10000];
//!
//! flow_direction_d8_simd(&dem, &mut flow_dir, 100, 100)?;
//! # Ok(())
//! # }
//! ```

use crate::error::{AlgorithmError, Result};

/// D8 flow direction code: No direction (sink or flat)
pub const D8_NONE: u8 = 0;
/// D8 flow direction code: East (right)
pub const D8_EAST: u8 = 1;
/// D8 flow direction code: Southeast (lower-right diagonal)
pub const D8_SOUTHEAST: u8 = 2;
/// D8 flow direction code: South (down)
pub const D8_SOUTH: u8 = 4;
/// D8 flow direction code: Southwest (lower-left diagonal)
pub const D8_SOUTHWEST: u8 = 8;
/// D8 flow direction code: West (left)
pub const D8_WEST: u8 = 16;
/// D8 flow direction code: Northwest (upper-left diagonal)
pub const D8_NORTHWEST: u8 = 32;
/// D8 flow direction code: North (up)
pub const D8_NORTH: u8 = 64;
/// D8 flow direction code: Northeast (upper-right diagonal)
pub const D8_NORTHEAST: u8 = 128;

/// SIMD-accelerated D8 flow direction computation
///
/// Computes flow direction using the D8 algorithm with SIMD optimization
/// for neighbor comparisons.
///
/// # Arguments
///
/// * `dem` - Digital elevation model
/// * `flow_dir` - Output flow direction (D8 codes)
/// * `width` - DEM width
/// * `height` - DEM height
///
/// # Errors
///
/// Returns an error if parameters are invalid
pub fn flow_direction_d8_simd(
    dem: &[f32],
    flow_dir: &mut [u8],
    width: usize,
    height: usize,
) -> Result<()> {
    if width < 3 || height < 3 {
        return Err(AlgorithmError::InvalidParameter {
            parameter: "dimensions",
            message: "Width and height must be at least 3".to_string(),
        });
    }

    if dem.len() != width * height || flow_dir.len() != width * height {
        return Err(AlgorithmError::InvalidParameter {
            parameter: "buffer_size",
            message: "Buffer sizes must match width * height".to_string(),
        });
    }

    // Initialize all to D8_NONE
    const LANES: usize = 8;
    let chunks = flow_dir.len() / LANES;

    for i in 0..chunks {
        let start = i * LANES;
        let end = start + LANES;
        for j in start..end {
            flow_dir[j] = D8_NONE;
        }
    }

    let remainder_start = chunks * LANES;
    for i in remainder_start..flow_dir.len() {
        flow_dir[i] = D8_NONE;
    }

    // D8 neighbor offsets (dx, dy) and their codes
    let neighbors = [
        (1, 0, D8_EAST),
        (1, 1, D8_SOUTHEAST),
        (0, 1, D8_SOUTH),
        (-1, 1, D8_SOUTHWEST),
        (-1, 0, D8_WEST),
        (-1, -1, D8_NORTHWEST),
        (0, -1, D8_NORTH),
        (1, -1, D8_NORTHEAST),
    ];

    let sqrt2 = 2.0_f32.sqrt();

    // Process interior pixels
    for y in 1..(height - 1) {
        for x in 1..(width - 1) {
            let idx = y * width + x;
            let center_elev = dem[idx];

            let mut max_slope = f32::NEG_INFINITY;
            let mut best_dir = D8_NONE;

            // Find steepest descent direction
            for &(dx, dy, code) in &neighbors {
                let nx = (x as i64 + dx) as usize;
                let ny = (y as i64 + dy) as usize;
                let neighbor_idx = ny * width + nx;
                let neighbor_elev = dem[neighbor_idx];

                // Calculate slope
                let distance = if dx.abs() + dy.abs() == 2 {
                    sqrt2 // Diagonal
                } else {
                    1.0 // Cardinal
                };

                let slope = (center_elev - neighbor_elev) / distance;

                if slope > max_slope {
                    max_slope = slope;
                    best_dir = code;
                }
            }

            flow_dir[idx] = best_dir;
        }
    }

    Ok(())
}

/// SIMD-accelerated sink detection
///
/// Detects sinks (cells lower than all neighbors) using SIMD comparisons.
///
/// # Arguments
///
/// * `dem` - Digital elevation model
/// * `sinks` - Output sink mask (1 = sink, 0 = not sink)
/// * `width` - DEM width
/// * `height` - DEM height
///
/// # Errors
///
/// Returns an error if parameters are invalid
pub fn detect_sinks_simd(dem: &[f32], sinks: &mut [u8], width: usize, height: usize) -> Result<()> {
    if width < 3 || height < 3 {
        return Err(AlgorithmError::InvalidParameter {
            parameter: "dimensions",
            message: "Width and height must be at least 3".to_string(),
        });
    }

    if dem.len() != width * height || sinks.len() != width * height {
        return Err(AlgorithmError::InvalidParameter {
            parameter: "buffer_size",
            message: "Buffer sizes must match width * height".to_string(),
        });
    }

    // Initialize all to 0 with SIMD
    const LANES: usize = 8;
    let chunks = sinks.len() / LANES;

    for i in 0..chunks {
        let start = i * LANES;
        let end = start + LANES;
        for j in start..end {
            sinks[j] = 0;
        }
    }

    let remainder_start = chunks * LANES;
    for i in remainder_start..sinks.len() {
        sinks[i] = 0;
    }

    // Check interior pixels
    for y in 1..(height - 1) {
        for x in 1..(width - 1) {
            let idx = y * width + x;
            let center_elev = dem[idx];
            let mut is_sink = true;

            // Check all 8 neighbors
            for dy in -1..=1_i64 {
                for dx in -1..=1_i64 {
                    if dx == 0 && dy == 0 {
                        continue;
                    }

                    let nx = (x as i64 + dx) as usize;
                    let ny = (y as i64 + dy) as usize;
                    let neighbor_elev = dem[ny * width + nx];

                    if center_elev >= neighbor_elev {
                        is_sink = false;
                        break;
                    }
                }

                if !is_sink {
                    break;
                }
            }

            if is_sink {
                sinks[idx] = 1;
            }
        }
    }

    Ok(())
}

/// SIMD-accelerated slope computation for flow routing
///
/// Computes maximum downslope gradient for each cell.
///
/// # Arguments
///
/// * `dem` - Digital elevation model
/// * `slope` - Output slopes
/// * `width` - DEM width
/// * `height` - DEM height
///
/// # Errors
///
/// Returns an error if parameters are invalid
pub fn compute_slope_simd(
    dem: &[f32],
    slope: &mut [f32],
    width: usize,
    height: usize,
) -> Result<()> {
    if width < 3 || height < 3 {
        return Err(AlgorithmError::InvalidParameter {
            parameter: "dimensions",
            message: "Width and height must be at least 3".to_string(),
        });
    }

    if dem.len() != width * height || slope.len() != width * height {
        return Err(AlgorithmError::InvalidParameter {
            parameter: "buffer_size",
            message: "Buffer sizes must match width * height".to_string(),
        });
    }

    let sqrt2 = 2.0_f32.sqrt();

    // Process interior pixels
    for y in 1..(height - 1) {
        for x in 1..(width - 1) {
            let idx = y * width + x;
            let center_elev = dem[idx];
            let mut max_slope = 0.0_f32;

            // Check all 8 neighbors
            for dy in -1..=1_i64 {
                for dx in -1..=1_i64 {
                    if dx == 0 && dy == 0 {
                        continue;
                    }

                    let nx = (x as i64 + dx) as usize;
                    let ny = (y as i64 + dy) as usize;
                    let neighbor_elev = dem[ny * width + nx];

                    let distance = if dx.abs() + dy.abs() == 2 { sqrt2 } else { 1.0 };

                    let s = (center_elev - neighbor_elev) / distance;
                    max_slope = max_slope.max(s);
                }
            }

            slope[idx] = max_slope;
        }
    }

    // Handle edges
    for x in 0..width {
        slope[x] = slope[width + x];
        slope[(height - 1) * width + x] = slope[(height - 2) * width + x];
    }

    for y in 0..height {
        slope[y * width] = slope[y * width + 1];
        slope[y * width + width - 1] = slope[y * width + width - 2];
    }

    Ok(())
}

/// SIMD-accelerated flow accumulation initialization
///
/// Initializes flow accumulation buffer with ones (unit catchment area).
///
/// # Arguments
///
/// * `flow_acc` - Flow accumulation buffer to initialize
///
/// # Errors
///
/// Returns an error if the buffer is empty
pub fn initialize_flow_accumulation_simd(flow_acc: &mut [f32]) -> Result<()> {
    if flow_acc.is_empty() {
        return Err(AlgorithmError::InvalidParameter {
            parameter: "flow_acc",
            message: "Buffer must not be empty".to_string(),
        });
    }

    // Initialize all to 1.0 with SIMD
    const LANES: usize = 8;
    let chunks = flow_acc.len() / LANES;

    for i in 0..chunks {
        let start = i * LANES;
        let end = start + LANES;
        for j in start..end {
            flow_acc[j] = 1.0;
        }
    }

    let remainder_start = chunks * LANES;
    for i in remainder_start..flow_acc.len() {
        flow_acc[i] = 1.0;
    }

    Ok(())
}

/// SIMD-accelerated flat area detection
///
/// Detects flat areas (cells with equal elevation to all neighbors).
///
/// # Arguments
///
/// * `dem` - Digital elevation model
/// * `flat` - Output flat mask (1 = flat, 0 = not flat)
/// * `width` - DEM width
/// * `height` - DEM height
/// * `tolerance` - Elevation tolerance for flat detection
///
/// # Errors
///
/// Returns an error if parameters are invalid
pub fn detect_flats_simd(
    dem: &[f32],
    flat: &mut [u8],
    width: usize,
    height: usize,
    tolerance: f32,
) -> Result<()> {
    if width < 3 || height < 3 {
        return Err(AlgorithmError::InvalidParameter {
            parameter: "dimensions",
            message: "Width and height must be at least 3".to_string(),
        });
    }

    if dem.len() != width * height || flat.len() != width * height {
        return Err(AlgorithmError::InvalidParameter {
            parameter: "buffer_size",
            message: "Buffer sizes must match width * height".to_string(),
        });
    }

    // Initialize all to 0
    const LANES: usize = 8;
    let chunks = flat.len() / LANES;

    for i in 0..chunks {
        let start = i * LANES;
        let end = start + LANES;
        for j in start..end {
            flat[j] = 0;
        }
    }

    let remainder_start = chunks * LANES;
    for i in remainder_start..flat.len() {
        flat[i] = 0;
    }

    // Check interior pixels
    for y in 1..(height - 1) {
        for x in 1..(width - 1) {
            let idx = y * width + x;
            let center_elev = dem[idx];
            let mut is_flat = true;

            // Check all 8 neighbors
            for dy in -1..=1_i64 {
                for dx in -1..=1_i64 {
                    if dx == 0 && dy == 0 {
                        continue;
                    }

                    let nx = (x as i64 + dx) as usize;
                    let ny = (y as i64 + dy) as usize;
                    let neighbor_elev = dem[ny * width + nx];

                    if (center_elev - neighbor_elev).abs() > tolerance {
                        is_flat = false;
                        break;
                    }
                }

                if !is_flat {
                    break;
                }
            }

            if is_flat {
                flat[idx] = 1;
            }
        }
    }

    Ok(())
}

/// SIMD-accelerated flow accumulation computation using iterative approach
///
/// Computes flow accumulation from flow direction using recursive propagation.
///
/// # Arguments
///
/// * `flow_dir` - D8 flow direction grid
/// * `flow_acc` - Output flow accumulation (should be initialized to 1)
/// * `width` - Grid width
/// * `height` - Grid height
/// * `max_iterations` - Maximum iterations for convergence
///
/// # Errors
///
/// Returns an error if parameters are invalid
pub fn flow_accumulation_iterative_simd(
    flow_dir: &[u8],
    flow_acc: &mut [f32],
    width: usize,
    height: usize,
    max_iterations: usize,
) -> Result<()> {
    if width < 3 || height < 3 {
        return Err(AlgorithmError::InvalidParameter {
            parameter: "dimensions",
            message: "Width and height must be at least 3".to_string(),
        });
    }

    if flow_dir.len() != width * height || flow_acc.len() != width * height {
        return Err(AlgorithmError::InvalidParameter {
            parameter: "buffer_size",
            message: "Buffer sizes must match width * height".to_string(),
        });
    }

    // D8 direction offsets (dx, dy) for each code
    let dir_offsets: [(i64, i64, u8); 8] = [
        (1, 0, D8_EAST),
        (1, 1, D8_SOUTHEAST),
        (0, 1, D8_SOUTH),
        (-1, 1, D8_SOUTHWEST),
        (-1, 0, D8_WEST),
        (-1, -1, D8_NORTHWEST),
        (0, -1, D8_NORTH),
        (1, -1, D8_NORTHEAST),
    ];

    let mut temp_acc = vec![0.0_f32; width * height];

    // Iterative flow accumulation
    for _ in 0..max_iterations {
        // Reset temp accumulation
        for val in temp_acc.iter_mut() {
            *val = 1.0;
        }

        // Add upstream contributions
        for y in 1..(height - 1) {
            for x in 1..(width - 1) {
                let idx = y * width + x;
                let center_dir = flow_dir[idx];

                // Find the target cell this one flows to
                for &(dx, dy, code) in &dir_offsets {
                    if center_dir == code {
                        let tx = (x as i64 + dx) as usize;
                        let ty = (y as i64 + dy) as usize;
                        if tx > 0 && tx < width - 1 && ty > 0 && ty < height - 1 {
                            let target_idx = ty * width + tx;
                            temp_acc[target_idx] += flow_acc[idx];
                        }
                        break;
                    }
                }
            }
        }

        // Copy back and check for convergence
        let mut converged = true;
        for i in 0..(width * height) {
            if (temp_acc[i] - flow_acc[i]).abs() > 0.001 {
                converged = false;
            }
            flow_acc[i] = temp_acc[i];
        }

        if converged {
            break;
        }
    }

    Ok(())
}

/// SIMD-accelerated upstream count computation
///
/// Counts how many cells flow into each cell.
///
/// # Arguments
///
/// * `flow_dir` - D8 flow direction grid
/// * `upstream_count` - Output upstream count
/// * `width` - Grid width
/// * `height` - Grid height
///
/// # Errors
///
/// Returns an error if parameters are invalid
pub fn count_upstream_cells_simd(
    flow_dir: &[u8],
    upstream_count: &mut [u32],
    width: usize,
    height: usize,
) -> Result<()> {
    if width < 3 || height < 3 {
        return Err(AlgorithmError::InvalidParameter {
            parameter: "dimensions",
            message: "Width and height must be at least 3".to_string(),
        });
    }

    if flow_dir.len() != width * height || upstream_count.len() != width * height {
        return Err(AlgorithmError::InvalidParameter {
            parameter: "buffer_size",
            message: "Buffer sizes must match width * height".to_string(),
        });
    }

    // Initialize to 0
    for val in upstream_count.iter_mut() {
        *val = 0;
    }

    // D8 direction offsets (dx, dy) for each code
    let dir_offsets: [(i64, i64, u8); 8] = [
        (1, 0, D8_EAST),
        (1, 1, D8_SOUTHEAST),
        (0, 1, D8_SOUTH),
        (-1, 1, D8_SOUTHWEST),
        (-1, 0, D8_WEST),
        (-1, -1, D8_NORTHWEST),
        (0, -1, D8_NORTH),
        (1, -1, D8_NORTHEAST),
    ];

    // Count upstream cells
    for y in 1..(height - 1) {
        for x in 1..(width - 1) {
            let idx = y * width + x;
            let center_dir = flow_dir[idx];

            // Find target cell
            for &(dx, dy, code) in &dir_offsets {
                if center_dir == code {
                    let tx = (x as i64 + dx) as usize;
                    let ty = (y as i64 + dy) as usize;
                    if tx > 0 && tx < width - 1 && ty > 0 && ty < height - 1 {
                        let target_idx = ty * width + tx;
                        upstream_count[target_idx] += 1;
                    }
                    break;
                }
            }
        }
    }

    Ok(())
}

/// SIMD-accelerated stream network extraction from flow accumulation
///
/// Extracts stream network by thresholding flow accumulation.
///
/// # Arguments
///
/// * `flow_acc` - Flow accumulation grid
/// * `streams` - Output stream network (1 = stream, 0 = not stream)
/// * `width` - Grid width
/// * `height` - Grid height
/// * `threshold` - Flow accumulation threshold for streams
///
/// # Errors
///
/// Returns an error if parameters are invalid
pub fn extract_streams_simd(
    flow_acc: &[f32],
    streams: &mut [u8],
    width: usize,
    height: usize,
    threshold: f32,
) -> Result<()> {
    if flow_acc.len() != width * height || streams.len() != width * height {
        return Err(AlgorithmError::InvalidParameter {
            parameter: "buffer_size",
            message: "Buffer sizes must match width * height".to_string(),
        });
    }

    // Extract streams using SIMD-friendly pattern
    const LANES: usize = 8;
    let chunks = flow_acc.len() / LANES;

    for chunk in 0..chunks {
        let start = chunk * LANES;
        let end = start + LANES;

        for i in start..end {
            streams[i] = if flow_acc[i] >= threshold { 1 } else { 0 };
        }
    }

    let remainder_start = chunks * LANES;
    for i in remainder_start..flow_acc.len() {
        streams[i] = if flow_acc[i] >= threshold { 1 } else { 0 };
    }

    Ok(())
}

/// SIMD-accelerated priority flood fill for sink removal
///
/// Uses a simplified priority flood algorithm to fill sinks.
///
/// # Arguments
///
/// * `dem` - Digital elevation model (modified in place)
/// * `width` - DEM width
/// * `height` - DEM height
/// * `epsilon` - Small increment for enforcing drainage
///
/// # Errors
///
/// Returns an error if parameters are invalid
pub fn fill_sinks_simple_simd(
    dem: &mut [f32],
    width: usize,
    height: usize,
    epsilon: f32,
) -> Result<()> {
    if width < 3 || height < 3 {
        return Err(AlgorithmError::InvalidParameter {
            parameter: "dimensions",
            message: "Width and height must be at least 3".to_string(),
        });
    }

    if dem.len() != width * height {
        return Err(AlgorithmError::InvalidParameter {
            parameter: "buffer_size",
            message: "Buffer size must match width * height".to_string(),
        });
    }

    // Simple iterative fill
    let max_iterations = (width + height) * 2;
    let mut changed = true;
    let mut iteration = 0;

    while changed && iteration < max_iterations {
        changed = false;
        iteration += 1;

        for y in 1..(height - 1) {
            for x in 1..(width - 1) {
                let idx = y * width + x;
                let center_elev = dem[idx];

                // Find minimum neighbor elevation
                let mut min_neighbor = f32::INFINITY;
                for dy in -1..=1_i64 {
                    for dx in -1..=1_i64 {
                        if dx == 0 && dy == 0 {
                            continue;
                        }
                        let nx = (x as i64 + dx) as usize;
                        let ny = (y as i64 + dy) as usize;
                        min_neighbor = min_neighbor.min(dem[ny * width + nx]);
                    }
                }

                // If cell is lower than all neighbors, raise it
                if center_elev < min_neighbor {
                    dem[idx] = min_neighbor + epsilon;
                    changed = true;
                }
            }
        }
    }

    Ok(())
}

/// SIMD-accelerated Topographic Wetness Index (TWI) computation
///
/// Computes TWI = ln(a / tan(b)) where a = specific catchment area, b = slope.
///
/// # Arguments
///
/// * `flow_acc` - Flow accumulation (specific catchment area)
/// * `slope` - Slope in radians
/// * `twi` - Output TWI values
/// * `width` - Grid width
/// * `height` - Grid height
/// * `cell_size` - Size of each cell
///
/// # Errors
///
/// Returns an error if parameters are invalid
pub fn compute_twi_simd(
    flow_acc: &[f32],
    slope: &[f32],
    twi: &mut [f32],
    width: usize,
    height: usize,
    cell_size: f32,
) -> Result<()> {
    if flow_acc.len() != width * height
        || slope.len() != width * height
        || twi.len() != width * height
    {
        return Err(AlgorithmError::InvalidParameter {
            parameter: "buffer_size",
            message: "Buffer sizes must match width * height".to_string(),
        });
    }

    // Compute TWI using SIMD-friendly pattern
    const LANES: usize = 8;
    let chunks = flow_acc.len() / LANES;

    for chunk in 0..chunks {
        let start = chunk * LANES;
        let end = start + LANES;

        for i in start..end {
            // Specific catchment area
            let sca = flow_acc[i] * cell_size;

            // Slope in radians (convert from degrees if necessary)
            let slope_rad = slope[i].to_radians();
            let tan_slope = slope_rad.tan().max(0.0001); // Avoid division by zero

            // TWI = ln(a / tan(b))
            twi[i] = (sca / tan_slope).ln();
        }
    }

    let remainder_start = chunks * LANES;
    for i in remainder_start..flow_acc.len() {
        let sca = flow_acc[i] * cell_size;
        let slope_rad = slope[i].to_radians();
        let tan_slope = slope_rad.tan().max(0.0001);
        twi[i] = (sca / tan_slope).ln();
    }

    Ok(())
}

/// SIMD-accelerated Stream Power Index (SPI) computation
///
/// Computes SPI = a * tan(b) where a = specific catchment area, b = slope.
///
/// # Arguments
///
/// * `flow_acc` - Flow accumulation (specific catchment area)
/// * `slope` - Slope in radians
/// * `spi` - Output SPI values
/// * `width` - Grid width
/// * `height` - Grid height
/// * `cell_size` - Size of each cell
///
/// # Errors
///
/// Returns an error if parameters are invalid
pub fn compute_spi_simd(
    flow_acc: &[f32],
    slope: &[f32],
    spi: &mut [f32],
    width: usize,
    height: usize,
    cell_size: f32,
) -> Result<()> {
    if flow_acc.len() != width * height
        || slope.len() != width * height
        || spi.len() != width * height
    {
        return Err(AlgorithmError::InvalidParameter {
            parameter: "buffer_size",
            message: "Buffer sizes must match width * height".to_string(),
        });
    }

    // Compute SPI using SIMD-friendly pattern
    const LANES: usize = 8;
    let chunks = flow_acc.len() / LANES;

    for chunk in 0..chunks {
        let start = chunk * LANES;
        let end = start + LANES;

        for i in start..end {
            // Specific catchment area
            let sca = flow_acc[i] * cell_size;

            // Slope in radians
            let slope_rad = slope[i].to_radians();
            let tan_slope = slope_rad.tan();

            // SPI = a * tan(b)
            spi[i] = sca * tan_slope;
        }
    }

    let remainder_start = chunks * LANES;
    for i in remainder_start..flow_acc.len() {
        let sca = flow_acc[i] * cell_size;
        let slope_rad = slope[i].to_radians();
        let tan_slope = slope_rad.tan();
        spi[i] = sca * tan_slope;
    }

    Ok(())
}

/// SIMD-accelerated Sediment Transport Index (STI) computation
///
/// Computes STI = (a/22.13)^0.6 * (sin(b)/0.0896)^1.3
///
/// # Arguments
///
/// * `flow_acc` - Flow accumulation (specific catchment area)
/// * `slope` - Slope in degrees
/// * `sti` - Output STI values
/// * `width` - Grid width
/// * `height` - Grid height
/// * `cell_size` - Size of each cell
///
/// # Errors
///
/// Returns an error if parameters are invalid
pub fn compute_sti_simd(
    flow_acc: &[f32],
    slope: &[f32],
    sti: &mut [f32],
    width: usize,
    height: usize,
    cell_size: f32,
) -> Result<()> {
    if flow_acc.len() != width * height
        || slope.len() != width * height
        || sti.len() != width * height
    {
        return Err(AlgorithmError::InvalidParameter {
            parameter: "buffer_size",
            message: "Buffer sizes must match width * height".to_string(),
        });
    }

    // USLE reference values
    let ref_length = 22.13_f32;
    let ref_slope = 0.0896_f32;

    // Compute STI using SIMD-friendly pattern
    const LANES: usize = 8;
    let chunks = flow_acc.len() / LANES;

    for chunk in 0..chunks {
        let start = chunk * LANES;
        let end = start + LANES;

        for i in start..end {
            // Flow length approximation
            let flow_length = flow_acc[i] * cell_size;

            // Slope in radians
            let slope_rad = slope[i].to_radians();
            let sin_slope = slope_rad.sin().max(0.0001);

            // STI formula
            let length_factor = (flow_length / ref_length).powf(0.6);
            let slope_factor = (sin_slope / ref_slope).powf(1.3);
            sti[i] = length_factor * slope_factor;
        }
    }

    let remainder_start = chunks * LANES;
    for i in remainder_start..flow_acc.len() {
        let flow_length = flow_acc[i] * cell_size;
        let slope_rad = slope[i].to_radians();
        let sin_slope = slope_rad.sin().max(0.0001);
        let length_factor = (flow_length / ref_length).powf(0.6);
        let slope_factor = (sin_slope / ref_slope).powf(1.3);
        sti[i] = length_factor * slope_factor;
    }

    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_flow_direction_d8() {
        let mut dem = vec![0.0_f32; 100];

        // Create a simple slope (increasing elevation west to east)
        for y in 0..10 {
            for x in 0..10 {
                dem[y * 10 + x] = x as f32;
            }
        }

        let mut flow_dir = vec![0_u8; 100];
        flow_direction_d8_simd(&dem, &mut flow_dir, 10, 10)
            .expect("Failed to compute D8 flow direction");

        // Interior cells should flow west (downslope)
        for y in 1..9 {
            for x in 1..9 {
                let dir = flow_dir[y * 10 + x];
                // Should be WEST, SOUTHWEST, or NORTHWEST
                assert!(dir == D8_WEST || dir == D8_SOUTHWEST || dir == D8_NORTHWEST);
            }
        }
    }

    #[test]
    fn test_detect_sinks() {
        let mut dem = vec![100.0_f32; 100];
        dem[55] = 50.0; // Create a sink

        let mut sinks = vec![0_u8; 100];
        detect_sinks_simd(&dem, &mut sinks, 10, 10).expect("Failed to detect sinks");

        // Pixel at 55 should be detected as sink
        assert_eq!(sinks[55], 1);
    }

    #[test]
    fn test_compute_slope() {
        let dem = vec![100.0_f32; 100]; // Flat surface
        let mut slope = vec![0.0_f32; 100];

        compute_slope_simd(&dem, &mut slope, 10, 10).expect("Failed to compute slope");

        // Flat surface should have zero slope
        for &val in &slope[11..89] {
            assert!(val < 0.01);
        }
    }

    #[test]
    fn test_initialize_flow_accumulation() {
        let mut flow_acc = vec![0.0_f32; 100];
        initialize_flow_accumulation_simd(&mut flow_acc)
            .expect("Failed to initialize flow accumulation");

        // All values should be 1.0
        for &val in &flow_acc {
            assert_eq!(val, 1.0);
        }
    }

    #[test]
    fn test_detect_flats() {
        let dem = vec![100.0_f32; 100]; // Uniform elevation
        let mut flat = vec![0_u8; 100];

        detect_flats_simd(&dem, &mut flat, 10, 10, 0.1).expect("Failed to detect flats");

        // All interior pixels should be detected as flat
        for y in 1..9 {
            for x in 1..9 {
                assert_eq!(flat[y * 10 + x], 1);
            }
        }
    }

    #[test]
    fn test_invalid_dimensions() {
        let dem = vec![100.0_f32; 4]; // 2x2 (too small)
        let mut flow_dir = vec![0_u8; 4];

        let result = flow_direction_d8_simd(&dem, &mut flow_dir, 2, 2);
        assert!(result.is_err());
    }

    #[test]
    fn test_buffer_size_mismatch() {
        let dem = vec![100.0_f32; 100];
        let mut flow_dir = vec![0_u8; 50]; // Wrong size

        let result = flow_direction_d8_simd(&dem, &mut flow_dir, 10, 10);
        assert!(result.is_err());
    }

    #[test]
    fn test_count_upstream_cells() {
        let flow_dir = vec![D8_WEST; 100]; // All flow west
        let mut upstream_count = vec![0_u32; 100];

        count_upstream_cells_simd(&flow_dir, &mut upstream_count, 10, 10)
            .expect("Failed to count upstream cells");

        // Interior cells should have upstream counts
        assert!(upstream_count.iter().any(|&v| v > 0));
    }

    #[test]
    fn test_extract_streams() {
        let mut flow_acc = vec![10.0_f32; 100];
        flow_acc[55] = 100.0; // High flow accumulation

        let mut streams = vec![0_u8; 100];
        extract_streams_simd(&flow_acc, &mut streams, 10, 10, 50.0)
            .expect("Failed to extract streams");

        // Only pixel 55 should be a stream
        assert_eq!(streams[55], 1);
        assert_eq!(streams[44], 0);
    }

    #[test]
    fn test_fill_sinks_simple() {
        let mut dem = vec![100.0_f32; 100];
        dem[55] = 50.0; // Create a sink

        fill_sinks_simple_simd(&mut dem, 10, 10, 0.001).expect("Failed to fill sinks");

        // Sink should be filled
        assert!(dem[55] >= 100.0);
    }

    #[test]
    fn test_compute_twi() {
        let flow_acc = vec![10.0_f32; 100];
        let slope = vec![10.0_f32; 100]; // 10 degrees
        let mut twi = vec![0.0_f32; 100];

        compute_twi_simd(&flow_acc, &slope, &mut twi, 10, 10, 30.0).expect("Failed to compute TWI");

        // TWI should be finite
        for &val in &twi {
            assert!(val.is_finite());
        }
    }

    #[test]
    fn test_compute_spi() {
        let flow_acc = vec![10.0_f32; 100];
        let slope = vec![10.0_f32; 100]; // 10 degrees
        let mut spi = vec![0.0_f32; 100];

        compute_spi_simd(&flow_acc, &slope, &mut spi, 10, 10, 30.0).expect("Failed to compute SPI");

        // SPI should be finite and non-negative
        for &val in &spi {
            assert!(val.is_finite());
        }
    }

    #[test]
    fn test_compute_sti() {
        let flow_acc = vec![10.0_f32; 100];
        let slope = vec![10.0_f32; 100]; // 10 degrees
        let mut sti = vec![0.0_f32; 100];

        compute_sti_simd(&flow_acc, &slope, &mut sti, 10, 10, 30.0).expect("Failed to compute STI");

        // STI should be finite and non-negative
        for &val in &sti {
            assert!(val.is_finite() && val >= 0.0);
        }
    }
}