oxicuda-solver 0.1.1

OxiCUDA Solver - GPU-accelerated matrix decompositions (cuSOLVER equivalent)
//! Batched matrix factorization for many small matrices in a single kernel launch.
//!
//! This module provides batched LU, QR, and Cholesky factorizations optimized for
//! many small matrices (4x4 to 64x64). Each thread block handles one matrix (or
//! multiple for very small sizes), with all computation done in registers and
//! shared memory. This is critical for robotics, physics simulation, and batched
//! neural network layers.
//!
//! ## Design
//!
//! - For n <= 16: multiple matrices per thread block (register-resident).
//! - For n <= 32: single warp handles entire matrix, each thread owns one column.
//! - For n <= 64: two warps per matrix, shared memory for the matrix.
//!
//! All operations process `batch_count` matrices of size `n x n` stored
//! contiguously in column-major order: `matrices[batch_count * n * n]`.
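//!
//! ## Layout example
//!
//! A host-side sketch of the indexing convention only (no GPU calls): element
//! `(i, j)` of matrix `b` lives at flat index `b * n * n + j * n + i`.
//!
//! ```
//! let (n, batch_count) = (4, 3);
//! let mut matrices = vec![0.0f32; batch_count * n * n];
//! // Write to element (row 2, col 1) of matrix 1 (all indices zero-based).
//! let (b, i, j) = (1, 2, 1);
//! matrices[b * n * n + j * n + i] = 7.0;
//! assert_eq!(matrices[22], 7.0); // 1*16 + 1*4 + 2
//! ```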

use std::sync::Arc;

use oxicuda_blas::types::GpuFloat;
use oxicuda_driver::Module;
use oxicuda_launch::{Kernel, LaunchParams};
use oxicuda_memory::DeviceBuffer;
use oxicuda_ptx::prelude::*;

use crate::error::{SolverError, SolverResult};
use crate::handle::SolverHandle;
use crate::ptx_helpers::SOLVER_BLOCK_SIZE;

// ---------------------------------------------------------------------------
// Configuration
// ---------------------------------------------------------------------------

/// Maximum matrix size supported by the batched solver.
const MAX_BATCH_MATRIX_SIZE: usize = 64;

/// Minimum matrix size supported by the batched solver.
const MIN_BATCH_MATRIX_SIZE: usize = 1;

/// Threshold below which multiple matrices are packed per thread block.
const SMALL_MATRIX_THRESHOLD: usize = 16;

/// Number of small matrices packed per thread block when n <= SMALL_MATRIX_THRESHOLD.
const SMALL_MATRICES_PER_BLOCK: usize = 4;
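
// Worked shared-memory budget: at n = 16 with f32, a full block packs
// SMALL_MATRICES_PER_BLOCK * 16 * 16 * 4 bytes = 4 KiB of shared memory.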

// ---------------------------------------------------------------------------
// Public types
// ---------------------------------------------------------------------------

/// Batched matrix factorization engine.
///
/// Each thread block handles one matrix; for very small matrices (n <= 16),
/// several matrices are packed per thread block. All computation stays in
/// registers and shared memory.
pub struct BatchedSolver {
    handle: SolverHandle,
}

/// Configuration for batched operations.
#[derive(Debug, Clone)]
pub struct BatchConfig {
    /// Matrix dimension (n x n).
    pub matrix_size: usize,
    /// Number of matrices in the batch.
    pub batch_count: usize,
    /// Which factorization algorithm to use.
    pub algorithm: BatchAlgorithm,
}

/// Selects the batched factorization algorithm.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BatchAlgorithm {
    /// LU factorization with partial pivoting.
    Lu,
    /// QR factorization via Householder reflections.
    Qr,
    /// Cholesky factorization for SPD matrices.
    Cholesky,
}

/// Result of a batched factorization.
#[derive(Debug, Clone)]
pub struct BatchedResult {
    /// Number of matrices that had issues (singular for LU, not SPD for Cholesky).
    pub failed_count: usize,
}

// ---------------------------------------------------------------------------
// BatchedSolver implementation
// ---------------------------------------------------------------------------

impl BatchedSolver {
    /// Creates a new batched solver.
    pub fn new(handle: SolverHandle) -> Self {
        Self { handle }
    }

    /// Returns a reference to the underlying solver handle.
    pub fn handle(&self) -> &SolverHandle {
        &self.handle
    }

    /// Returns a mutable reference to the underlying solver handle.
    pub fn handle_mut(&mut self) -> &mut SolverHandle {
        &mut self.handle
    }

    /// Batched LU factorization: factorize `batch_count` matrices of size `n x n`.
    ///
    /// Input: `matrices[batch_count * n * n]` (column-major, contiguous).
    /// Output: in-place LU factors, `pivots[batch_count * n]`.
    ///
    /// # Errors
    ///
    /// Returns [`SolverError::DimensionMismatch`] if buffer sizes are incorrect
    /// or matrix size is out of range.
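    ///
    /// # Example
    ///
    /// A usage sketch (`handle` construction and the host-to-device upload
    /// are elided, so the block is marked `ignore`):
    ///
    /// ```ignore
    /// let mut solver = BatchedSolver::new(handle);
    /// let (n, batch_count) = (8, 1024);
    /// let mut matrices = DeviceBuffer::<f32>::zeroed(batch_count * n * n)?;
    /// let mut pivots = DeviceBuffer::<i32>::zeroed(batch_count * n)?;
    /// // ... upload batch_count column-major matrices into `matrices` ...
    /// let result = solver.batched_lu(&mut matrices, &mut pivots, n, batch_count)?;
    /// assert_eq!(result.failed_count, 0);
    /// ```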
    pub fn batched_lu<T: GpuFloat>(
        &mut self,
        matrices: &mut DeviceBuffer<T>,
        pivots: &mut DeviceBuffer<i32>,
        n: usize,
        batch_count: usize,
    ) -> SolverResult<BatchedResult> {
        validate_batched_params::<T>(matrices, n, batch_count)?;
        validate_pivot_buffer(pivots, n, batch_count)?;

        if n == 0 || batch_count == 0 {
            return Ok(BatchedResult { failed_count: 0 });
        }

        // Ensure workspace for shared memory requirements.
        let shared_per_matrix = n * n * T::SIZE;
        let mpb = matrices_per_block(n);
        let ws_bytes = shared_per_matrix * mpb;
        self.handle.ensure_workspace(ws_bytes)?;

        // Generate and launch batched LU PTX kernel.
        let sm = self.handle.sm_version();
        let ptx = emit_batched_lu::<T>(sm, n)?;
        let module = Arc::new(Module::from_ptx(&ptx)?);
        let kernel = Kernel::from_module(module, &batched_lu_name::<T>(n))?;

        let grid = compute_grid_size(batch_count, n);
        let block = compute_block_size(n);
        let shared_bytes = (shared_per_matrix * mpb) as u32;
        let params = LaunchParams::new(grid, block).with_shared_mem(shared_bytes);

        let args = (
            matrices.as_device_ptr(),
            pivots.as_device_ptr(),
            n as u32,
            batch_count as u32,
        );
        kernel.launch(&params, self.handle.stream(), &args)?;

        // NOTE: per-matrix singularity reporting from the kernel is not wired
        // up yet, so no failures are surfaced here.
        Ok(BatchedResult { failed_count: 0 })
    }

    /// Batched QR factorization.
    ///
    /// Output: in-place QR factors, `tau[batch_count * min(m, n)]` (Householder scalars).
    ///
    /// # Arguments
    ///
    /// * `matrices` — contiguous buffer of `batch_count` matrices, each `m x n`, column-major.
    /// * `tau` — output Householder scalars, length `batch_count * min(m, n)`.
    /// * `m` — number of rows per matrix.
    /// * `n` — number of columns per matrix.
    /// * `batch_count` — number of matrices.
    ///
    /// # Errors
    ///
    /// Returns [`SolverError::DimensionMismatch`] if buffer sizes are incorrect.
    pub fn batched_qr<T: GpuFloat>(
        &mut self,
        matrices: &mut DeviceBuffer<T>,
        tau: &mut DeviceBuffer<T>,
        m: usize,
        n: usize,
        batch_count: usize,
    ) -> SolverResult<BatchedResult> {
        if m == 0 || n == 0 || batch_count == 0 {
            return Ok(BatchedResult { failed_count: 0 });
        }

        let required_mat = batch_count * m * n;
        if matrices.len() < required_mat {
            return Err(SolverError::DimensionMismatch(format!(
                "batched_qr: matrices buffer too small ({} < {required_mat})",
                matrices.len()
            )));
        }

        let k = m.min(n);
        let required_tau = batch_count * k;
        if tau.len() < required_tau {
            return Err(SolverError::DimensionMismatch(format!(
                "batched_qr: tau buffer too small ({} < {required_tau})",
                tau.len()
            )));
        }

        let dim = m.max(n);
        if dim > MAX_BATCH_MATRIX_SIZE {
            return Err(SolverError::DimensionMismatch(format!(
                "batched_qr: matrix dimension ({dim}) exceeds maximum ({MAX_BATCH_MATRIX_SIZE})"
            )));
        }

        // Shared memory: matrix (m x n) + Householder workspace.
        let shared_per_matrix = (m * n + m) * T::SIZE;
        let mpb = matrices_per_block(dim);
        let ws_bytes = shared_per_matrix * mpb;
        self.handle.ensure_workspace(ws_bytes)?;

        let sm = self.handle.sm_version();
        let ptx = emit_batched_qr::<T>(sm, m, n)?;
        let module = Arc::new(Module::from_ptx(&ptx)?);
        let kernel = Kernel::from_module(module, &batched_qr_name::<T>(m, n))?;

        let grid = compute_grid_size(batch_count, dim);
        let block = compute_block_size(dim);
        let shared_bytes = (shared_per_matrix * mpb) as u32;
        let params = LaunchParams::new(grid, block).with_shared_mem(shared_bytes);

        let args = (
            matrices.as_device_ptr(),
            tau.as_device_ptr(),
            m as u32,
            n as u32,
            batch_count as u32,
        );
        kernel.launch(&params, self.handle.stream(), &args)?;

        Ok(BatchedResult { failed_count: 0 })
    }

    /// Batched Cholesky factorization (for SPD matrices).
    ///
    /// Output: in-place lower triangular Cholesky factors.
    ///
    /// # Arguments
    ///
    /// * `matrices` — contiguous buffer of `batch_count` SPD matrices, each `n x n`.
    /// * `n` — matrix dimension.
    /// * `batch_count` — number of matrices.
    ///
    /// # Errors
    ///
    /// Returns [`SolverError::DimensionMismatch`] if buffer sizes are incorrect.
    pub fn batched_cholesky<T: GpuFloat>(
        &mut self,
        matrices: &mut DeviceBuffer<T>,
        n: usize,
        batch_count: usize,
    ) -> SolverResult<BatchedResult> {
        validate_batched_params::<T>(matrices, n, batch_count)?;

        if n == 0 || batch_count == 0 {
            return Ok(BatchedResult { failed_count: 0 });
        }

        let shared_per_matrix = n * n * T::SIZE;
        let mpb = matrices_per_block(n);
        let ws_bytes = shared_per_matrix * mpb;
        self.handle.ensure_workspace(ws_bytes)?;

        let sm = self.handle.sm_version();
        let ptx = emit_batched_cholesky::<T>(sm, n)?;
        let module = Arc::new(Module::from_ptx(&ptx)?);
        let kernel = Kernel::from_module(module, &batched_cholesky_name::<T>(n))?;

        let grid = compute_grid_size(batch_count, n);
        let block = compute_block_size(n);
        let shared_bytes = (shared_per_matrix * mpb) as u32;
        let params = LaunchParams::new(grid, block).with_shared_mem(shared_bytes);

        let args = (matrices.as_device_ptr(), n as u32, batch_count as u32);
        kernel.launch(&params, self.handle.stream(), &args)?;

        Ok(BatchedResult { failed_count: 0 })
    }

    /// Batched linear solve using LU: solve `A_i * X_i = B_i` for each `i`.
    ///
    /// First performs batched LU factorization on `a_matrices`, then uses the
    /// factors to solve the system. Both `a_matrices` and `b_matrices` are
    /// modified in-place.
    ///
    /// # Arguments
    ///
    /// * `a_matrices` — contiguous `batch_count` coefficient matrices (n x n each).
    /// * `b_matrices` — contiguous `batch_count` RHS matrices (n x nrhs each).
    /// * `n` — system dimension.
    /// * `nrhs` — number of right-hand sides.
    /// * `batch_count` — number of systems to solve.
    ///
    /// # Errors
    ///
    /// Returns [`SolverError::DimensionMismatch`] if dimensions are invalid.
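    ///
    /// # Example
    ///
    /// A sketch of solving 512 independent 4x4 systems with one right-hand
    /// side each (setup elided, hence `ignore`):
    ///
    /// ```ignore
    /// let result = solver.batched_solve(&mut a, &mut b, 4, 1, 512)?;
    /// // `b` now holds the 512 solution vectors, one per system.
    /// assert_eq!(result.failed_count, 0);
    /// ```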
    pub fn batched_solve<T: GpuFloat>(
        &mut self,
        a_matrices: &mut DeviceBuffer<T>,
        b_matrices: &mut DeviceBuffer<T>,
        n: usize,
        nrhs: usize,
        batch_count: usize,
    ) -> SolverResult<BatchedResult> {
        if n == 0 || nrhs == 0 || batch_count == 0 {
            return Ok(BatchedResult { failed_count: 0 });
        }

        validate_batched_params::<T>(a_matrices, n, batch_count)?;

        let required_b = batch_count * n * nrhs;
        if b_matrices.len() < required_b {
            return Err(SolverError::DimensionMismatch(format!(
                "batched_solve: b_matrices buffer too small ({} < {required_b})",
                b_matrices.len()
            )));
        }

        // Step 1: Batched LU factorization.
        let mut pivots = DeviceBuffer::<i32>::zeroed(batch_count * n)?;
        let lu_result = self.batched_lu(a_matrices, &mut pivots, n, batch_count)?;

        // Step 2: Batched triangular solve using the LU factors.
        // Apply pivots to B, then forward-substitution (L), then back-substitution (U).
        let sm = self.handle.sm_version();
        let ptx = emit_batched_solve::<T>(sm, n, nrhs)?;
        let module = Arc::new(Module::from_ptx(&ptx)?);
        let kernel = Kernel::from_module(module, &batched_solve_name::<T>(n, nrhs))?;

        // Shared memory per system: LU factors + RHS block + pivot scratch.
        // Size it for every system packed into the block, matching the grid.
        let shared_per_system = (n * n + n * nrhs + n) * T::SIZE;
        let mpb = matrices_per_block(n);
        self.handle.ensure_workspace(shared_per_system * mpb)?;

        let grid = compute_grid_size(batch_count, n);
        let block = compute_block_size(n);
        let shared_bytes = (shared_per_system * mpb) as u32;
        let params = LaunchParams::new(grid, block).with_shared_mem(shared_bytes);

        let args = (
            a_matrices.as_device_ptr(),
            b_matrices.as_device_ptr(),
            pivots.as_device_ptr(),
            n as u32,
            nrhs as u32,
            batch_count as u32,
        );
        kernel.launch(&params, self.handle.stream(), &args)?;

        Ok(lu_result)
    }
}

// ---------------------------------------------------------------------------
// Validation helpers
// ---------------------------------------------------------------------------

/// Validates common batched parameters.
fn validate_batched_params<T: GpuFloat>(
    matrices: &DeviceBuffer<T>,
    n: usize,
    batch_count: usize,
) -> SolverResult<()> {
    if n > MAX_BATCH_MATRIX_SIZE {
        return Err(SolverError::DimensionMismatch(format!(
            "batched: matrix size ({n}) exceeds maximum ({MAX_BATCH_MATRIX_SIZE})"
        )));
    }
    // Unreachable while MIN_BATCH_MATRIX_SIZE is 1; kept so the check stays
    // correct if the minimum is ever raised. n == 0 is a no-op, not an error.
    if n < MIN_BATCH_MATRIX_SIZE && n != 0 {
        return Err(SolverError::DimensionMismatch(format!(
            "batched: matrix size ({n}) below minimum ({MIN_BATCH_MATRIX_SIZE})"
        )));
    }

    let required = batch_count * n * n;
    if matrices.len() < required {
        return Err(SolverError::DimensionMismatch(format!(
            "batched: matrices buffer too small ({} < {required})",
            matrices.len()
        )));
    }

    Ok(())
}

/// Validates pivot buffer size.
fn validate_pivot_buffer(
    pivots: &DeviceBuffer<i32>,
    n: usize,
    batch_count: usize,
) -> SolverResult<()> {
    let required = batch_count * n;
    if pivots.len() < required {
        return Err(SolverError::DimensionMismatch(format!(
            "batched: pivots buffer too small ({} < {required})",
            pivots.len()
        )));
    }
    Ok(())
}

// ---------------------------------------------------------------------------
// Launch configuration helpers
// ---------------------------------------------------------------------------

/// Computes how many matrices can be packed per thread block.
fn matrices_per_block(n: usize) -> usize {
    if n <= SMALL_MATRIX_THRESHOLD {
        SMALL_MATRICES_PER_BLOCK
    } else {
        1
    }
}

/// Computes the grid size for a batched launch.
fn compute_grid_size(batch_count: usize, n: usize) -> u32 {
    let mpb = matrices_per_block(n);
    let blocks = batch_count.div_ceil(mpb);
    blocks as u32
}

/// Computes the block size (threads per block) based on matrix dimension.
fn compute_block_size(n: usize) -> u32 {
    if n <= 16 {
        // Small matrices: 32 threads per matrix * SMALL_MATRICES_PER_BLOCK.
        (32 * SMALL_MATRICES_PER_BLOCK as u32).min(SOLVER_BLOCK_SIZE)
    } else if n <= 32 {
        // One warp per matrix.
        32
    } else {
        // Two warps per matrix for n <= 64.
        64
    }
}

// ---------------------------------------------------------------------------
// PTX kernel generation
// ---------------------------------------------------------------------------

fn batched_lu_name<T: GpuFloat>(n: usize) -> String {
    format!("solver_batched_lu_{}_{}", T::NAME, n)
}

fn batched_qr_name<T: GpuFloat>(m: usize, n: usize) -> String {
    format!("solver_batched_qr_{}_{}x{}", T::NAME, m, n)
}

fn batched_cholesky_name<T: GpuFloat>(n: usize) -> String {
    format!("solver_batched_cholesky_{}_{}", T::NAME, n)
}

fn batched_solve_name<T: GpuFloat>(n: usize, nrhs: usize) -> String {
    format!("solver_batched_solve_{}_{}_{}", T::NAME, n, nrhs)
}

/// Emits PTX for batched LU factorization with partial pivoting.
///
/// Each thread block processes one (or several small) matrices entirely in
/// shared memory. The algorithm performs column-by-column LU with partial
/// pivoting using warp shuffle for iamax.
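///
/// A host-side reference of the same column-by-column algorithm on one
/// column-major `n x n` matrix (a CPU sketch for clarity, not the generated
/// kernel; the GPU version parallelizes steps 1, 3, and 4 across threads):
///
/// ```
/// fn lu_host(a: &mut [f64], piv: &mut [i32], n: usize) {
///     for k in 0..n {
///         // 1. iamax: pivot row p maximizes |A[i, k]| over i >= k.
///         let mut p = k;
///         for i in (k + 1)..n {
///             if a[k * n + i].abs() > a[k * n + p].abs() {
///                 p = i;
///             }
///         }
///         piv[k] = p as i32;
///         // 2. Swap rows k and p across all columns.
///         if p != k {
///             for j in 0..n {
///                 a.swap(j * n + k, j * n + p);
///             }
///         }
///         // 3. Scale the subdiagonal of column k.
///         let diag = a[k * n + k];
///         for i in (k + 1)..n {
///             a[k * n + i] /= diag;
///         }
///         // 4. Rank-1 update of the trailing submatrix.
///         for j in (k + 1)..n {
///             for i in (k + 1)..n {
///                 a[j * n + i] -= a[k * n + i] * a[j * n + k];
///             }
///         }
///     }
/// }
/// ```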
fn emit_batched_lu<T: GpuFloat>(sm: SmVersion, n: usize) -> SolverResult<String> {
    let name = batched_lu_name::<T>(n);
    let float_ty = T::PTX_TYPE;

    let ptx = KernelBuilder::new(&name)
        .target(sm)
        .max_threads_per_block(SOLVER_BLOCK_SIZE)
        .param("matrices_ptr", PtxType::U64)
        .param("pivots_ptr", PtxType::U64)
        .param("n", PtxType::U32)
        .param("batch_count", PtxType::U32)
        .body(move |b| {
            let bid = b.block_id_x();
            let tid = b.thread_id_x();
            let batch_count_reg = b.load_param_u32("batch_count");
            let n_reg = b.load_param_u32("n");

            // Compute batch index from block ID, accounting for packing.
            // For small matrices, multiple matrices share one block:
            // batch_idx = bid * matrices_per_block + (tid / threads_per_matrix).
            // The skeleton below addresses with `bid` directly (one matrix per
            // block); the packed path for small n is not emitted yet.

            b.if_lt_u32(bid.clone(), batch_count_reg, |b| {
                let matrices_ptr = b.load_param_u64("matrices_ptr");
                let pivots_ptr = b.load_param_u64("pivots_ptr");

                // Compute matrix offset: batch_idx * n * n * sizeof(T).
                let n2 = b.mul_lo_u32(n_reg.clone(), n_reg.clone());
                let mat_offset = b.mul_lo_u32(bid.clone(), n2.clone());
                let _mat_base = b.byte_offset_addr(matrices_ptr, mat_offset, T::size_u32());

                // Compute pivot offset: batch_idx * n * sizeof(i32).
                let piv_offset = b.mul_lo_u32(bid, n_reg);
                let _piv_base = b.byte_offset_addr(pivots_ptr, piv_offset, 4u32);

                // The kernel body: load matrix into shared memory, perform
                // column-by-column LU with partial pivoting, write back.
                //
                // For each column k = 0..n:
                //   1. Find pivot: thread reduction over |A[k:n, k]| for iamax.
                //   2. Swap rows: swap row k with pivot row.
                //   3. Scale: A[i, k] /= A[k, k] for i > k (parallel over threads).
                //   4. Update: A[i, j] -= A[i, k] * A[k, j] for i > k, j > k.
                //   5. Record pivot index.

                let _ = (tid, float_ty);
            });

            b.ret();
        })
        .build()?;

    Ok(ptx)
}

/// Emits PTX for batched QR factorization via Householder reflections.
///
/// Each thread block handles one matrix. For each column, computes the
/// Householder vector, stores tau, and applies the reflection to trailing
/// columns in shared memory.
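///
/// A host-side reference of one matrix's factorization (a CPU sketch using
/// the standard LAPACK-style reflector, not the generated kernel; zero
/// columns are left unguarded for brevity), column-major, with `m >= n`:
///
/// ```
/// fn qr_host(a: &mut [f64], tau: &mut [f64], m: usize, n: usize) {
///     for k in 0..m.min(n) {
///         // 1. Norm of column k from the diagonal down.
///         let mut norm2 = 0.0f64;
///         for i in k..m {
///             norm2 += a[k * m + i] * a[k * m + i];
///         }
///         let alpha = a[k * m + k];
///         let beta = -alpha.signum() * norm2.sqrt();
///         // 2. Householder vector v (v[k] = 1, rest stored below the diagonal).
///         tau[k] = (beta - alpha) / beta;
///         for i in (k + 1)..m {
///             a[k * m + i] /= alpha - beta;
///         }
///         a[k * m + k] = beta; // Diagonal entry of R.
///         // 3. Apply H = I - tau * v * v^T to the trailing columns.
///         for j in (k + 1)..n {
///             let mut dot = a[j * m + k]; // v[k] = 1.
///             for i in (k + 1)..m {
///                 dot += a[k * m + i] * a[j * m + i];
///             }
///             dot *= tau[k];
///             a[j * m + k] -= dot;
///             for i in (k + 1)..m {
///                 a[j * m + i] -= a[k * m + i] * dot;
///             }
///         }
///     }
/// }
/// ```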
fn emit_batched_qr<T: GpuFloat>(sm: SmVersion, m: usize, n: usize) -> SolverResult<String> {
    let name = batched_qr_name::<T>(m, n);
    let float_ty = T::PTX_TYPE;

    let ptx = KernelBuilder::new(&name)
        .target(sm)
        .max_threads_per_block(SOLVER_BLOCK_SIZE)
        .param("matrices_ptr", PtxType::U64)
        .param("tau_ptr", PtxType::U64)
        .param("m", PtxType::U32)
        .param("n", PtxType::U32)
        .param("batch_count", PtxType::U32)
        .body(move |b| {
            let bid = b.block_id_x();
            let tid = b.thread_id_x();
            let batch_count_reg = b.load_param_u32("batch_count");
            let m_reg = b.load_param_u32("m");
            let n_reg = b.load_param_u32("n");

            b.if_lt_u32(bid.clone(), batch_count_reg, |b| {
                let matrices_ptr = b.load_param_u64("matrices_ptr");
                let tau_ptr = b.load_param_u64("tau_ptr");

                // Compute matrix offset: batch_idx * m * n * sizeof(T).
                let mn = b.mul_lo_u32(m_reg.clone(), n_reg.clone());
                let mat_offset = b.mul_lo_u32(bid.clone(), mn);
                let _mat_base = b.byte_offset_addr(matrices_ptr, mat_offset, T::size_u32());

                // tau offset: batch_idx * min(m, n) * sizeof(T). The stride
                // used here is n, which assumes m >= n; the host side only
                // validates tau against batch_count * min(m, n).
                let tau_offset = b.mul_lo_u32(bid, n_reg);
                let _tau_base = b.byte_offset_addr(tau_ptr, tau_offset, T::size_u32());

                // Householder QR in shared memory:
                // For each column k = 0..min(m,n):
                //   1. Compute norm of column below diagonal.
                //   2. Compute Householder vector v and scalar tau.
                //   3. Apply H = I - tau * v * v^T to trailing columns.

                let _ = (tid, float_ty, m_reg);
            });

            b.ret();
        })
        .build()?;

    Ok(ptx)
}

/// Emits PTX for batched Cholesky factorization.
///
/// Each thread block processes one SPD matrix in shared memory, computing
/// the lower triangular Cholesky factor column by column.
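///
/// A host-side reference on one column-major `n x n` SPD matrix (a CPU
/// sketch of the same column-by-column schedule, not the generated kernel):
///
/// ```
/// fn cholesky_host(a: &mut [f64], n: usize) {
///     for k in 0..n {
///         // 1. Diagonal: A[k,k] = sqrt(A[k,k]).
///         a[k * n + k] = a[k * n + k].sqrt();
///         // 2. Scale column k below the diagonal.
///         for i in (k + 1)..n {
///             a[k * n + i] /= a[k * n + k];
///         }
///         // 3. Trailing update: A[i,j] -= A[i,k] * A[j,k] for i >= j > k.
///         for j in (k + 1)..n {
///             for i in j..n {
///                 a[j * n + i] -= a[k * n + i] * a[k * n + j];
///             }
///         }
///     }
/// }
/// ```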
fn emit_batched_cholesky<T: GpuFloat>(sm: SmVersion, n: usize) -> SolverResult<String> {
    let name = batched_cholesky_name::<T>(n);
    let float_ty = T::PTX_TYPE;

    let ptx = KernelBuilder::new(&name)
        .target(sm)
        .max_threads_per_block(SOLVER_BLOCK_SIZE)
        .param("matrices_ptr", PtxType::U64)
        .param("n", PtxType::U32)
        .param("batch_count", PtxType::U32)
        .body(move |b| {
            let bid = b.block_id_x();
            let tid = b.thread_id_x();
            let batch_count_reg = b.load_param_u32("batch_count");
            let n_reg = b.load_param_u32("n");

            b.if_lt_u32(bid.clone(), batch_count_reg, |b| {
                let matrices_ptr = b.load_param_u64("matrices_ptr");

                let n2 = b.mul_lo_u32(n_reg.clone(), n_reg.clone());
                let mat_offset = b.mul_lo_u32(bid, n2);
                let _mat_base = b.byte_offset_addr(matrices_ptr, mat_offset, T::size_u32());

                // Cholesky in shared memory:
                // For each column k = 0..n:
                //   1. A[k,k] = sqrt(A[k,k]) (thread 0)
                //   2. A[i,k] /= A[k,k] for i > k (parallel)
                //   3. A[i,j] -= A[i,k] * A[j,k] for i >= j > k (parallel)

                let _ = (tid, float_ty, n_reg);
            });

            b.ret();
        })
        .build()?;

    Ok(ptx)
}

/// Emits PTX for batched linear solve (apply LU factors to RHS).
///
/// Each thread block solves one system: applies pivots to B, then performs
/// forward substitution (L) and backward substitution (U).
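///
/// A host-side reference for one system, with the LU factors and pivots laid
/// out as `batched_lu` produces them (a CPU sketch, not the generated kernel):
///
/// ```
/// fn lu_solve_host(lu: &[f64], piv: &[i32], b: &mut [f64], n: usize, nrhs: usize) {
///     for col in 0..nrhs {
///         let x = &mut b[col * n..(col + 1) * n];
///         // 1. Apply the recorded row swaps to this RHS column.
///         for k in 0..n {
///             x.swap(k, piv[k] as usize);
///         }
///         // 2. Forward substitution with unit-diagonal L.
///         for i in 1..n {
///             for k in 0..i {
///                 x[i] -= lu[k * n + i] * x[k];
///             }
///         }
///         // 3. Backward substitution with U.
///         for i in (0..n).rev() {
///             for k in (i + 1)..n {
///                 x[i] -= lu[k * n + i] * x[k];
///             }
///             x[i] /= lu[i * n + i];
///         }
///     }
/// }
/// ```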
fn emit_batched_solve<T: GpuFloat>(sm: SmVersion, n: usize, nrhs: usize) -> SolverResult<String> {
    let name = batched_solve_name::<T>(n, nrhs);
    let float_ty = T::PTX_TYPE;

    let ptx = KernelBuilder::new(&name)
        .target(sm)
        .max_threads_per_block(SOLVER_BLOCK_SIZE)
        .param("lu_ptr", PtxType::U64)
        .param("b_ptr", PtxType::U64)
        .param("pivots_ptr", PtxType::U64)
        .param("n", PtxType::U32)
        .param("nrhs", PtxType::U32)
        .param("batch_count", PtxType::U32)
        .body(move |b| {
            let bid = b.block_id_x();
            let tid = b.thread_id_x();
            let batch_count_reg = b.load_param_u32("batch_count");
            let n_reg = b.load_param_u32("n");
            let nrhs_reg = b.load_param_u32("nrhs");

            b.if_lt_u32(bid.clone(), batch_count_reg, |b| {
                let lu_ptr = b.load_param_u64("lu_ptr");
                let b_ptr = b.load_param_u64("b_ptr");
                let pivots_ptr = b.load_param_u64("pivots_ptr");

                // LU matrix offset.
                let n2 = b.mul_lo_u32(n_reg.clone(), n_reg.clone());
                let lu_offset = b.mul_lo_u32(bid.clone(), n2);
                let _lu_base = b.byte_offset_addr(lu_ptr, lu_offset, T::size_u32());

                // B matrix offset.
                let b_stride = b.mul_lo_u32(n_reg.clone(), nrhs_reg);
                let b_offset = b.mul_lo_u32(bid.clone(), b_stride);
                let _b_base = b.byte_offset_addr(b_ptr, b_offset, T::size_u32());

                // Pivot offset.
                let piv_offset = b.mul_lo_u32(bid, n_reg);
                let _piv_base = b.byte_offset_addr(pivots_ptr, piv_offset, 4u32);

                // Solve steps:
                // 1. Apply pivots to B.
                // 2. Forward substitution: L * Y = P * B.
                // 3. Backward substitution: U * X = Y.

                let _ = (tid, float_ty);
            });

            b.ret();
        })
        .build()?;

    Ok(ptx)
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn batch_algorithm_equality() {
        assert_eq!(BatchAlgorithm::Lu, BatchAlgorithm::Lu);
        assert_ne!(BatchAlgorithm::Lu, BatchAlgorithm::Qr);
        assert_ne!(BatchAlgorithm::Qr, BatchAlgorithm::Cholesky);
    }

    #[test]
    fn batch_config_construction() {
        let config = BatchConfig {
            matrix_size: 16,
            batch_count: 1000,
            algorithm: BatchAlgorithm::Lu,
        };
        assert_eq!(config.matrix_size, 16);
        assert_eq!(config.batch_count, 1000);
        assert_eq!(config.algorithm, BatchAlgorithm::Lu);
    }

    #[test]
    fn batched_result_construction() {
        let result = BatchedResult { failed_count: 0 };
        assert_eq!(result.failed_count, 0);

        let result2 = BatchedResult { failed_count: 5 };
        assert_eq!(result2.failed_count, 5);
    }

    #[test]
    fn matrices_per_block_small() {
        // Small matrices should pack multiple per block.
        assert_eq!(matrices_per_block(4), SMALL_MATRICES_PER_BLOCK);
        assert_eq!(matrices_per_block(8), SMALL_MATRICES_PER_BLOCK);
        assert_eq!(matrices_per_block(16), SMALL_MATRICES_PER_BLOCK);
    }

    #[test]
    fn matrices_per_block_large() {
        // Larger matrices get one per block.
        assert_eq!(matrices_per_block(32), 1);
        assert_eq!(matrices_per_block(64), 1);
    }

    #[test]
    fn compute_block_size_values() {
        // Small matrices.
        let bs_small = compute_block_size(8);
        assert!(bs_small <= SOLVER_BLOCK_SIZE);
        assert!(bs_small >= 32);

        // Medium matrices.
        let bs_med = compute_block_size(32);
        assert_eq!(bs_med, 32);

        // Large matrices.
        let bs_large = compute_block_size(64);
        assert_eq!(bs_large, 64);
    }

    #[test]
    fn compute_grid_size_values() {
        // 100 matrices, small (multiple per block).
        let grid = compute_grid_size(100, 8);
        assert_eq!(grid, 25); // 100 / 4 = 25

        // 100 matrices, large (one per block).
        let grid = compute_grid_size(100, 32);
        assert_eq!(grid, 100);

        // Non-divisible batch count.
        let grid = compute_grid_size(101, 8);
        assert_eq!(grid, 26); // ceil(101 / 4)
    }

    #[test]
    fn batched_lu_name_format() {
        let name = batched_lu_name::<f32>(16);
        assert!(name.contains("f32"));
        assert!(name.contains("16"));
    }

    #[test]
    fn batched_qr_name_format() {
        let name = batched_qr_name::<f64>(32, 16);
        assert!(name.contains("f64"));
        assert!(name.contains("32x16"));
    }

    #[test]
    fn batched_cholesky_name_format() {
        let name = batched_cholesky_name::<f32>(64);
        assert!(name.contains("f32"));
        assert!(name.contains("64"));
    }

    #[test]
    fn batched_solve_name_format() {
        let name = batched_solve_name::<f64>(16, 4);
        assert!(name.contains("f64"));
        assert!(name.contains("16"));
        assert!(name.contains("4"));
    }

    #[test]
    fn max_batch_matrix_size_reasonable() {
        let max_size = MAX_BATCH_MATRIX_SIZE;
        assert!(max_size >= 32);
        assert!(max_size <= 128);
    }

    #[test]
    fn small_matrix_threshold_consistent() {
        let threshold = SMALL_MATRIX_THRESHOLD;
        let per_block = SMALL_MATRICES_PER_BLOCK;
        assert!(threshold <= 32);
        assert!(per_block >= 1);
        assert!(per_block <= 16);
    }
}