kryst 3.2.1 - Docs.rs

#![cfg(feature = "backend-faer")]

//! HYPRE-inspired ILU factorization (canonical implementation).
//!
//! This module provides Kryst's canonical ILU engine built for `faer::Mat<f64>`.
//! The implementation targets real-valued matrices and exposes the same knobs you
//! expect from the HYPRE ILU family (tolerances, pivot policies, Schur handling, etc.).
//!
//! # ILU variants
//! - `IluType::ILU0`: classical ILU(0) with no fill-in beyond the original pattern.
//! - `IluType::MILU0`: modified ILU(0) that folds dropped contributions into the diagonal
//!   to keep row sums approximately constant.
//! - `IluType::ILUK`: level-of-fill ILU(k); fill is controlled via
//!   [`IluConfig::level_of_fill`].
//! - `IluType::ILUT`: threshold-based ILU with dropping and optional per-row fill limiting
//!   via [`IluConfig::drop_tolerance`] and [`IluConfig::max_fill_per_row`].
//! - `BlockJacobi` / `GmresIluk` / `GmresIlut`: HYPRE-style aliases. These builders defer to
//!   [`Ilup`] or [`Ilut`] for performance when it makes sense while keeping the same configuration
//!   story. They currently dispatch to the unified [`Ilu`] implementation unless
//!   [`Ilu::create_specialized`] routes `IluType::ILUT` to the simpler [`crate::preconditioner::ilut::Ilut`].
//!
//! # Real vs complex
//! `Ilu` currently factorizes only real-valued matrices (`faer::Mat<f64>`). The factorization
//! and triangular solves are all done in real arithmetic. Complex-valued Krylov solvers should
//! use simpler preconditioners (Jacobi, diagonal scaling) or ILU variants with explicit
//! [`KPreconditioner`] bridges (`Ilup`, `Ilut`, `Ilutp`), which internally factorize in real
//! arithmetic and map complex vectors through a bridge.
//!
//! # Parallel execution
//! [`IluConfig::enable_parallel_factorization`] and
//! [`IluConfig::enable_parallel_triangular_solve`] control optional parallel behavior when the
//! `rayon` feature is enabled. Factorization remains mostly sequential today and the flag is held
//! for future ParILU-style experiments, while triangular solves currently level-schedule the
//! substitutions and only expose true concurrency when built with `rayon`.
//! The [`IluConfig::parallel_chunk_size`] parameter caps the number of rows each task touches; it
//! is mainly a tuning knob for these experimental paths.
//!
//! # References
//! - HYPRE ParILU implementation
//! - Saad, Y. (2003). *Iterative Methods for Sparse Linear Systems*
//! - Li, X. (2005). *Iterative Methods for Large Sparse Linear Systems*
//! See `examples/poisson_spd_ilu0_vs_jacobi.rs` for a Jacobi vs ILU(0) comparison on a 1D Poisson problem.
//! See `examples/mpi_poisson_block_jacobi_ilu.rs` for a distributed block-Jacobi + ILU(0) walk-through.

#[cfg(feature = "complex")]
use crate::algebra::bridge::{BridgeScratch, copy_real_into_scalar, copy_scalar_to_real_in};
use crate::algebra::scalar::KrystScalar;
#[cfg(feature = "complex")]
use crate::algebra::scalar::S as GlobalScalar;
use crate::error::KError;
use crate::matrix::sparse::CsrMatrix;
#[cfg(feature = "complex")]
use crate::ops::kpc::KPreconditioner;
use crate::preconditioner::LocalPreconditioner;
use crate::preconditioner::stats::{ParIluHistory, ParIluIterSample};
use crate::preconditioner::{PcSide, legacy::Preconditioner, pivot::*, tri_solve::TriangularSolve};
use crate::utils::conditioning::{apply_dense_transforms, ConditioningOptions};
use crate::utils::metrics::{Counters, SolveTimer};
use crate::utils::monitor::{Event, Monitor};
use faer::Mat;
#[cfg(feature = "rayon")]
use std::sync::Arc;
use std::sync::Mutex;

#[cfg(feature = "rayon")]
use rayon::prelude::*;

#[cfg(feature = "logging")]
use log::{debug, info, trace, warn};

// ILU is restricted to real scalars.
type S = f64;
type Real = f64;

/// HYPRE-inspired ILU types
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum IluType {
    /// ILU(0) - Zero fill-in factorization
    ILU0 = 0,
    /// ILU(k) - Level-based fill-in factorization  
    ILUK = 1,
    /// ILUT - Threshold-based factorization
    ILUT = 2,
    /// Modified ILU(0) for better stability
    MILU0 = 3,
    /// Block Jacobi with ILU(0)
    BlockJacobi = 10,
    /// GMRES with ILU(k) preconditioning  
    GmresIluk = 20,
    /// GMRES with ILUT preconditioning
    GmresIlut = 21,
}

/// HYPRE-inspired reordering strategies
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ReorderingType {
    /// No reordering
    None = 0,
    /// Reverse Cuthill-McKee
    RCM = 1,
    /// Approximate Minimum Degree
    AMD = 2,
    /// Natural ordering
    Natural = 3,
}

/// Enhanced triangular solve options
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TriSolveType {
    /// Exact triangular solve
    Exact = 0,
    /// Iterative triangular solve with Jacobi
    Jacobi = 1,
    /// Iterative triangular solve with Gauss-Seidel
    GaussSeidel = 2,
}

/// HYPRE-inspired ILU configuration.
///
/// When `ilu_type` is [`IluType::ILUT`], the builder performs a unified threshold-based ILU
/// factorization in [`Ilu`]. `Ilu::create_specialized` may instead dispatch that variant to the
/// simpler [`crate::preconditioner::ilut::Ilut`] implementation for performance, but the unified
/// `Ilu` stays feature-complete (pivot policy, iterative solves, logging, etc.).
///
/// ```no_run
/// # #[cfg(feature = "backend-faer")]
/// # {
/// use kryst::preconditioner::ilu::{Ilu, IluConfig, IluType, TriSolveType};
///
/// let mut cfg = IluConfig::default();
/// cfg.ilu_type = IluType::ILUT;
/// cfg.drop_tolerance = 1e-4;
/// cfg.max_fill_per_row = 50;
/// cfg.triangular_solve = TriSolveType::Exact;
///
/// let _ilu = Ilu::new_with_config(cfg).unwrap();
/// # }
/// ```
#[derive(Clone, Debug)]
pub struct IluConfig {
    /// ILU factorization type (HYPRE: ilu_type)
    pub ilu_type: IluType,
    /// Level of fill for ILU(k) (HYPRE: lfil)
    pub level_of_fill: usize,
    /// Maximum nonzeros per row (HYPRE: maxRowNnz)
    pub max_fill_per_row: usize,
    /// Drop tolerance for ILUT (HYPRE: droptol[0])
    pub drop_tolerance: Real,
    /// Drop tolerance for off-diagonal blocks (HYPRE: droptol[1])
    pub offdiag_drop_tolerance: Real,
    /// Drop tolerance for Schur complement (HYPRE: droptol[2])
    pub schur_drop_tolerance: Real,
    /// Reordering strategy (HYPRE: reordering_type)
    pub reordering_type: ReorderingType,
    /// Triangular solve type (HYPRE: tri_solve)
    pub triangular_solve: TriSolveType,
    /// Lower triangular Jacobi iterations (HYPRE: lower_jacobi_iters)
    pub lower_jacobi_iters: usize,
    /// Upper triangular Jacobi iterations (HYPRE: upper_jacobi_iters)
    pub upper_jacobi_iters: usize,
    /// Tolerance for iterative solve (HYPRE: tol)
    pub tolerance: Real,
    /// Maximum iterations for iterative solve (HYPRE: max_iter)
    pub max_iterations: usize,
    /// Logging level (HYPRE: logging)
    pub logging_level: usize,
    /// Print level for diagnostics (HYPRE: print_level)
    pub print_level: usize,
    /// Enable IEEE safety checks
    pub ieee_checks: bool,
    /// Enable workspace optimization
    pub optimize_workspace: bool,
    /// Pivot handling policy (see [`crate::preconditioner::pivot`] for details)
    pub pivot_policy: PivotPolicy,
    /// Enable parallel factorization (requires the `rayon` feature).
    ///
    /// Factorization remains mostly sequential today; this flag is reserved for future ParILU-style
    /// implementations and experimental workspaces.
    pub enable_parallel_factorization: bool,
    /// Enable parallel triangular solves (requires the `rayon` feature).
    ///
    /// When enabled, level-scheduled forward/backward substitutions are prepared. The current
    /// implementation still walks the levels sequentially but is structured to expose real
    /// concurrency when built with `rayon`.
    pub enable_parallel_triangular_solve: bool,
    /// Chunk size for parallel operations.
    ///
    /// Controls how many rows each task processes in the parallel factorization/triangular solve
    /// paths. It mostly serves as a tuning knob for the experimental code paths today.
    pub parallel_chunk_size: usize,
    /// Enable distributed memory support (requires MPI)
    pub enable_distributed: bool,
    /// Enable ParILU refinement after the initial ILU factorization
    pub parilu_enabled: bool,
    /// Maximum ParILU sweeps
    pub parilu_max_iters: usize,
    /// Minimum ParILU sweeps before early-exit checks
    pub parilu_min_iters: usize,
    /// Convergence tolerance for ParILU residual
    pub parilu_tol: Real,
    /// Relaxation factor for ParILU fixed-point updates
    pub parilu_omega: Real,
    /// Optional conditioning transforms applied before factorization.
    pub conditioning: ConditioningOptions,
}

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ParFactorizationMode {
    Serial,
    Block,
    ParIlu,
}

impl IluConfig {
    fn par_factor_mode(&self) -> ParFactorizationMode {
        if !self.enable_parallel_factorization {
            ParFactorizationMode::Serial
        } else if self.parilu_enabled {
            ParFactorizationMode::ParIlu
        } else {
            ParFactorizationMode::Block
        }
    }
}

impl Default for IluConfig {
    /// HYPRE-inspired robust defaults with parallel support
    fn default() -> Self {
        Self {
            ilu_type: IluType::ILU0,
            level_of_fill: 0,                      // HYPRE default for ILU(0)
            max_fill_per_row: 0,                   // HYPRE default: unlimited
            drop_tolerance: 1e-4,                  // HYPRE conservative default
            offdiag_drop_tolerance: 1e-4,          // HYPRE default
            schur_drop_tolerance: 1e-4,            // HYPRE default
            reordering_type: ReorderingType::None, // HYPRE default
            triangular_solve: TriSolveType::Exact, // HYPRE default
            lower_jacobi_iters: 1,                 // HYPRE default
            upper_jacobi_iters: 1,                 // HYPRE default
            tolerance: 1e-6,                       // HYPRE default
            max_iterations: 1,                     // HYPRE default for direct solve
            logging_level: 0,                      // No logging by default
            print_level: 0,                        // No printing by default
            ieee_checks: true,                     // Safety first
            optimize_workspace: true,              // Performance optimization
            pivot_policy: PivotPolicy::default(),
            enable_parallel_factorization: false, // Conservative default
            enable_parallel_triangular_solve: false, // Conservative default
            parallel_chunk_size: 64,              // Reasonable chunk size for cache efficiency
            enable_distributed: false,            // Conservative default
            parilu_enabled: false,
            parilu_max_iters: 0,
            parilu_min_iters: 0,
            parilu_tol: 1e-2,
            parilu_omega: 1.0,
            conditioning: ConditioningOptions::default(),
        }
    }
}

#[cfg(feature = "logging")]
fn print_ilu_banner(cfg: &IluConfig) {
    if cfg.logging_level == 0 {
        return;
    }
    info!("ILU Setup:");
    info!("  kind                 : {:?}", cfg.ilu_type);
    info!("  reordering           : {:?}", cfg.reordering_type);
    let tri = match cfg.triangular_solve {
        TriSolveType::Exact => "Exact".to_string(),
        TriSolveType::Jacobi => format!(
            "Jacobi (L:{} U:{})",
            cfg.lower_jacobi_iters, cfg.upper_jacobi_iters
        ),
        TriSolveType::GaussSeidel => "GaussSeidel".to_string(),
    };
    info!("  triangular solve     : {tri}");
    info!(
        "  iterative setup      : tol={:.2e}, max_iter={}",
        cfg.tolerance, cfg.max_iterations
    );
    info!(
        "  exec                 : distributed={}, par_factorization={}, par_trisolve={}",
        cfg.enable_distributed,
        cfg.enable_parallel_factorization,
        cfg.enable_parallel_triangular_solve
    );
    info!("  pivot                : {:?}", cfg.pivot_policy);
    info!(
        "  parilu               : enabled={}, max_iter={}, min_iter={}, tol={:.2e}, omega={:.2}",
        cfg.parilu_enabled,
        cfg.parilu_max_iters,
        cfg.parilu_min_iters,
        cfg.parilu_tol,
        cfg.parilu_omega
    );
}

/// HYPRE-inspired ILU builder for advanced configuration
pub struct IluBuilder {
    config: IluConfig,
}

impl IluBuilder {
    /// Create new builder with HYPRE defaults
    pub fn new() -> Self {
        Self {
            config: IluConfig::default(),
        }
    }

    /// Set ILU type (HYPRE: ilu_type)
    pub fn ilu_type(mut self, ilu_type: IluType) -> Self {
        self.config.ilu_type = ilu_type;
        self
    }

    /// Set level of fill for ILU(k) (HYPRE: lfil)
    pub fn level_of_fill(mut self, level: usize) -> Self {
        self.config.level_of_fill = level;
        self
    }

    /// Set maximum fill per row (HYPRE: maxRowNnz)
    pub fn max_fill_per_row(mut self, max_fill: usize) -> Self {
        self.config.max_fill_per_row = max_fill;
        self
    }

    /// Set drop tolerance for ILUT (HYPRE: droptol)
    pub fn drop_tolerance(mut self, tol: Real) -> Self {
        self.config.drop_tolerance = tol;
        self
    }

    /// Set reordering strategy (HYPRE: reordering_type)
    pub fn enable_reordering(mut self, reordering: ReorderingType) -> Self {
        self.config.reordering_type = reordering;
        self
    }

    /// Set triangular solve type (HYPRE: tri_solve)
    pub fn triangular_solve(mut self, solve_type: TriSolveType) -> Self {
        self.config.triangular_solve = solve_type;
        self
    }

    /// Set Jacobi iterations for triangular solves (HYPRE: lower/upper_jacobi_iters)
    pub fn jacobi_iterations(mut self, lower: usize, upper: usize) -> Self {
        self.config.lower_jacobi_iters = lower;
        self.config.upper_jacobi_iters = upper;
        self
    }

    /// Enable comprehensive logging (HYPRE: logging)
    pub fn enable_logging(mut self) -> Self {
        self.config.logging_level = 1;
        self
    }

    /// Set detailed logging level (HYPRE: logging)
    pub fn logging_level(mut self, level: usize) -> Self {
        self.config.logging_level = level;
        self
    }

    /// Enable printing (HYPRE: print_level)
    pub fn enable_printing(mut self) -> Self {
        self.config.print_level = 1;
        self
    }

    /// Set print level (HYPRE: print_level)
    pub fn print_level(mut self, level: usize) -> Self {
        self.config.print_level = level;
        self
    }

    /// Set pivot handling policy
    pub fn pivot_policy(mut self, policy: PivotPolicy) -> Self {
        self.config.pivot_policy = policy;
        self
    }

    /// Enable parallel factorization (requires rayon feature)
    pub fn enable_parallel_factorization(mut self) -> Self {
        self.config.enable_parallel_factorization = true;
        self
    }

    /// Enable parallel triangular solves (requires rayon feature)
    pub fn enable_parallel_triangular_solve(mut self) -> Self {
        self.config.enable_parallel_triangular_solve = true;
        self
    }

    /// Set chunk size for parallel operations
    pub fn parallel_chunk_size(mut self, chunk_size: usize) -> Self {
        self.config.parallel_chunk_size = chunk_size;
        self
    }

    /// Enable all parallel features
    pub fn enable_parallel(mut self) -> Self {
        self.config.enable_parallel_factorization = true;
        self.config.enable_parallel_triangular_solve = true;
        self
    }

    /// Enable ParILU refinement with explicit iteration controls
    pub fn enable_parilu(mut self, max_iters: usize, tol: Real, omega: Real) -> Self {
        self.config.parilu_enabled = true;
        self.config.parilu_max_iters = max_iters;
        self.config.parilu_tol = tol;
        self.config.parilu_omega = omega;
        self
    }

    /// Set the minimum ParILU sweeps before checking convergence
    pub fn parilu_min_iters(mut self, min_iters: usize) -> Self {
        self.config.parilu_min_iters = min_iters;
        self
    }

    /// Enable distributed memory support (requires MPI)
    pub fn enable_distributed(mut self) -> Self {
        self.config.enable_distributed = true;
        self
    }

    /// Build ILU preconditioner with configuration
    pub fn build(self) -> Result<Ilu, KError> {
        Ilu::new_with_config(self.config)
    }
}

impl Default for IluBuilder {
    fn default() -> Self {
        Self::new()
    }
}

/// HYPRE-inspired comprehensive ILU preconditioner with sparse storage
///
/// **Note:** ILU is currently restricted to real (`f64`) matrices only.
/// Complex-valued problems should use simpler preconditioners (e.g., Jacobi, diagonal scaling).
/// Complex problems currently require using different preconditioner types or the bridge-based
/// ILU variants (`Ilup`, `Ilut`, `Ilutp`).
/// This type implements [`LocalPreconditioner`] and is intended to be wrapped by
/// an MPI-aware distributed preconditioner; it performs no communication on its own.
pub struct Ilu {
    /// Configuration parameters
    config: IluConfig,
    /// Lower triangular factor in CSR format (unit diagonal)
    l: CsrMatrix<f64>,
    /// Upper triangular factor in CSR format
    u: CsrMatrix<f64>,
    /// Cached inverse of U's diagonal entries for fast solves
    inv_diag_u: Vec<f64>,
    /// Permutation arrays (HYPRE: perm, qperm)
    #[allow(dead_code)]
    row_perm: Vec<usize>,
    #[allow(dead_code)]
    col_perm: Vec<usize>,
    /// Consolidated preallocated workspace vectors for all operations
    workspace: IluWorkspace,
    #[cfg(feature = "rayon")]
    /// Level scheduling for lower triangular solves
    levels_l: Levels,
    #[cfg(feature = "rayon")]
    /// Level scheduling for upper triangular solves
    levels_u: Levels,
    /// Setup complexity metrics (HYPRE: operator_complexity)
    setup_complexity: f64,
    /// Factorization statistics
    nnz_l: usize,
    nnz_u: usize,
    num_zero_pivots: usize,
    /// Pivot handling statistics
    pivot_stats: PivotStats,
    /// Global scaling from A's diagonal
    max_diag_a: f64,
    /// Row-wise infinity norm of A
    row_inf_a: Vec<f64>,
    /// Row-wise Gershgorin estimate of A
    row_gersh_a: Vec<f64>,
    /// Running maximum of |U_kk|
    running_max_u: f64,
    /// Performance timing
    setup_time: f64,
    solve_ctrs: Counters,
    /// Optional ParILU iteration history
    history: Option<ParIluHistory>,
    /// Optional event monitor
    monitor: Option<Box<dyn Monitor>>,
}

/// Consolidated workspace for all ILU operations to minimize allocations
#[derive(Debug)]
pub struct IluWorkspace {
    /// Scratch buffer for triangular solves (sized once in setup)
    solve_buf: Mutex<Vec<f64>>,
    /// Secondary workspace for complex operations
    temp2: Mutex<Vec<f64>>,
    /// Workspace for level scheduling in parallel triangular solves
    levels: Mutex<Vec<usize>>,
    /// Workspace for sparse pattern operations
    pattern_work: Mutex<Vec<bool>>,
    /// Current workspace size
    size: usize,
}

impl IluWorkspace {
    /// Create new workspace with given size
    pub fn new(size: usize) -> Self {
        Self {
            solve_buf: Mutex::new(vec![0.0; size]),
            temp2: Mutex::new(vec![0.0; size]),
            levels: Mutex::new(vec![0; size]),
            pattern_work: Mutex::new(vec![false; size]),
            size,
        }
    }

    /// Resize workspace if needed (avoids reallocation when possible)
    pub fn ensure_size(&mut self, new_size: usize) {
        if new_size > self.size {
            self.solve_buf.lock().unwrap().resize(new_size, 0.0);
            self.temp2.lock().unwrap().resize(new_size, 0.0);
            self.levels.lock().unwrap().resize(new_size, 0);
            self.pattern_work.lock().unwrap().resize(new_size, false);
            self.size = new_size;
        }
    }

    /// Clear workspace (without deallocation)
    pub fn clear(&self) {
        for x in self.solve_buf.lock().unwrap().iter_mut() {
            *x = 0.0;
        }
        for x in self.temp2.lock().unwrap().iter_mut() {
            *x = 0.0;
        }
        for x in self.levels.lock().unwrap().iter_mut() {
            *x = 0;
        }
        for x in self.pattern_work.lock().unwrap().iter_mut() {
            *x = false;
        }
    }

    /// Borrow the solve buffer sized in `setup()`.
    #[inline]
    pub fn borrow_solve_buf(&self, n: usize) -> std::sync::MutexGuard<'_, Vec<f64>> {
        debug_assert!(
            self.size >= n,
            "workspace not sized; call ensure_size in setup()"
        );
        self.solve_buf.lock().unwrap()
    }
}

#[cfg(feature = "rayon")]
struct BlockFactorResult {
    l: CsrMatrix<f64>,
    u: CsrMatrix<f64>,
    pivot_stats: PivotStats,
    running_max: f64,
    zero_pivots: usize,
}

#[cfg(feature = "rayon")]
#[derive(Clone, Debug, Default)]
struct Levels {
    /// Rows grouped by level
    buckets: Vec<Vec<usize>>,
    /// Maximum level
    max_level: u32,
}

#[cfg(feature = "rayon")]
fn build_levels_lower(l: &CsrMatrix<f64>) -> Levels {
    let n = l.nrows();
    let mut lev = vec![0u32; n];
    let mut maxl = 0u32;
    for i in 0..n {
        let (cols, _vals) = l.row(i);
        let mut li = 0u32;
        for &j in cols {
            if j >= i {
                continue;
            }
            li = li.max(lev[j] + 1);
        }
        lev[i] = li;
        maxl = maxl.max(li);
    }
    let mut buckets = vec![Vec::new(); (maxl as usize) + 1];
    for (i, &l) in lev.iter().enumerate() {
        buckets[l as usize].push(i);
    }
    Levels {
        buckets,
        max_level: maxl,
    }
}

#[cfg(feature = "rayon")]
fn build_levels_upper(u: &CsrMatrix<f64>) -> Levels {
    let n = u.nrows();
    let mut lev = vec![0u32; n];
    let mut maxl = 0u32;
    for i in (0..n).rev() {
        let (cols, _vals) = u.row(i);
        let mut li = 0u32;
        for &j in cols {
            if j <= i {
                continue;
            }
            li = li.max(lev[j] + 1);
        }
        lev[i] = li;
        maxl = maxl.max(li);
    }
    let mut buckets = vec![Vec::new(); (maxl as usize) + 1];
    for (i, &l) in lev.iter().enumerate() {
        buckets[l as usize].push(i);
    }
    Levels {
        buckets,
        max_level: maxl,
    }
}

impl Ilu {
    /// Create new ILU with HYPRE defaults
    pub fn new() -> Self {
        Self::new_with_config(IluConfig::default()).unwrap()
    }

    /// Create ILU with comprehensive configuration
    pub fn new_with_config(config: IluConfig) -> Result<Self, KError> {
        Self::validate_config(&config)?;

        #[cfg(feature = "logging")]
        if config.logging_level > 0 {
            info!(
                "ILU Setup: Creating {:?} factorization with HYPRE-inspired configuration",
                config.ilu_type
            );
            debug!(
                "ILU Config: fill_level={}, drop_tol={:.2e}, reordering={:?}",
                config.level_of_fill, config.drop_tolerance, config.reordering_type
            );
        }

        Ok(Self {
            config,
            l: CsrMatrix::from_csr(0, 0, vec![0], Vec::new(), Vec::new()),
            u: CsrMatrix::from_csr(0, 0, vec![0], Vec::new(), Vec::new()),
            inv_diag_u: Vec::new(),
            row_perm: Vec::new(),
            col_perm: Vec::new(),
            workspace: IluWorkspace::new(0),
            #[cfg(feature = "rayon")]
            levels_l: Levels::default(),
            #[cfg(feature = "rayon")]
            levels_u: Levels::default(),
            setup_complexity: 0.0,
            nnz_l: 0,
            nnz_u: 0,
            num_zero_pivots: 0,
            pivot_stats: PivotStats::default(),
            max_diag_a: Real::default(),
            row_inf_a: Vec::new(),
            row_gersh_a: Vec::new(),
            running_max_u: Real::default(),
            setup_time: 0.0,
            solve_ctrs: Counters::new(),
            history: None,
            monitor: None,
        })
    }

    /// HYPRE-inspired configuration validation
    fn validate_config(config: &IluConfig) -> Result<(), KError> {
        if config.drop_tolerance < 0.0 {
            return Err(KError::InvalidInput(
                "drop_tolerance must be >= 0".to_string(),
            ));
        }

        if config.enable_parallel_triangular_solve && config.parallel_chunk_size == 0 {
            return Err(KError::InvalidInput(
                "parallel_chunk_size must be > 0 when parallel triangular solve is enabled"
                    .to_string(),
            ));
        }

        if config.tolerance <= 0.0 {
            return Err(KError::InvalidInput("tolerance must be > 0".to_string()));
        }

        if config.parilu_omega <= 0.0 {
            return Err(KError::InvalidInput(
                "parilu_omega must be > 0 (relaxation factor)".to_string(),
            ));
        }

        if config.parilu_min_iters > config.parilu_max_iters {
            return Err(KError::InvalidInput(
                "parilu_min_iters cannot exceed parilu_max_iters".to_string(),
            ));
        }

        Ok(())
    }

    fn allow_parallel_factorization(&self, n: usize) -> bool {
        if !self.config.enable_parallel_factorization {
            return false;
        }
        #[cfg(feature = "rayon")]
        {
            if crate::algebra::parallel_cfg::force_serial() {
                return false;
            }
            if crate::parallel::threads::current_rayon_threads() <= 1 {
                return false;
            }
            let chunk_size = self.config.parallel_chunk_size.max(1);
            if chunk_size == 1 {
                return true;
            }
            let tune = crate::algebra::parallel_cfg::parallel_tune();
            return n >= tune.min_rows_ilu_factorization;
        }
        #[cfg(not(feature = "rayon"))]
        {
            true
        }
    }

    fn allow_parallel_triangular_solve(&self, n: usize) -> bool {
        if !self.config.enable_parallel_triangular_solve {
            return false;
        }
        #[cfg(feature = "rayon")]
        {
            if crate::algebra::parallel_cfg::force_serial() {
                return false;
            }
            if crate::parallel::threads::current_rayon_threads() <= 1 {
                return false;
            }
            let tune = crate::algebra::parallel_cfg::parallel_tune();
            return n >= tune.min_rows_ilu_triangular;
        }
        #[cfg(not(feature = "rayon"))]
        {
            false
        }
    }

    /// HYPRE-inspired IEEE safety checks
    fn check_ieee_values(matrix: &Mat<f64>) -> Result<(), KError> {
        for i in 0..matrix.nrows() {
            for j in 0..matrix.ncols() {
                let val = matrix[(i, j)];
                if val.is_nan() {
                    return Err(KError::InvalidInput(format!(
                        "NaN detected in matrix at position ({i}, {j})"
                    )));
                }
                if val.is_infinite() {
                    return Err(KError::InvalidInput(format!(
                        "Infinity detected in matrix at position ({i}, {j})"
                    )));
                }
            }
        }
        Ok(())
    }

    /// HYPRE-inspired matrix validation with enhanced analysis
    fn validate_matrix(matrix: &Mat<f64>) -> Result<(), KError> {
        if matrix.nrows() == 0 || matrix.ncols() == 0 {
            return Err(KError::InvalidInput("Matrix cannot be empty".to_string()));
        }

        if matrix.nrows() != matrix.ncols() {
            return Err(KError::InvalidInput(
                "ILU requires square matrices".to_string(),
            ));
        }

        Ok(())
    }

    /// Calculate setup complexity (HYPRE: operator_complexity)
    fn calculate_complexity(&self, original_nnz: usize) -> f64 {
        let total_nnz = self.nnz_l + self.nnz_u;
        if original_nnz > 0 {
            total_nnz as f64 / original_nnz as f64
        } else {
            0.0
        }
    }

    fn stabilize_pivot_value(
        pivot: &mut f64,
        row: usize,
        matrix: &Mat<f64>,
        policy: &PivotPolicy,
        max_diag_a: f64,
        row_inf_a: &[f64],
        row_gersh_a: &[f64],
        stats: &mut PivotStats,
        running_max: &mut f64,
        zero_pivots: &mut usize,
    ) -> Result<(), KError> {
        let s_i = match policy.scale {
            PivotScale::MaxDiagA => max_diag_a,
            PivotScale::LocalDiagA => matrix[(row, row)].abs(),
            PivotScale::RowInfA => row_inf_a[row],
            PivotScale::RowGershgorin => row_gersh_a[row],
            PivotScale::RunningMaxU => *running_max,
        }
        .max(max_diag_a);

        if let Err(e) =
            stabilize_pivot_in_place(pivot, s_i, policy.tau, policy.sign, policy.mode, stats, row)
        {
            *zero_pivots += 1;
            return Err(e);
        }

        let abs = pivot.abs();
        if abs > *running_max {
            *running_max = abs;
        }
        Ok(())
    }

    /// Pivot stabilization using configurable policy
    fn handle_pivot(
        &mut self,
        pivot: &mut f64,
        row: usize,
        matrix: &Mat<f64>,
    ) -> Result<(), KError> {
        let policy = &self.config.pivot_policy;

        // determine scaling value and guard against vanishing floors by
        // clamping to the global maximum diagonal magnitude
        Self::stabilize_pivot_value(
            pivot,
            row,
            matrix,
            policy,
            self.max_diag_a,
            &self.row_inf_a,
            &self.row_gersh_a,
            &mut self.pivot_stats,
            &mut self.running_max_u,
            &mut self.num_zero_pivots,
        )?;

        Ok(())
    }

    fn partition_rows(n: usize, block_size: usize) -> Vec<std::ops::Range<usize>> {
        let block_size = block_size.max(1);
        let mut blocks = Vec::new();
        let mut start = 0;
        while start < n {
            let end = (start + block_size).min(n);
            blocks.push(start..end);
            start = end;
        }
        blocks
    }

    fn extract_block_dense(matrix: &Mat<f64>, rows: std::ops::Range<usize>) -> Mat<f64> {
        let m = rows.len();
        let mut block = Mat::zeros(m, m);
        for local_i in 0..m {
            let global_i = rows.start + local_i;
            for local_j in 0..m {
                let global_j = rows.start + local_j;
                block[(local_i, local_j)] = matrix[(global_i, global_j)];
            }
        }
        block
    }

    #[cfg(feature = "rayon")]
    fn factor_block(
        matrix: &Mat<f64>,
        rows: std::ops::Range<usize>,
        policy: &PivotPolicy,
        max_diag: f64,
        row_inf: &[f64],
        row_gersh: &[f64],
    ) -> Result<BlockFactorResult, KError> {
        let block = Self::extract_block_dense(matrix, rows.clone());
        let drop_tol: f64 = 1e-15;
        let mut l = CsrMatrix::from_dense(&block, drop_tol)?;
        let mut u = CsrMatrix::from_dense(&block, drop_tol)?;
        let size = block.nrows();

        for i in 0..size {
            for j in 0..size {
                if i > j {
                    Self::sparse_set(&mut u, i, j, 0.0);
                } else if i < j {
                    Self::sparse_set(&mut l, i, j, 0.0);
                } else {
                    Self::sparse_set(&mut l, i, i, 1.0);
                }
            }
        }

        let mut stats = PivotStats::default();
        let mut running_max: f64 = 0.0;
        let mut zero_pivots = 0usize;

        for k in 0..size {
            let mut pivot = Self::sparse_get(&u, k, k);
            Self::stabilize_pivot_value(
                &mut pivot,
                rows.start + k,
                matrix,
                policy,
                max_diag,
                row_inf,
                row_gersh,
                &mut stats,
                &mut running_max,
                &mut zero_pivots,
            )?;
            Self::sparse_set(&mut u, k, k, pivot);

            for i in (k + 1)..size {
                let l_ik = Self::sparse_get(&l, i, k);
                if l_ik != 0.0 {
                    let multiplier = l_ik / pivot;
                    Self::sparse_set(&mut l, i, k, multiplier);
                    for j in (k + 1)..size {
                        let u_kj = Self::sparse_get(&u, k, j);
                        if u_kj != 0.0 {
                            let global_i = rows.start + i;
                            let global_j = rows.start + j;
                            if matrix[(global_i, global_j)] != 0.0 {
                                let u_ij = Self::sparse_get(&u, i, j);
                                let new_val = u_ij - multiplier * u_kj;
                                Self::sparse_set(&mut u, i, j, new_val);
                            }
                        }
                    }
                }
            }
        }

        Ok(BlockFactorResult {
            l,
            u,
            pivot_stats: stats,
            running_max,
            zero_pivots,
        })
    }

    /// Helper: Get element from sparse matrix (returns zero if not present)
    fn sparse_get(matrix: &CsrMatrix<f64>, i: usize, j: usize) -> f64 {
        let (cols, vals) = matrix.row(i);
        match cols.binary_search(&j) {
            Ok(pos) => vals[pos],
            Err(_) => 0.0,
        }
    }

    /// Helper: Set element in sparse matrix without changing structure.
    ///
    /// This routine assumes the sparsity pattern already contains the
    /// entry `(i, j)`.  If the entry is absent, the call is a no-op.
    fn sparse_set(matrix: &mut CsrMatrix<f64>, i: usize, j: usize, value: f64) {
        let start = matrix.row_ptr()[i];
        let end = matrix.row_ptr()[i + 1];
        // Determine position of column j within the row while holding only
        // an immutable borrow.
        let mut pos_in_row = None;
        {
            let cols = &matrix.col_idx()[start..end];
            if let Ok(off) = cols.binary_search(&j) {
                pos_in_row = Some(start + off);
            }
        }
        if let Some(p) = pos_in_row {
            let values = matrix.values_mut();
            values[p] = value;
        }
    }

    /// Compute ILU(0) factorization with enhanced pivot handling and sparse storage
    fn compute_ilu0(&mut self, matrix: &Mat<f64>) -> Result<(), KError> {
        let n = matrix.nrows();

        // Convert input matrix to sparse CSR format for L and U factors
        let drop_tol: f64 = 1e-15;
        let mut l = CsrMatrix::from_dense(matrix, drop_tol)?;
        let mut u = CsrMatrix::from_dense(matrix, drop_tol)?;

        // Initialize L as lower triangular with unit diagonal, U as upper triangular
        for i in 0..n {
            for j in 0..n {
                if i > j {
                    // L gets lower triangular part
                    Self::sparse_set(&mut u, i, j, 0.0);
                } else if i < j {
                    // U gets upper triangular part
                    Self::sparse_set(&mut l, i, j, 0.0);
                } else {
                    // L has unit diagonal
                    Self::sparse_set(&mut l, i, i, 1.0);
                }
            }
        }

        // HYPRE-style ILU(0) factorization
        for k in 0..n {
            // Enhanced pivot handling
            let mut pivot = Self::sparse_get(&u, k, k);
            self.handle_pivot(&mut pivot, k, matrix)?;
            Self::sparse_set(&mut u, k, k, pivot);

            for i in (k + 1)..n {
                let l_ik = Self::sparse_get(&l, i, k);
                if l_ik != 0.0 {
                    let multiplier = l_ik / pivot;
                    Self::sparse_set(&mut l, i, k, multiplier);

                    for j in (k + 1)..n {
                        let u_kj = Self::sparse_get(&u, k, j);
                        if u_kj != 0.0 && matrix[(i, j)] != 0.0 {
                            let u_ij = Self::sparse_get(&u, i, j);
                            let new_val = u_ij - multiplier * u_kj;
                            Self::sparse_set(&mut u, i, j, new_val);
                        }
                    }
                }
            }
        }

        // Calculate sparsity metrics
        self.nnz_l = l.nnz();
        self.nnz_u = u.nnz();

        // Cache inverse diagonal of U for fast solves
        self.inv_diag_u = u.diagonal().into_iter().map(|v| 1.0 / v).collect();

        self.l = l;
        self.u = u;

        Ok(())
    }

    #[cfg(feature = "rayon")]
    fn compute_ilu0_block_parallel(&mut self, matrix: &Mat<f64>) -> Result<(), KError> {
        let n = matrix.nrows();
        let chunk_size = self.config.parallel_chunk_size.max(1);
        let blocks = Self::partition_rows(n, chunk_size);

        let row_inf = Arc::new(self.row_inf_a.clone());
        let row_gersh = Arc::new(self.row_gersh_a.clone());
        let pivot_policy = Arc::new(self.config.pivot_policy.clone());
        let max_diag = self.max_diag_a;

        let results = Arc::new(Mutex::new(Vec::with_capacity(blocks.len())));
        let first_error = Arc::new(Mutex::new(None::<KError>));

        rayon::scope(|scope| {
            for rows in blocks.iter().cloned() {
                let results = Arc::clone(&results);
                let first_error = Arc::clone(&first_error);
                let row_inf = Arc::clone(&row_inf);
                let row_gersh = Arc::clone(&row_gersh);
                let policy = Arc::clone(&pivot_policy);
                let matrix = matrix;
                scope.spawn(move |_| {
                    if first_error.lock().unwrap().is_some() {
                        return;
                    }
                    match Self::factor_block(
                        matrix,
                        rows.clone(),
                        policy.as_ref(),
                        max_diag,
                        row_inf.as_ref(),
                        row_gersh.as_ref(),
                    ) {
                        Ok(res) => {
                            results.lock().unwrap().push((rows, res));
                        }
                        Err(e) => {
                            let mut guard = first_error.lock().unwrap();
                            if guard.is_none() {
                                *guard = Some(e);
                            }
                        }
                    }
                });
            }
        });

        if let Some(err) = first_error.lock().unwrap().take() {
            return Err(err);
        }

        let mut block_results = {
            let mut guard = results.lock().unwrap();
            std::mem::take(&mut *guard)
        };

        block_results.sort_by_key(|(range, _)| range.start);

        let mut merged_stats = PivotStats::default();
        let mut running_max: f64 = 0.0;
        let mut zero_pivots = 0usize;
        for (_, result) in block_results.iter() {
            merged_stats.num_floors += result.pivot_stats.num_floors;
            merged_stats.sum_abs_shift += result.pivot_stats.sum_abs_shift;
            merged_stats.num_strict_fail += result.pivot_stats.num_strict_fail;
            merged_stats.max_abs_shift = merged_stats
                .max_abs_shift
                .max(result.pivot_stats.max_abs_shift);
            merged_stats.last_floor_value = merged_stats
                .last_floor_value
                .max(result.pivot_stats.last_floor_value);
            running_max = running_max.max(result.running_max);
            zero_pivots += result.zero_pivots;
        }

        self.pivot_stats = merged_stats;
        self.running_max_u = running_max;
        self.num_zero_pivots = zero_pivots;

        self.assemble_block_diagonal(n, block_results)?;
        Ok(())
    }

    #[cfg(not(feature = "rayon"))]
    fn compute_ilu0_block_parallel(&mut self, matrix: &Mat<f64>) -> Result<(), KError> {
        #[cfg(feature = "logging")]
        if self.config.logging_level > 0 && self.config.enable_parallel_factorization {
            warn!(
                "ILU parallel factorization requested but 'rayon' feature disabled; falling back to serial ILU0"
            );
        }
        self.compute_ilu0(matrix)
    }

    #[cfg(feature = "rayon")]
    fn assemble_block_diagonal(
        &mut self,
        n: usize,
        block_results: Vec<(std::ops::Range<usize>, BlockFactorResult)>,
    ) -> Result<(), KError> {
        let mut l_row_ptr = Vec::with_capacity(n + 1);
        let mut u_row_ptr = Vec::with_capacity(n + 1);
        l_row_ptr.push(0);
        u_row_ptr.push(0);
        let mut l_cols = Vec::new();
        let mut l_vals = Vec::new();
        let mut u_cols = Vec::new();
        let mut u_vals = Vec::new();
        let mut inv_diag_u = vec![0.0; n];
        let mut nnz_l = 0;
        let mut nnz_u = 0;

        for (rows, block) in block_results {
            let size = rows.len();
            for local_i in 0..size {
                let global_i = rows.start + local_i;
                let (cols_l, vals_l) = block.l.row(local_i);
                for (&c, &v) in cols_l.iter().zip(vals_l.iter()) {
                    l_cols.push(rows.start + c);
                    l_vals.push(v);
                    nnz_l += 1;
                }
                l_row_ptr.push(nnz_l);

                let (cols_u, vals_u) = block.u.row(local_i);
                let mut diag_found = false;
                for (&c, &v) in cols_u.iter().zip(vals_u.iter()) {
                    u_cols.push(rows.start + c);
                    u_vals.push(v);
                    if c == local_i {
                        if v == 0.0 {
                            return Err(KError::InvalidInput(format!(
                                "zero diagonal detected in block row {global_i}"
                            )));
                        }
                        inv_diag_u[global_i] = 1.0 / v;
                        diag_found = true;
                    }
                    nnz_u += 1;
                }
                if !diag_found {
                    return Err(KError::InvalidInput(format!(
                        "block ILU lost diagonal entry at row {global_i}"
                    )));
                }
                u_row_ptr.push(nnz_u);
            }
        }

        self.l = CsrMatrix::from_csr(n, n, l_row_ptr, l_cols, l_vals);
        self.u = CsrMatrix::from_csr(n, n, u_row_ptr, u_cols, u_vals);
        self.inv_diag_u = inv_diag_u;
        self.nnz_l = nnz_l;
        self.nnz_u = nnz_u;

        Ok(())
    }

    /// Compute Modified ILU(0) with row-sum correction and sparse storage
    fn compute_milu0(&mut self, matrix: &Mat<f64>) -> Result<(), KError> {
        let n = matrix.nrows();

        // Convert input matrix to sparse CSR format
        let drop_tol: f64 = 1e-15;
        let mut l = CsrMatrix::from_dense(matrix, drop_tol)?;
        let mut u = CsrMatrix::from_dense(matrix, drop_tol)?;

        // Store original row sums for diagonal correction
        let mut original_row_sums = vec![0.0; n];
        for i in 0..n {
            for j in 0..n {
                original_row_sums[i] = original_row_sums[i] + matrix[(i, j)];
            }
        }

        // Initialize L as lower triangular with unit diagonal, U as upper triangular
        for i in 0..n {
            for j in 0..n {
                if i > j {
                    Self::sparse_set(&mut u, i, j, 0.0);
                } else if i < j {
                    Self::sparse_set(&mut l, i, j, 0.0);
                } else {
                    Self::sparse_set(&mut l, i, i, 1.0);
                }
            }
        }

        // MILU(0) factorization with row-sum preservation
        for k in 0..n {
            let mut pivot = Self::sparse_get(&u, k, k);
            self.handle_pivot(&mut pivot, k, matrix)?;
            Self::sparse_set(&mut u, k, k, pivot);

            for i in (k + 1)..n {
                let l_ik = Self::sparse_get(&l, i, k);
                if l_ik != 0.0 {
                    let multiplier = l_ik / pivot;
                    Self::sparse_set(&mut l, i, k, multiplier);

                    let mut dropped_sum = 0.0;
                    for j in (k + 1)..n {
                        let u_kj = Self::sparse_get(&u, k, j);
                        if u_kj != 0.0 {
                            let update = multiplier * u_kj;
                            if matrix[(i, j)] != 0.0 {
                                let u_ij = Self::sparse_get(&u, i, j);
                                Self::sparse_set(&mut u, i, j, u_ij - update);
                            } else {
                                dropped_sum = dropped_sum + update;
                            }
                        }
                    }
                    // Apply diagonal correction for this row
                    let u_ii = Self::sparse_get(&u, i, i);
                    Self::sparse_set(&mut u, i, i, u_ii + dropped_sum);
                }
            }
        }

        self.nnz_l = l.nnz();
        self.nnz_u = u.nnz();

        self.inv_diag_u = u.diagonal().into_iter().map(|v| 1.0 / v).collect();

        self.l = l;
        self.u = u;

        Ok(())
    }

    /// Compute ILU(k) factorization with level-of-fill control
    fn compute_iluk(&mut self, matrix: &Mat<f64>) -> Result<(), KError> {
        let n = matrix.nrows();
        let mut l = Mat::zeros(n, n);
        let mut u = Mat::zeros(n, n);

        // Level-of-fill tracking: level[i][j] = fill level of entry (i,j)
        let mut level = vec![vec![usize::MAX; n]; n];

        // Initialize levels for original nonzeros
        for i in 0..n {
            for j in 0..n {
                if matrix[(i, j)] != 0.0 {
                    level[i][j] = 0;
                    if i <= j {
                        u[(i, j)] = matrix[(i, j)];
                    } else {
                        l[(i, j)] = matrix[(i, j)];
                    }
                }
            }
            l[(i, i)] = 1.0; // Unit diagonal for L
        }

        // ILU(k) factorization with fill-level control
        for k in 0..n {
            let mut pivot = u[(k, k)];
            self.handle_pivot(&mut pivot, k, matrix)?;
            u[(k, k)] = pivot;

            for i in (k + 1)..n {
                if level[i][k] <= self.config.level_of_fill {
                    l[(i, k)] = l[(i, k)] / pivot;

                    for j in (k + 1)..n {
                        if level[k][j] <= self.config.level_of_fill {
                            let new_level =
                                level[i][k].saturating_add(level[k][j]).saturating_add(1);

                            if new_level <= self.config.level_of_fill {
                                let update = l[(i, k)] * u[(k, j)];
                                u[(i, j)] = u[(i, j)] - update;
                                level[i][j] = level[i][j].min(new_level);
                            }
                        }
                    }
                } else {
                    l[(i, k)] = 0.0; // Drop high-level fill
                }
            }
        }

        // Convert to sparse format and cache inverse diagonal
        let drop_tol: f64 = 1e-15;
        self.l = CsrMatrix::from_dense(&l, drop_tol)?;
        self.u = CsrMatrix::from_dense(&u, drop_tol)?;

        self.inv_diag_u = (0..n).map(|i| 1.0 / u[(i, i)]).collect();

        self.nnz_l = self.l.nnz();
        self.nnz_u = self.u.nnz();

        Ok(())
    }

    /// Compute ILUT factorization with threshold-based dropping
    fn compute_ilut(&mut self, matrix: &Mat<f64>) -> Result<(), KError> {
        let n = matrix.nrows();
        let mut l = Mat::zeros(n, n);
        let mut u = Mat::zeros(n, n);
        let drop_tol: f64 = self.config.drop_tolerance;

        // Initialize with matrix values above drop tolerance
        for i in 0..n {
            for j in 0..n {
                let val = matrix[(i, j)];
                if val.abs() >= drop_tol {
                    if i <= j {
                        u[(i, j)] = val;
                    } else {
                        l[(i, j)] = val;
                    }
                }
            }
            l[(i, i)] = 1.0; // Unit diagonal for L
        }

        // ILUT factorization with threshold dropping and fill control
        for k in 0..n {
            let mut pivot = u[(k, k)];
            self.handle_pivot(&mut pivot, k, matrix)?;
            u[(k, k)] = pivot;

            // Collect potential updates for this elimination step
            let mut updates = Vec::new();

            for i in (k + 1)..n {
                if l[(i, k)].abs() >= drop_tol {
                    l[(i, k)] = l[(i, k)] / pivot;

                    for j in (k + 1)..n {
                        if u[(k, j)].abs() >= drop_tol {
                            let update = l[(i, k)] * u[(k, j)];
                            updates.push((i, j, update));
                        }
                    }
                }
            }

            // Apply updates with threshold dropping
            for (i, j, update) in updates {
                let new_val = u[(i, j)] - update;
                if new_val.abs() >= drop_tol {
                    u[(i, j)] = new_val;
                } else {
                    u[(i, j)] = 0.0; // Drop small entries
                }
            }

            // Apply fill-in control per row if specified
            if self.config.max_fill_per_row > 0 {
                for i in (k + 1)..n {
                    self.apply_fill_control_to_row(&mut u, i, k + 1);
                }
            }
        }

        // Convert to sparse format and cache inverse diagonal
        let drop_tol: f64 = 1e-15;
        self.l = CsrMatrix::from_dense(&l, drop_tol)?;
        self.u = CsrMatrix::from_dense(&u, drop_tol)?;

        self.inv_diag_u = (0..n).map(|i| 1.0 / u[(i, i)]).collect();

        self.nnz_l = self.l.nnz();
        self.nnz_u = self.u.nnz();

        Ok(())
    }

    /// Apply fill-in control to a single row, keeping only the largest entries
    fn apply_fill_control_to_row(&self, matrix: &mut Mat<f64>, row: usize, start_col: usize) {
        if self.config.max_fill_per_row == 0 {
            return;
        }

        // Collect (magnitude, column, value) for this row
        let mut entries: Vec<(f64, usize, f64)> = Vec::new();
        for j in start_col..matrix.ncols() {
            let val = matrix[(row, j)];
            if val != 0.0 {
                entries.push((val.abs(), j, val));
            }
        }

        if entries.len() > self.config.max_fill_per_row {
            // Sort by magnitude (largest first)
            entries.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));

            // Zero out all entries first
            for j in start_col..matrix.ncols() {
                matrix[(row, j)] = 0.0;
            }

            // Keep only the largest entries
            for i in 0..self.config.max_fill_per_row.min(entries.len()) {
                let (_, j, val) = entries[i];
                matrix[(row, j)] = val;
            }
        }
    }

    /// Compute the LU product for entry (i, j) using the provided CSR slices.
    fn lu_row_product(
        &self,
        i: usize,
        j: usize,
        l_row_ptr: &[usize],
        l_col_idx: &[usize],
        l_vals: &[f64],
        u_row_ptr: &[usize],
        u_col_idx: &[usize],
        u_vals: &[f64],
    ) -> f64 {
        let limit = i.min(j);
        let start_l = l_row_ptr[i];
        let end_l = l_row_ptr[i + 1];
        let mut sum = 0.0;

        for idx in start_l..end_l {
            let k = l_col_idx[idx];
            if k >= limit {
                break;
            }
            let lik = l_vals[idx];
            let start_u = u_row_ptr[k];
            let end_u = u_row_ptr[k + 1];
            if let Ok(off) = u_col_idx[start_u..end_u].binary_search(&j) {
                let ukj = u_vals[start_u + off];
                sum = f64::mul_add(lik, ukj, sum);
            }
        }

        sum
    }

    /// Single ParILU sweep (serial), returning the Frobenius-norm residual.
    #[allow(clippy::too_many_arguments)]
    fn parilu_sweep_serial(
        &self,
        a: &CsrMatrix<f64>,
        l_row_ptr: &[usize],
        l_col_idx: &[usize],
        l_old: &[f64],
        u_row_ptr: &[usize],
        u_col_idx: &[usize],
        u_old: &[f64],
        l_new: &mut [f64],
        u_new: &mut [f64],
        omega: f64,
    ) -> Result<f64, KError> {
        let n = a.nrows();
        let mut res_sq = 0.0;

        let get_u_diag = |j: usize| -> f64 {
            let start = u_row_ptr[j];
            let end = u_row_ptr[j + 1];
            for idx in start..end {
                if u_col_idx[idx] == j {
                    return u_old[idx];
                }
            }
            0.0
        };

        for i in 0..n {
            let (a_cols, a_vals) = a.row(i);

            // Lower part (strict)
            let l_start = l_row_ptr[i];
            let l_end = l_row_ptr[i + 1];
            for idx in l_start..l_end {
                let j = l_col_idx[idx];
                if j >= i {
                    continue;
                }

                let a_ij = match a_cols.binary_search(&j) {
                    Ok(pos) => a_vals[pos],
                    Err(_) => 0.0,
                };

                let s_ij = self.lu_row_product(
                    i, j, l_row_ptr, l_col_idx, l_old, u_row_ptr, u_col_idx, u_old,
                );

                let r_ij = a_ij - s_ij;

                let u_jj = get_u_diag(j);
                if u_jj == 0.0 {
                    return Err(KError::ZeroPivot(j));
                }

                let lij_old = l_old[idx];
                let lij_new = (1.0 - omega) * lij_old + omega * (r_ij / u_jj);
                l_new[idx] = lij_new;

                res_sq += r_ij * r_ij;
            }

            // Upper part (including diagonal)
            let u_start = u_row_ptr[i];
            let u_end = u_row_ptr[i + 1];
            for idx in u_start..u_end {
                let j = u_col_idx[idx];
                if j < i {
                    continue;
                }

                let a_ij = match a_cols.binary_search(&j) {
                    Ok(pos) => a_vals[pos],
                    Err(_) => 0.0,
                };

                let s_ij = self.lu_row_product(
                    i, j, l_row_ptr, l_col_idx, l_old, u_row_ptr, u_col_idx, u_old,
                );

                let r_ij = a_ij - s_ij;

                let uij_old = u_old[idx];
                let uij_new = (1.0 - omega) * uij_old + omega * r_ij;
                u_new[idx] = uij_new;

                res_sq += r_ij * r_ij;
            }
        }

        Ok(res_sq.sqrt())
    }

    /// ParILU sweep with optional rayon parallelism.
    #[allow(clippy::too_many_arguments)]
    fn parilu_sweep(
        &self,
        a: &CsrMatrix<f64>,
        l_row_ptr: &[usize],
        l_col_idx: &[usize],
        l_old: &[f64],
        u_row_ptr: &[usize],
        u_col_idx: &[usize],
        u_old: &[f64],
        l_new: &mut [f64],
        u_new: &mut [f64],
        omega: f64,
    ) -> Result<f64, KError> {
        #[cfg(feature = "rayon")]
        {
            if self.config.enable_parallel_factorization {
                return self.parilu_sweep_parallel(
                    a, l_row_ptr, l_col_idx, l_old, u_row_ptr, u_col_idx, u_old, l_new, u_new,
                    omega,
                );
            }
        }
        self.parilu_sweep_serial(
            a, l_row_ptr, l_col_idx, l_old, u_row_ptr, u_col_idx, u_old, l_new, u_new, omega,
        )
    }

    /// Row-parallel ParILU sweep (rayon).
    #[cfg(feature = "rayon")]
    #[allow(clippy::too_many_arguments)]
    fn parilu_sweep_parallel(
        &self,
        a: &CsrMatrix<f64>,
        l_row_ptr: &[usize],
        l_col_idx: &[usize],
        l_old: &[f64],
        u_row_ptr: &[usize],
        u_col_idx: &[usize],
        u_old: &[f64],
        l_new: &mut [f64],
        u_new: &mut [f64],
        omega: f64,
    ) -> Result<f64, KError> {
        let n = a.nrows();

        // Safety: row_ptr partitions the value slices by row, so indices written by
        // different iterations are disjoint. We pass raw addresses as usize to
        // satisfy Sync bounds on the parallel closure.
        let l_ptr = l_new.as_mut_ptr() as usize;
        let u_ptr = u_new.as_mut_ptr() as usize;

        let res_sq: Result<f64, KError> = (0..n)
            .into_par_iter()
            .map(|i| {
                let (a_cols, a_vals) = a.row(i);
                let mut res_sq_i = 0.0;

                let l_start = l_row_ptr[i];
                let l_end = l_row_ptr[i + 1];
                let u_start = u_row_ptr[i];
                let u_end = u_row_ptr[i + 1];

                let get_u_diag = |j: usize| -> f64 {
                    let start = u_row_ptr[j];
                    let end = u_row_ptr[j + 1];
                    for idx in start..end {
                        if u_col_idx[idx] == j {
                            return u_old[idx];
                        }
                    }
                    0.0
                };

                for idx in l_start..l_end {
                    let j = l_col_idx[idx];
                    if j >= i {
                        continue;
                    }

                    let a_ij = match a_cols.binary_search(&j) {
                        Ok(pos) => a_vals[pos],
                        Err(_) => 0.0,
                    };

                    let s_ij = self.lu_row_product(
                        i, j, l_row_ptr, l_col_idx, l_old, u_row_ptr, u_col_idx, u_old,
                    );
                    let r_ij = a_ij - s_ij;

                    let u_jj = get_u_diag(j);
                    if u_jj == 0.0 {
                        return Err(KError::ZeroPivot(j));
                    }

                    let lij_old = l_old[idx];
                    let lij_new = (1.0 - omega) * lij_old + omega * (r_ij / u_jj);
                    unsafe { *(l_ptr as *mut f64).add(idx) = lij_new };

                    res_sq_i += r_ij * r_ij;
                }

                for idx in u_start..u_end {
                    let j = u_col_idx[idx];
                    if j < i {
                        continue;
                    }

                    let a_ij = match a_cols.binary_search(&j) {
                        Ok(pos) => a_vals[pos],
                        Err(_) => 0.0,
                    };

                    let s_ij = self.lu_row_product(
                        i, j, l_row_ptr, l_col_idx, l_old, u_row_ptr, u_col_idx, u_old,
                    );
                    let r_ij = a_ij - s_ij;

                    let uij_old = u_old[idx];
                    let uij_new = (1.0 - omega) * uij_old + omega * r_ij;
                    unsafe { *(u_ptr as *mut f64).add(idx) = uij_new };

                    res_sq_i += r_ij * r_ij;
                }

                Ok(res_sq_i)
            })
            .try_reduce(|| 0.0, |a, b| Ok(a + b));

        Ok(res_sq?.sqrt())
    }

    /// ParILU refinement over the fixed sparsity pattern of (L, U).
    fn parilu_refine(&mut self, a: &CsrMatrix<f64>) -> Result<(), KError> {
        if !self.config.parilu_enabled || self.config.parilu_max_iters == 0 {
            self.history = None;
            return Ok(());
        }

        let n = a.nrows();
        if n == 0 {
            self.history = None;
            return Ok(());
        }

        if a.ncols() != n || self.l.nrows() != n || self.u.nrows() != n {
            return Err(KError::InvalidInput(
                "ParILU requires square matrices with matching dimensions".to_string(),
            ));
        }

        let max_iters = self.config.parilu_max_iters;
        let min_iters = self.config.parilu_min_iters.min(max_iters);
        let omega = self.config.parilu_omega;
        let tol = self.config.parilu_tol;

        // Capture pattern
        let l_row_ptr = self.l.row_ptr().to_vec();
        let l_col_idx = self.l.col_idx().to_vec();
        let mut l_vals = self.l.values().to_vec();

        let u_row_ptr = self.u.row_ptr().to_vec();
        let u_col_idx = self.u.col_idx().to_vec();
        let mut u_vals = self.u.values().to_vec();

        let mut l_old = l_vals.clone();
        let mut u_old = u_vals.clone();

        let mut history = ParIluHistory::with_capacity(max_iters);

        for iter in 0..max_iters {
            let iter_start = std::time::Instant::now();
            let res = self.parilu_sweep(
                a,
                &l_row_ptr,
                &l_col_idx,
                &l_old,
                &u_row_ptr,
                &u_col_idx,
                &u_old,
                &mut l_vals,
                &mut u_vals,
                omega,
            )?;
            let iter_time = iter_start.elapsed().as_secs_f64();

            history.push(ParIluIterSample {
                iter: iter as u32,
                residual: res,
                time_s: iter_time,
            });

            if let Some(m) = &self.monitor {
                if let Some(sample) = history.as_slice().last() {
                    m.on_event(Event::IluSetupIter { sample });
                }
            }

            if iter + 1 >= min_iters && res < tol {
                break;
            }

            l_old.copy_from_slice(&l_vals);
            u_old.copy_from_slice(&u_vals);
        }

        self.l = CsrMatrix::from_csr(n, n, l_row_ptr, l_col_idx, l_vals);
        self.u = CsrMatrix::from_csr(n, n, u_row_ptr, u_col_idx, u_vals);
        self.inv_diag_u = self.u.diagonal().into_iter().map(|v| 1.0 / v).collect();
        self.nnz_l = self.l.nnz();
        self.nnz_u = self.u.nnz();
        self.history = Some(history);

        #[cfg(feature = "logging")]
        if self.config.logging_level > 0 {
            if let Some(last) = self.history.as_ref().and_then(|h| h.as_slice().last()) {
                let converged = last.residual < tol;
                info!(
                    "ParILU refinement: iterations={}, final residual {:.3e} (tol {:.3e}, converged={})",
                    self.history
                        .as_ref()
                        .map(|h| h.as_slice().len())
                        .unwrap_or(0),
                    last.residual,
                    tol,
                    converged
                );
            }
        }

        Ok(())
    }

    /// Setup consolidated workspace for efficient operations (zero-allocation goal)
    fn setup_workspace(&mut self, n: usize) {
        debug_assert_eq!(
            self.l.nrows(),
            n,
            "L dimension mismatch during workspace sizing"
        );
        debug_assert_eq!(
            self.u.nrows(),
            n,
            "U dimension mismatch during workspace sizing"
        );

        if self.config.optimize_workspace {
            // Ensure workspace is properly sized (avoids reallocation if already correct size)
            self.workspace.ensure_size(n);

            #[cfg(feature = "logging")]
            if self.config.logging_level > 1 {
                debug!("ILU: Workspace configured for {n} x {n} matrix");
            }
        } else {
            // Still allocate minimal workspace for correctness
            self.workspace.ensure_size(n);
        }
    }

    /// Exact sparse triangular solve operating in-place on the provided buffer.
    fn solve_triangular_exact(&self, lower: bool, x: &mut [S]) {
        #[cfg(feature = "rayon")]
        if self.allow_parallel_triangular_solve(x.len()) {
            if lower {
                self.solve_triangular_parallel_forward(x);
            } else {
                self.solve_triangular_parallel_backward(x);
            }
            return;
        }

        self.solve_triangular_exact_seq(lower, x);
    }

    #[inline]
    fn solve_triangular_exact_seq(&self, lower: bool, x: &mut [S]) {
        let n = x.len();
        if lower {
            // Forward substitution: L * x = b (unit diagonal)
            for i in 0..n {
                let mut sum = x[i];
                let (cols, vals) = self.l.row(i);
                for (&j, &val) in cols.iter().zip(vals.iter()) {
                    if j < i {
                        sum -= val * x[j];
                    }
                }
                x[i] = sum;
            }
        } else {
            // Backward substitution: U * x = b
            for i in (0..n).rev() {
                let mut sum = x[i];
                let (cols, vals) = self.u.row(i);
                for (&j, &val) in cols.iter().zip(vals.iter()) {
                    if j <= i {
                        continue;
                    }
                    sum -= val * x[j];
                }
                x[i] = sum * self.inv_diag_u[i];
            }
        }
    }

    #[cfg(feature = "rayon")]
    fn solve_triangular_parallel_forward(&self, x: &mut [S]) {
        if x.is_empty() {
            return;
        }

        let chunk_size = self.config.parallel_chunk_size.max(1);
        let levels = &self.levels_l;
        if levels.buckets.is_empty() {
            return;
        }

        for rows in &levels.buckets {
            if rows.is_empty() {
                continue;
            }

            let updates: Vec<(usize, S)> = {
                let x_ref: &[S] = &*x;
                rows.par_iter()
                    .with_min_len(chunk_size)
                    .map(|&i| {
                        let mut sum = x_ref[i];
                        let (cols, vals) = self.l.row(i);
                        for (&j, &val) in cols.iter().zip(vals.iter()) {
                            if j < i {
                                sum -= val * x_ref[j];
                            }
                        }
                        (i, sum)
                    })
                    .collect()
            };

            for (i, val) in updates {
                x[i] = val;
            }
        }
    }

    #[cfg(feature = "rayon")]
    fn solve_triangular_parallel_backward(&self, x: &mut [S]) {
        if x.is_empty() {
            return;
        }

        let chunk_size = self.config.parallel_chunk_size.max(1);
        let levels = &self.levels_u;
        if levels.buckets.is_empty() {
            return;
        }

        for ell in 0..=levels.max_level {
            let rows = &levels.buckets[ell as usize];
            if rows.is_empty() {
                continue;
            }

            let updates: Vec<(usize, S)> = {
                let x_ref: &[S] = &*x;
                rows.par_iter()
                    .with_min_len(chunk_size)
                    .map(|&i| {
                        let mut sum = x_ref[i];
                        let (cols, vals) = self.u.row(i);
                        for (&j, &val) in cols.iter().zip(vals.iter()) {
                            if j <= i {
                                continue;
                            }
                            sum -= val * x_ref[j];
                        }
                        (i, sum * self.inv_diag_u[i])
                    })
                    .collect()
            };

            for (i, val) in updates {
                x[i] = val;
            }
        }
    }

    /// HYPRE-style iterative triangular solve with Jacobi and sparse access
    fn solve_triangular_jacobi(&self, lower: bool, b: &[S], x: &mut [S]) {
        let n = b.len();
        let num_iters = if lower {
            self.config.lower_jacobi_iters
        } else {
            self.config.upper_jacobi_iters
        };

        // Initialize
        x.copy_from_slice(b);

        for _iter in 0..num_iters {
            if lower {
                // Jacobi iteration for L * x = b
                for i in 0..n {
                    let mut sum = S::zero();
                    let (cols, vals) = self.l.row(i);
                    for (&j, &val) in cols.iter().zip(vals.iter()) {
                        if j < i {
                            sum = sum + val * x[j];
                        }
                    }
                    x[i] = b[i] - sum; // L has unit diagonal
                }
            } else {
                // Jacobi iteration for U * x = b
                for i in (0..n).rev() {
                    let mut sum = S::zero();
                    let (cols, vals) = self.u.row(i);
                    for (&j, &val) in cols.iter().zip(vals.iter()) {
                        if j > i {
                            sum = sum + val * x[j];
                        }
                    }
                    x[i] = (b[i] - sum) * self.inv_diag_u[i];
                }
            }
        }
    }

    /// HYPRE-style iterative triangular solve with Gauss-Seidel and sparse access
    fn solve_triangular_gauss_seidel(&self, lower: bool, b: &[S], x: &mut [S]) {
        let n = b.len();
        let num_iters = if lower {
            self.config.lower_jacobi_iters
        } else {
            self.config.upper_jacobi_iters
        };

        // Initialize
        x.copy_from_slice(b);

        for _iter in 0..num_iters {
            if lower {
                // Gauss-Seidel for L * x = b (forward sweep using updated values)
                for i in 0..n {
                    let mut sum = S::zero();
                    let (cols, vals) = self.l.row(i);
                    for (&j, &val) in cols.iter().zip(vals.iter()) {
                        if j < i {
                            sum = sum + val * x[j];
                        }
                    }
                    x[i] = b[i] - sum; // L has unit diagonal
                }
            } else {
                // Gauss-Seidel for U * x = b (backward sweep using updated values)
                for i in (0..n).rev() {
                    let mut sum = S::zero();
                    let (cols, vals) = self.u.row(i);
                    for (&j, &val) in cols.iter().zip(vals.iter()) {
                        if j > i {
                            sum = sum + val * x[j];
                        }
                    }
                    x[i] = (b[i] - sum) * self.inv_diag_u[i];
                }
            }
        }
    }

    /// Get factorization statistics (HYPRE-style diagnostics)
    pub fn get_stats(&self) -> IluStats {
        let (total_ns, count, _) = self.solve_ctrs.snapshot();
        let avg = if count == 0 {
            0.0
        } else {
            (total_ns as f64) / (count as f64) / 1e9
        };
        IluStats {
            setup_complexity: self.setup_complexity,
            nnz_l: self.nnz_l,
            nnz_u: self.nnz_u,
            num_zero_pivots: self.num_zero_pivots,
            setup_time: self.setup_time,
            solve_time: avg,
            solve_count: count as usize,
        }
    }

    /// Access pivot handling statistics
    pub fn pivot_stats(&self) -> &PivotStats {
        &self.pivot_stats
    }
}

#[cfg(not(feature = "complex"))]
impl Ilu {
    /// Create specialized ILU preconditioners that leverage existing implementations.
    ///
    /// - `IluType::ILUK`: uses classical ILU(p) (`Ilup`) for level-of-fill control.
    /// - `IluType::ILUT`: uses the unified `Ilu` implementation (`compute_ilut`).
    /// - Other variants: also use the unified `Ilu` implementation.
    pub fn create_specialized(
        config: IluConfig,
    ) -> Result<Box<dyn Preconditioner<Mat<S>, Vec<S>>>, KError> {
        match config.ilu_type {
            IluType::ILUK => {
                // Use the dedicated ILUP implementation for better performance
                let ilup = crate::preconditioner::ilup::Ilup::new(config.level_of_fill);
                Ok(Box::new(ilup))
            }
            IluType::ILUT | _ => {
                // Use the canonical Ilu implementation, including ILUT.
                let ilu = Ilu::new_with_config(config)?;
                Ok(Box::new(ilu))
            }
        }
    }

    /// Quick factory method for common ILU configurations
    pub fn create_quick(ilu_type: IluType, fill_or_drop: Real) -> Result<Self, KError> {
        let mut config = IluConfig::default();
        config.ilu_type = ilu_type;

        match ilu_type {
            IluType::ILUK => {
                config.level_of_fill = fill_or_drop as usize;
            }
            IluType::ILUT => {
                config.drop_tolerance = fill_or_drop;
                config.max_fill_per_row = 20; // Reasonable default
            }
            _ => {}
        }

        Self::new_with_config(config)
    }
}

/// ILU factorization statistics (HYPRE-inspired)
#[derive(Debug, Clone)]
pub struct IluStats {
    /// Setup complexity (total_nnz / original_nnz)
    pub setup_complexity: f64,
    /// Nonzeros in L factor
    pub nnz_l: usize,
    /// Nonzeros in U factor
    pub nnz_u: usize,
    /// Number of zero pivots encountered
    pub num_zero_pivots: usize,
    /// Setup time in seconds
    pub setup_time: f64,
    /// Average solve time in seconds
    pub solve_time: f64,
    /// Number of solves performed
    pub solve_count: usize,
}

impl Default for Ilu {
    fn default() -> Self {
        Self::new()
    }
}

impl Preconditioner<Mat<f64>, Vec<f64>> for Ilu {
    /// HYPRE-inspired setup with comprehensive safety checks and monitoring
    fn setup(&mut self, matrix: &Mat<f64>) -> Result<(), KError> {
        let setup_start = std::time::Instant::now();

        if let Some(m) = &self.monitor {
            m.on_event(Event::IluSetupBegin { opts_hash: 0 });
        }

        // HYPRE-style validation and safety checks
        Self::validate_matrix(matrix)?;

        if self.config.ieee_checks {
            Self::check_ieee_values(matrix)?;

            #[cfg(feature = "logging")]
            if self.config.logging_level > 0 {
                info!("ILU: IEEE safety checks passed");
            }
        }

        let mut conditioned = None;
        let matrix = if self.config.conditioning.is_active() {
            let mut local = matrix.clone();
            apply_dense_transforms("ILU", &mut local, &self.config.conditioning)?;
            conditioned = Some(local);
            conditioned.as_ref().unwrap()
        } else {
            matrix
        };

        let n = matrix.nrows();
        let a_csr = CsrMatrix::from_dense(matrix, 1e-15)?;
        let original_nnz = a_csr.nnz();

        #[cfg(feature = "logging")]
        print_ilu_banner(&self.config);

        // Precompute scaling terms for pivoting
        let mut max_diag: Real = Real::default();
        self.row_inf_a.resize(n, Real::default());
        self.row_gersh_a.resize(n, Real::default());
        for i in 0..n {
            let mut row_inf: Real = Real::default();
            let mut row_gersh = matrix[(i, i)].abs();
            for j in 0..n {
                let val_abs = matrix[(i, j)].abs();
                if j != i {
                    row_gersh = row_gersh + val_abs;
                }
                if val_abs > row_inf {
                    row_inf = val_abs;
                }
            }
            self.row_inf_a[i] = row_inf;
            self.row_gersh_a[i] = row_gersh;
            max_diag = max_diag.max(matrix[(i, i)].abs());
        }
        self.max_diag_a = max_diag;
        self.running_max_u = Real::default();
        self.pivot_stats = PivotStats::default();
        self.history = None;

        #[cfg(feature = "logging")]
        if self.config.logging_level > 0 {
            info!("ILU Setup: {n} x {n} matrix with {original_nnz} nonzeros");
            debug!("ILU: Using {:?} factorization type", self.config.ilu_type);
        }

        let mut parilu_iters = 0u32;
        let mut parilu_converged = true;

        // Perform factorization based on type
        let par_mode = if self.allow_parallel_factorization(n) {
            self.config.par_factor_mode()
        } else {
            ParFactorizationMode::Serial
        };

        match (self.config.ilu_type, par_mode) {
            (IluType::ILU0, ParFactorizationMode::Serial) => {
                self.compute_ilu0(matrix)?;
            }
            (IluType::ILU0, ParFactorizationMode::Block)
            | (IluType::ILU0, ParFactorizationMode::ParIlu) => {
                self.compute_ilu0_block_parallel(matrix)?;
            }
            (IluType::MILU0, _) => {
                self.compute_milu0(matrix)?;
            }
            (IluType::ILUK, _) => {
                self.compute_iluk(matrix)?;
            }
            (IluType::ILUT, _) => {
                self.compute_ilut(matrix)?;
            }
            _ => {
                return Err(KError::NotImplemented(format!(
                    "ILU type {:?} not yet implemented",
                    self.config.ilu_type
                )));
            }
        }

        if self.config.parilu_enabled && self.config.parilu_max_iters > 0 {
            self.parilu_refine(&a_csr)?;
            if let Some(hist) = &self.history {
                parilu_iters = hist.as_slice().len() as u32;
                parilu_converged = hist
                    .as_slice()
                    .last()
                    .map(|s| s.residual < self.config.parilu_tol)
                    .unwrap_or(false);
            }
        } else {
            self.history = None;
        }

        // Setup workspace for iterative solves (must match factor dimensions)
        self.setup_workspace(n);

        // Calculate metrics
        self.setup_complexity = self.calculate_complexity(original_nnz);
        self.setup_time = setup_start.elapsed().as_secs_f64();

        #[cfg(feature = "rayon")]
        if self.allow_parallel_triangular_solve(n) {
            self.levels_l = build_levels_lower(&self.l);
            self.levels_u = build_levels_upper(&self.u);
        }

        #[cfg(feature = "logging")]
        if self.config.logging_level > 0 {
            info!(
                "ILU Setup Complete: complexity={:.2}, L_nnz={}, U_nnz={}, setup_time={:.3}s",
                self.setup_complexity, self.nnz_l, self.nnz_u, self.setup_time
            );

            if let Some(hist) = &self.history {
                if let Some(last) = hist.as_slice().last() {
                    info!(
                        "ParILU refinement: sweeps={}, final residual {:.2e} (tol {:.2e})",
                        hist.as_slice().len(),
                        last.residual,
                        self.config.parilu_tol
                    );
                }
            }

            debug!(
                "Pivot floors: {} (max shift {:.3e})",
                self.pivot_stats.num_floors, self.pivot_stats.max_abs_shift
            );

            if self.num_zero_pivots > 0 {
                warn!(
                    "ILU: {} zero pivots encountered during factorization",
                    self.num_zero_pivots
                );
            }

            if self.config.print_level > 0 {
                println!(
                    "ILU Setup: {} -> {} nonzeros (complexity: {:.2})",
                    original_nnz,
                    self.nnz_l + self.nnz_u,
                    self.setup_complexity
                );
            }
        }
        if let Some(m) = &self.monitor {
            m.on_event(Event::IluSetupEnd {
                iters: parilu_iters,
                converged: parilu_converged,
                setup_time_s: self.setup_time,
            });
        }

        Ok(())
    }

    /// HYPRE-inspired apply with configurable triangular solves and zero-allocation workspace.
    /// No heap allocations occur after `setup()` has completed.
    fn apply(&self, side: PcSide, x: &Vec<f64>, y: &mut Vec<f64>) -> Result<(), KError> {
        self.apply_slice(side, x.as_slice(), y.as_mut_slice())
    }
}

impl Ilu {
    fn apply_slice(&self, _side: PcSide, x: &[f64], y: &mut [f64]) -> Result<(), KError> {
        let n = self.l.nrows();
        if x.len() != n || y.len() != n {
            return Err(KError::InvalidInput(format!(
                "Vector length mismatch: expected {}, got x={} y={}",
                n,
                x.len(),
                y.len(),
            )));
        }

        let _timer = SolveTimer::start(&self.solve_ctrs);

        match self.config.triangular_solve {
            TriSolveType::Exact => {
                // single copy mandated by API
                y.copy_from_slice(x);
                self.solve_triangular_exact(true, y);
                self.solve_triangular_exact(false, y);
            }
            TriSolveType::Jacobi => {
                let mut buf = self.workspace.borrow_solve_buf(n);
                self.solve_triangular_jacobi(true, x, &mut buf[..n]);
                self.solve_triangular_jacobi(false, &buf[..n], y);
            }
            TriSolveType::GaussSeidel => {
                let mut buf = self.workspace.borrow_solve_buf(n);
                self.solve_triangular_gauss_seidel(true, x, &mut buf[..n]);
                self.solve_triangular_gauss_seidel(false, &buf[..n], y);
            }
        }

        #[cfg(feature = "logging")]
        if self.config.logging_level > 2 {
            let _solve_time = _timer.elapsed().as_secs_f64();
            trace!(
                "ILU Apply: solve_time={:.6}s, workspace_size={}",
                _solve_time, self.workspace.size
            );
        }

        Ok(())
    }
}

impl TriangularSolve<f64> for Ilu {
    fn solve_lower_in_place(&self, x: &mut [f64]) -> Result<(), KError> {
        self.solve_triangular_exact(true, x);
        Ok(())
    }

    fn solve_upper_in_place(&self, x: &mut [f64]) -> Result<(), KError> {
        self.solve_triangular_exact(false, x);
        Ok(())
    }
}

impl Ilu {
    pub fn parilu_history(&self) -> Option<&[ParIluIterSample]> {
        self.history.as_ref().map(|h| h.as_slice())
    }

    pub fn set_monitor(&mut self, m: Option<Box<dyn Monitor>>) {
        self.monitor = m;
    }
}

impl LocalPreconditioner<f64> for Ilu {
    fn dims(&self) -> (usize, usize) {
        (self.l.nrows(), self.l.ncols())
    }

    fn apply_local(&self, x: &[S], y: &mut [S]) -> Result<(), KError> {
        let (n, _) = LocalPreconditioner::<f64>::dims(self);
        debug_assert_eq!(x.len(), n);
        debug_assert_eq!(y.len(), n);
        self.apply_slice(PcSide::Left, x, y)
    }
}

#[cfg(feature = "complex")]
impl KPreconditioner for Ilu {
    type Scalar = GlobalScalar;

    #[inline]
    fn dims(&self) -> (usize, usize) {
        LocalPreconditioner::<f64>::dims(self)
    }

    fn apply_s(
        &self,
        side: PcSide,
        x: &[GlobalScalar],
        y: &mut [GlobalScalar],
        scratch: &mut BridgeScratch,
    ) -> Result<(), KError> {
        let (rows, cols) = LocalPreconditioner::<f64>::dims(self);
        let n = x.len();
        if x.len() != y.len() || rows != n || cols != n {
            return Err(KError::InvalidInput(format!(
                "Ilu::apply_s dimension mismatch: expected {}x{}, got x.len()={} y.len()={}",
                rows,
                cols,
                x.len(),
                y.len()
            )));
        }

        scratch.with_pair(n, |xr, yr| {
            copy_scalar_to_real_in(x, xr);
            self.apply_slice(side, xr, yr)?;
            copy_real_into_scalar(yr, y);
            Ok(())
        })
    }

    fn apply_mut_s(
        &mut self,
        side: PcSide,
        x: &[GlobalScalar],
        y: &mut [GlobalScalar],
        scratch: &mut BridgeScratch,
    ) -> Result<(), KError> {
        KPreconditioner::apply_s(self, side, x, y, scratch)
    }

    fn on_restart_s(
        &mut self,
        _outer_iter: usize,
        _residual_norm: <GlobalScalar as KrystScalar>::Real,
    ) -> Result<(), KError> {
        Ok(())
    }
}

/// Legacy ILU(0) type alias for backward compatibility
pub type Ilu0 = Ilu;

#[cfg(test)]
mod tests {
    use super::{Ilu, IluBuilder, IluConfig, IluType, TriSolveType};
    use crate::algebra::parallel::par_sum_abs2_local;
    use crate::algebra::prelude::*;
    use crate::error::KError;
    use crate::preconditioner::PcSide;
    use crate::preconditioner::legacy::Preconditioner;
    #[cfg(not(feature = "complex"))]
    use rand::{Rng, SeedableRng, rngs::StdRng};

    #[cfg(feature = "rayon")]
    use rayon::prelude::*;
    #[cfg(feature = "rayon")]
    use std::sync::Arc;

    fn make_spd_3x3() -> faer::Mat<S> {
        // A = [[4, 1, 0],
        //      [1, 3, 1],
        //      [0, 1, 2]]
        faer::Mat::from_fn(3, 3, |i, j| match (i, j) {
            (0, 0) => S::from_real(4.0),
            (0, 1) | (1, 0) => S::from_real(1.0),
            (1, 1) => S::from_real(3.0),
            (1, 2) | (2, 1) => S::from_real(1.0),
            (2, 2) => S::from_real(2.0),
            _ => S::zero(),
        })
    }

    fn mat_vec_mul(a: &faer::Mat<S>, x: &[S]) -> Vec<S> {
        let mut out = vec![S::zero(); a.nrows()];
        for i in 0..a.nrows() {
            let mut acc = S::zero();
            for j in 0..a.ncols() {
                acc = acc + a[(i, j)] * x[j];
            }
            out[i] = acc;
        }
        out
    }

    #[cfg(not(feature = "complex"))]
    fn mat_vec_mul_inplace(a: &faer::Mat<S>, x: &[S], y: &mut [S]) {
        for i in 0..a.nrows() {
            let mut acc = S::zero();
            for j in 0..a.ncols() {
                acc = acc + a[(i, j)] * x[j];
            }
            y[i] = acc;
        }
    }

    #[cfg(not(feature = "complex"))]
    fn random_spd(n: usize, seed: u64) -> faer::Mat<S> {
        let mut rng = StdRng::seed_from_u64(seed);
        let mut a = faer::Mat::zeros(n, n);
        for i in 0..n {
            for j in 0..=i {
                let v = rng.gen_range(-1.0..1.0);
                a[(i, j)] = S::from_real(v);
                a[(j, i)] = S::from_real(v);
            }
        }
        for i in 0..n {
            a[(i, i)] = a[(i, i)] + S::from_real(n as f64 + 1.0);
        }
        a
    }

    #[cfg(not(feature = "complex"))]
    fn cg_unpreconditioned(a: &faer::Mat<S>, b: &[S], x0: &[S], max_iter: usize) -> (usize, f64) {
        let n = b.len();
        let mut x = x0.to_vec();
        let mut r = vec![S::zero(); n];
        let mut p = vec![S::zero(); n];
        let mut ap = vec![S::zero(); n];

        mat_vec_mul_inplace(a, &x, &mut r);
        for i in 0..n {
            r[i] = b[i] - r[i];
        }
        p.copy_from_slice(&r);
        let mut rr = dot(&r, &r);

        let mut iters = 0;
        while iters < max_iter && rr > 1e-20 {
            mat_vec_mul_inplace(a, &p, &mut ap);
            let denom = dot(&p, &ap);
            if denom.abs() < 1e-30 {
                break;
            }
            let alpha = rr / denom;
            for i in 0..n {
                x[i] = x[i] + alpha * p[i];
                r[i] = r[i] - alpha * ap[i];
            }
            let rr_new = dot(&r, &r);
            if rr_new.sqrt() < 1e-10 {
                rr = rr_new;
                iters += 1;
                break;
            }
            let beta = rr_new / rr;
            for i in 0..n {
                p[i] = r[i] + beta * p[i];
            }
            rr = rr_new;
            iters += 1;
        }

        (iters, rr.sqrt())
    }

    #[cfg(not(feature = "complex"))]
    fn cg_left_preconditioned(
        a: &faer::Mat<S>,
        pc: &Ilu,
        b: &[S],
        x0: &[S],
        max_iter: usize,
    ) -> (usize, f64) {
        let n = b.len();
        let mut x = x0.to_vec();
        let mut r = vec![S::zero(); n];
        let mut z = vec![S::zero(); n];
        let mut p = vec![S::zero(); n];
        let mut ap = vec![S::zero(); n];

        mat_vec_mul_inplace(a, &x, &mut r);
        for i in 0..n {
            r[i] = b[i] - r[i];
        }
        pc.apply(PcSide::Left, &r, &mut z).expect("pc apply");
        p.copy_from_slice(&z);
        let mut rz = dot(&r, &z);

        let mut iters = 0;
        while iters < max_iter && rz.abs() > 1e-20 {
            mat_vec_mul_inplace(a, &p, &mut ap);
            let denom = dot(&p, &ap);
            if denom.abs() < 1e-30 {
                break;
            }
            let alpha = rz / denom;
            for i in 0..n {
                x[i] = x[i] + alpha * p[i];
                r[i] = r[i] - alpha * ap[i];
            }
            let r_norm = dot(&r, &r).sqrt();
            if r_norm < 1e-10 {
                rz = r_norm;
                iters += 1;
                break;
            }
            pc.apply(PcSide::Left, &r, &mut z).expect("pc apply");
            let rz_new = dot(&r, &z);
            let beta = rz_new / rz;
            for i in 0..n {
                p[i] = z[i] + beta * p[i];
            }
            rz = rz_new;
            iters += 1;
        }

        (iters, dot(&r, &r).sqrt())
    }

    #[cfg(not(feature = "complex"))]
    fn dot(x: &[S], y: &[S]) -> f64 {
        x.iter().zip(y.iter()).map(|(&a, &b)| a * b).sum()
    }

    #[cfg(feature = "rayon")]
    fn make_tridiag_matrix(n: usize) -> faer::Mat<f64> {
        faer::Mat::from_fn(n, n, |i, j| {
            if i == j {
                4.0
            } else if (i as isize - j as isize).abs() == 1 {
                -1.0
            } else {
                0.0
            }
        })
    }

    #[test]
    fn test_ilu_default_creation() {
        let ilu = Ilu::new();
        assert_eq!(ilu.config.ilu_type, IluType::ILU0);
    }

    #[test]
    fn test_ilu_builder() {
        let ilu = IluBuilder::new()
            .ilu_type(IluType::ILUT)
            .drop_tolerance(1e-6)
            .enable_logging()
            .build()
            .unwrap();

        assert_eq!(ilu.config.ilu_type, IluType::ILUT);
        assert_eq!(ilu.config.drop_tolerance, 1e-6);
        assert_eq!(ilu.config.logging_level, 1);
    }

    #[test]
    fn test_ilu_config_validation() {
        let mut config = IluConfig::default();
        config.drop_tolerance = -1.0;

        let result = Ilu::new_with_config(config);
        assert!(result.is_err());
    }

    #[test]
    fn test_ilu0_simple_matrix() {
        let matrix = faer::Mat::from_fn(3, 3, |i, j| {
            if i == j {
                4.0
            } else if (i as i32 - j as i32).abs() == 1 {
                -1.0
            } else {
                0.0
            }
        });

        let mut ilu = Ilu::new();
        use crate::preconditioner::legacy::Preconditioner;
        let result = ilu.setup(&matrix);
        assert!(result.is_ok());

        let stats = ilu.get_stats();
        assert!(stats.setup_complexity > 0.0);
        assert_eq!(stats.num_zero_pivots, 0);
    }

    #[test]
    fn test_enhanced_pivot_handling() {
        let matrix = faer::Mat::from_fn(3, 3, |i, j| {
            if i == j && i == 1 {
                1e-15 // Very small pivot
            } else if i == j {
                1.0
            } else {
                0.0
            }
        });

        // Test pivot policy with default settings
        let config = IluConfig::default();
        let mut ilu = Ilu::new_with_config(config).unwrap();
        use crate::preconditioner::legacy::Preconditioner;
        let result = ilu.setup(&matrix);
        assert!(result.is_ok());
        assert!(ilu.pivot_stats().num_floors > 0);
    }

    #[cfg(not(feature = "complex"))]
    #[test]
    fn test_ilu_variants() {
        let _matrix = faer::Mat::from_fn(3, 3, |i, j| {
            if i == j {
                4.0
            } else if (i as i32 - j as i32).abs() == 1 {
                -1.0
            } else {
                0.0
            }
        });

        // Test ILU(k)
        let ilu_k = Ilu::create_quick(IluType::ILUK, 1.0).unwrap();
        assert_eq!(ilu_k.config.ilu_type, IluType::ILUK);
        assert_eq!(ilu_k.config.level_of_fill, 1);

        // Test ILUT
        let ilu_t = Ilu::create_quick(IluType::ILUT, 1e-6).unwrap();
        assert_eq!(ilu_t.config.ilu_type, IluType::ILUT);
        assert_eq!(ilu_t.config.drop_tolerance, 1e-6);
    }

    #[test]
    fn test_triangular_solve_options() {
        let matrix = faer::Mat::from_fn(2, 2, |i, j| if i == j { 2.0 } else { 0.0 });

        // Test Gauss-Seidel solve
        let mut config = IluConfig::default();
        config.triangular_solve = TriSolveType::GaussSeidel;
        config.lower_jacobi_iters = 2;
        config.upper_jacobi_iters = 2;

        let mut ilu = Ilu::new_with_config(config).unwrap();
        use crate::preconditioner::legacy::Preconditioner;
        let result = ilu.setup(&matrix);
        assert!(result.is_ok());
    }

    #[cfg(not(feature = "complex"))]
    #[test]
    fn test_specialized_factory() {
        let config = IluConfig {
            ilu_type: IluType::ILUK,
            level_of_fill: 2,
            ..Default::default()
        };

        let ilu_box = Ilu::create_specialized(config);
        assert!(ilu_box.is_ok());
    }

    #[test]
    fn test_parallel_configuration() {
        let ilu = IluBuilder::new()
            .enable_parallel()
            .parallel_chunk_size(128)
            .build()
            .unwrap();

        assert!(ilu.config.enable_parallel_factorization);
        assert!(ilu.config.enable_parallel_triangular_solve);
        assert_eq!(ilu.config.parallel_chunk_size, 128);
    }

    #[test]
    fn test_workspace_optimization() {
        let matrix = faer::Mat::from_fn(3, 3, |i, j| {
            if i == j {
                4.0
            } else if (i as i32 - j as i32).abs() == 1 {
                -1.0
            } else {
                0.0
            }
        });

        let mut ilu = IluBuilder::new().ilu_type(IluType::ILU0).build().unwrap();

        use crate::preconditioner::legacy::Preconditioner;
        let result = ilu.setup(&matrix);
        assert!(result.is_ok());

        // Workspace should be allocated if optimization is enabled
        assert!(ilu.workspace.size > 0);

        // Test that apply works with consolidated workspace
        let x = vec![1.0, 2.0, 3.0];
        let mut y = vec![0.0; 3];
        use crate::preconditioner::PcSide;
        let apply_result = ilu.apply(PcSide::Left, &x, &mut y);
        assert!(apply_result.is_ok());
    }

    #[cfg(feature = "rayon")]
    #[test]
    fn test_parallel_factorization() {
        let matrix = faer::Mat::from_fn(10, 10, |i, j| {
            if i == j {
                4.0
            } else if (i as i32 - j as i32).abs() == 1 {
                -1.0
            } else {
                0.0
            }
        });

        let mut ilu_serial = IluBuilder::new().ilu_type(IluType::ILU0).build().unwrap();

        let mut ilu_parallel = IluBuilder::new()
            .ilu_type(IluType::ILU0)
            .enable_parallel_factorization()
            .parallel_chunk_size(2) // Small chunk size to force parallel execution
            .build()
            .unwrap();

        use crate::preconditioner::legacy::Preconditioner;

        let serial_result = ilu_serial.setup(&matrix);
        assert!(serial_result.is_ok());

        let parallel_result = ilu_parallel.setup(&matrix);
        assert!(parallel_result.is_ok());

        let serial_stats = ilu_serial.get_stats();
        let parallel_stats = ilu_parallel.get_stats();

        assert!(
            parallel_stats.nnz_l <= serial_stats.nnz_l,
            "block ILU should not increase L nz count: serial={} parallel={}",
            serial_stats.nnz_l,
            parallel_stats.nnz_l
        );
        assert!(
            parallel_stats.nnz_u <= serial_stats.nnz_u,
            "block ILU should not increase U nz count: serial={} parallel={}",
            serial_stats.nnz_u,
            parallel_stats.nnz_u
        );
    }

    #[cfg(feature = "rayon")]
    #[test]
    fn block_ilu_single_block_matches_serial() {
        let n = 6;
        let matrix = make_tridiag_matrix(n);

        let mut ilu_serial = IluBuilder::new().ilu_type(IluType::ILU0).build().unwrap();
        ilu_serial.setup(&matrix).unwrap();

        let mut config = IluConfig::default();
        config.enable_parallel_factorization = true;
        config.parallel_chunk_size = n;
        let mut ilu_block = Ilu::new_with_config(config).unwrap();
        ilu_block.setup(&matrix).unwrap();

        let serial_stats = ilu_serial.get_stats();
        let block_stats = ilu_block.get_stats();
        assert_eq!(
            block_stats.nnz_l, serial_stats.nnz_l,
            "single-block parallel should match serial L pattern"
        );
        assert_eq!(
            block_stats.nnz_u, serial_stats.nnz_u,
            "single-block parallel should match serial U pattern"
        );

        let rhs: Vec<f64> = (0..n).map(|i| (i + 1) as f64).collect();
        let mut y_serial = vec![0.0; n];
        let mut y_block = vec![0.0; n];
        ilu_serial.apply(PcSide::Left, &rhs, &mut y_serial).unwrap();
        ilu_block.apply(PcSide::Left, &rhs, &mut y_block).unwrap();

        for (&a, &b) in y_serial.iter().zip(y_block.iter()) {
            assert!((a - b).abs() < 1e-12, "serial {} vs block {}", a, b);
        }
    }

    #[cfg(feature = "rayon")]
    #[test]
    fn block_ilu_chunk_size_one_is_diag() {
        let n = 5;
        let matrix = make_tridiag_matrix(n);

        let mut config = IluConfig::default();
        config.enable_parallel_factorization = true;
        config.parallel_chunk_size = 1;
        let mut ilu_block = Ilu::new_with_config(config).unwrap();
        ilu_block.setup(&matrix).unwrap();

        let stats = ilu_block.get_stats();
        assert_eq!(stats.nnz_l, n);
        assert_eq!(stats.nnz_u, n);

        let rhs = vec![1.0; n];
        let mut sol = vec![0.0; n];
        ilu_block.apply(PcSide::Left, &rhs, &mut sol).unwrap();
        for &val in sol.iter() {
            assert!(!val.is_nan());
        }
    }

    #[cfg(feature = "rayon")]
    #[test]
    fn triangular_parallel_matches_sequential() {
        let matrix = make_tridiag_matrix(5);

        let mut cfg_seq = IluConfig::default();
        cfg_seq.triangular_solve = TriSolveType::Exact;
        cfg_seq.enable_parallel_triangular_solve = false;

        let mut cfg_par = IluConfig::default();
        cfg_par.triangular_solve = TriSolveType::Exact;
        cfg_par.enable_parallel_triangular_solve = true;
        cfg_par.parallel_chunk_size = 1;

        let mut ilu_seq = Ilu::new_with_config(cfg_seq).unwrap();
        ilu_seq.setup(&matrix).unwrap();

        let mut ilu_par = Ilu::new_with_config(cfg_par).unwrap();
        ilu_par.setup(&matrix).unwrap();

        let n = matrix.nrows();
        let x: Vec<f64> = (0..n).map(|i| (i + 1) as f64).collect();

        let mut y_seq = vec![0.0; n];
        let mut y_par = vec![0.0; n];

        ilu_seq.apply(PcSide::Left, &x, &mut y_seq).unwrap();
        ilu_par.apply(PcSide::Left, &x, &mut y_par).unwrap();

        for (&seq_val, &par_val) in y_seq.iter().zip(y_par.iter()) {
            assert!(
                (seq_val - par_val).abs() < 1e-12,
                "seq={seq_val}, par={par_val} for n={n}"
            );
        }
    }

    #[cfg(feature = "rayon")]
    #[test]
    fn triangular_parallel_matches_sequential_large() {
        let n = 64;
        let matrix = make_tridiag_matrix(n);

        let mut cfg_seq = IluConfig::default();
        cfg_seq.triangular_solve = TriSolveType::Exact;
        cfg_seq.enable_parallel_triangular_solve = false;

        let mut cfg_par = IluConfig::default();
        cfg_par.triangular_solve = TriSolveType::Exact;
        cfg_par.enable_parallel_triangular_solve = true;
        cfg_par.parallel_chunk_size = 16;

        let mut ilu_seq = Ilu::new_with_config(cfg_seq).unwrap();
        ilu_seq.setup(&matrix).unwrap();

        let mut ilu_par = Ilu::new_with_config(cfg_par).unwrap();
        ilu_par.setup(&matrix).unwrap();

        let x: Vec<f64> = (0..n).map(|i| (i + 1) as f64).collect();

        let mut y_seq = vec![0.0; n];
        let mut y_par = vec![0.0; n];

        ilu_seq.apply(PcSide::Left, &x, &mut y_seq).unwrap();
        ilu_par.apply(PcSide::Left, &x, &mut y_par).unwrap();

        for (&seq_val, &par_val) in y_seq.iter().zip(y_par.iter()) {
            assert!(
                (seq_val - par_val).abs() < 1e-10,
                "seq={seq_val}, par={par_val} for n={n}"
            );
        }
    }

    #[test]
    fn test_distributed_configuration() {
        let ilu = IluBuilder::new().enable_distributed().build().unwrap();

        assert!(ilu.config.enable_distributed);
    }

    #[test]
    #[cfg(not(feature = "complex"))]
    fn ilu0_real_factorization_solves_spd() {
        let matrix = make_spd_3x3();
        let x_true = vec![S::from_real(1.0), S::from_real(2.0), S::from_real(-1.0)];
        let b = mat_vec_mul(&matrix, &x_true);

        let mut ilu = Ilu::new();
        ilu.setup(&matrix).expect("ILU(0) setup");

        let mut x = vec![S::zero(); b.len()];
        ilu.apply(PcSide::Left, &b, &mut x).expect("ILU(0) apply");

        let r: Vec<S> = mat_vec_mul(&matrix, &x)
            .into_iter()
            .zip(b.iter())
            .map(|(ax, &bi)| ax - bi)
            .collect();
        let res_norm = par_sum_abs2_local(&r).sqrt();
        assert!(
            res_norm.real() < 1e-10,
            "residual too large: {:?}",
            res_norm
        );
    }

    #[cfg(not(feature = "complex"))]
    #[test]
    fn ilu0_improves_cg_for_random_spd() {
        let n = 20;
        let a = random_spd(n, 12345);
        let b = vec![S::from_real(1.0); n];
        let x0 = vec![S::zero(); n];

        let (iters_unpre, res_unpre) = cg_unpreconditioned(&a, &b, &x0, 200);

        let mut ilu = Ilu::new();
        ilu.setup(&a).expect("ilu0 setup");
        let (iters_pc, res_pc) = cg_left_preconditioned(&a, &ilu, &b, &x0, 200);

        assert!(
            res_pc < res_unpre * 0.5 || iters_pc < iters_unpre,
            "preconditioned res={res_pc} iters={iters_pc}, baseline res={res_unpre} iters={iters_unpre}"
        );
    }

    #[test]
    fn parilu_refines_residual() {
        let matrix = faer::Mat::<f64>::from_fn(3, 3, |i, j| match (i, j) {
            (0, 0) => 4.0,
            (0, 1) | (1, 0) => 1.0,
            (1, 1) => 3.0,
            (1, 2) | (2, 1) => 1.0,
            (2, 2) => 2.0,
            _ => 0.0,
        });
        let rhs = vec![1.0f64; 3];

        let mut baseline = Ilu::new();
        baseline.setup(&matrix).expect("baseline ILU");
        let mut y_base = vec![0.0; 3];
        baseline
            .apply(PcSide::Left, &rhs, &mut y_base)
            .expect("apply baseline");
        let res_base = {
            let mut sum = 0.0;
            for i in 0..matrix.nrows() {
                let mut ax = 0.0;
                for j in 0..matrix.ncols() {
                    ax += matrix[(i, j)] * y_base[j];
                }
                let r = ax - rhs[i];
                sum += r * r;
            }
            sum.sqrt()
        };

        let mut cfg = IluConfig::default();
        cfg.parilu_enabled = true;
        cfg.parilu_max_iters = 5;
        cfg.parilu_min_iters = 1;
        cfg.parilu_tol = 1e-8;
        let mut parilu = Ilu::new_with_config(cfg).unwrap();
        parilu.setup(&matrix).expect("parilu ILU");
        let mut y_parilu = vec![0.0; 3];
        parilu
            .apply(PcSide::Left, &rhs, &mut y_parilu)
            .expect("apply parilu");
        let res_parilu = {
            let mut sum = 0.0;
            for i in 0..matrix.nrows() {
                let mut ax = 0.0;
                for j in 0..matrix.ncols() {
                    ax += matrix[(i, j)] * y_parilu[j];
                }
                let r = ax - rhs[i];
                sum += r * r;
            }
            sum.sqrt()
        };

        assert!(
            res_parilu <= res_base + 1e-10,
            "ParILU should not worsen residual (baseline {res_base}, parilu {res_parilu})"
        );

        let hist = parilu.parilu_history().unwrap();
        assert!(!hist.is_empty());
    }

    #[test]
    fn ilu_rejects_nan_inf_when_ieee_checks_enabled() {
        let mut a = faer::Mat::zeros(3, 3);
        a[(0, 0)] = 1.0;
        a[(1, 1)] = f64::NAN;
        a[(2, 2)] = f64::INFINITY;

        let mut cfg = IluConfig::default();
        cfg.ieee_checks = true;
        let mut ilu = Ilu::new_with_config(cfg).unwrap();
        let err = ilu.setup(&a).unwrap_err();
        match err {
            KError::InvalidInput(msg) => {
                assert!(
                    msg.contains("NaN") || msg.contains("Infinity"),
                    "unexpected message: {msg}"
                )
            }
            other => panic!("expected InvalidInput, got {other:?}"),
        }
    }

    #[cfg(feature = "rayon")]
    #[test]
    fn parilu_parallel_matches_serial_small() {
        let n = 8;
        let matrix = make_tridiag_matrix(n);

        let mut cfg_serial = IluConfig::default();
        cfg_serial.parilu_enabled = true;
        cfg_serial.parilu_max_iters = 3;
        cfg_serial.parilu_tol = 0.0;

        let mut cfg_par = cfg_serial.clone();
        cfg_par.enable_parallel_factorization = true;

        let mut ilu_serial = Ilu::new_with_config(cfg_serial).unwrap();
        ilu_serial.setup(&matrix).unwrap();
        let mut ilu_par = Ilu::new_with_config(cfg_par).unwrap();
        ilu_par.setup(&matrix).unwrap();

        let h_serial = ilu_serial.parilu_history().unwrap();
        let h_par = ilu_par.parilu_history().unwrap();
        assert_eq!(h_serial.len(), h_par.len());
        let last_serial = h_serial.last().unwrap().residual;
        let last_par = h_par.last().unwrap().residual;
        assert!(
            (last_serial - last_par).abs() < 1e-6,
            "serial {last_serial} vs parallel {last_par}"
        );
    }

    #[cfg(feature = "rayon")]
    #[test]
    fn parallel_triangular_stress() {
        let n = 1000;
        let matrix = make_tridiag_matrix(n);

        let mut ilu = IluBuilder::new()
            .ilu_type(IluType::ILU0)
            .enable_parallel_triangular_solve()
            .build()
            .unwrap();
        ilu.setup(&matrix).unwrap();

        let rhs = Arc::new(vec![1.0; n]);
        let ilu = Arc::new(ilu);

        (0..100).into_par_iter().for_each(|_| {
            let mut y = vec![0.0; n];
            ilu.apply(PcSide::Left, &*rhs, &mut y).unwrap();
            assert!(y.iter().all(|v| v.is_finite()));
        });
    }
}

#[cfg(all(test, feature = "complex"))]
mod tests_complex_bridge {
    use super::Ilu;
    use crate::algebra::bridge::BridgeScratch;
    use crate::algebra::scalar::KrystScalar;
    use crate::algebra::scalar::S as GlobalScalar;
    use crate::ops::kpc::KPreconditioner;
    use crate::preconditioner::PcSide;
    use crate::preconditioner::legacy::Preconditioner as LegacyPc;
    use faer::Mat;

    #[test]
    fn apply_s_matches_real_path_for_ilu() {
        let matrix = Mat::from_fn(2, 2, |i, j| match (i, j) {
            (0, 0) => 4.0,
            (0, 1) | (1, 0) => 1.0,
            (1, 1) => 3.0,
            _ => 0.0,
        });

        let mut ilu = Ilu::new();
        LegacyPc::setup(&mut ilu, &matrix).expect("ilu setup");

        let rhs_real = vec![1.0f64, 2.0];
        let mut out_real = vec![0.0; rhs_real.len()];
        LegacyPc::apply(&ilu, PcSide::Left, &rhs_real, &mut out_real).expect("ilu real apply");

        let rhs_s: Vec<GlobalScalar> = rhs_real
            .iter()
            .copied()
            .map(GlobalScalar::from_real)
            .collect();
        let mut out_s = vec![GlobalScalar::zero(); rhs_s.len()];
        let mut scratch = BridgeScratch::default();
        ilu.apply_s(PcSide::Left, &rhs_s, &mut out_s, &mut scratch)
            .expect("ilu apply_s");

        for (ys, &yr) in out_s.iter().zip(out_real.iter()) {
            assert!((ys.real() - yr).abs() < 1e-12);
        }
    }
}

/// Benchmarking module for measuring allocation costs and performance
#[cfg(test)]
pub mod benchmarks {
    use super::*;
    use std::time::Instant;

    /// Memory allocation tracking for benchmarks
    #[derive(Debug, Default)]
    pub struct AllocationStats {
        pub total_allocations: usize,
        pub total_bytes: usize,
        pub peak_memory: usize,
        pub solve_allocations: usize,
    }

    /// Benchmark ILU factorization performance on sparse matrices
    pub fn benchmark_ilu_factorization(
        matrix_size: usize,
        nnz_per_row: usize,
    ) -> (f64, AllocationStats) {
        // Create a sparse test matrix (tridiagonal with random values)
        let matrix = create_sparse_test_matrix(matrix_size, nnz_per_row);

        let start = Instant::now();
        let mut ilu = IluBuilder::new()
            .ilu_type(IluType::ILU0)
            .enable_parallel_factorization()
            .build()
            .unwrap();

        // Setup phase (should have minimal allocations after workspace is set up)
        let setup_result = ilu.setup(&matrix);
        let factorization_time = start.elapsed().as_secs_f64();

        assert!(setup_result.is_ok());

        let stats = AllocationStats {
            total_allocations: 1, // Simplified for demo - in real impl would track actual allocations
            total_bytes: matrix_size * matrix_size * 8, // Estimate
            peak_memory: matrix_size * matrix_size * 8,
            solve_allocations: 0,
        };

        (factorization_time, stats)
    }

    /// Benchmark solve phase to ensure zero allocations
    pub fn benchmark_ilu_solve_phase(
        matrix_size: usize,
        num_solves: usize,
    ) -> (f64, AllocationStats) {
        let matrix = create_sparse_test_matrix(matrix_size, 3);

        let mut ilu = IluBuilder::new().ilu_type(IluType::ILU0).build().unwrap();

        ilu.setup(&matrix).unwrap();

        let rhs = vec![1.0; matrix_size];
        let mut solution = vec![0.0; matrix_size];

        // Warm up
        ilu.apply(PcSide::Left, &rhs, &mut solution).unwrap();

        let start = Instant::now();
        for _ in 0..num_solves {
            // This should have ZERO allocations after the first solve
            ilu.apply(PcSide::Left, &rhs, &mut solution).unwrap();
        }
        let solve_time = start.elapsed().as_secs_f64();

        let stats = AllocationStats {
            total_allocations: 0, // Goal: zero allocations during solve phase
            total_bytes: 0,
            peak_memory: matrix_size * 16, // Workspace only
            solve_allocations: 0,          // Critical: must be zero
        };

        (solve_time, stats)
    }

    /// Create a sparse test matrix for benchmarking
    fn create_sparse_test_matrix(size: usize, nnz_per_row: usize) -> faer::Mat<f64> {
        let mut matrix = faer::Mat::zeros(size, size);

        for i in 0..size {
            // Diagonal entry
            matrix[(i, i)] = 4.0;

            // Off-diagonal entries (banded structure)
            let mut count = 1; // Already have diagonal
            for offset in 1..=(nnz_per_row / 2) {
                if i >= offset && count < nnz_per_row {
                    matrix[(i, i - offset)] = -1.0;
                    count += 1;
                }
                if i + offset < size && count < nnz_per_row {
                    matrix[(i, i + offset)] = -1.0;
                    count += 1;
                }
            }
        }

        matrix
    }

    /// Performance comparison: dense vs sparse storage
    pub fn benchmark_storage_comparison(matrix_size: usize) -> (f64, f64, usize, usize) {
        let matrix = create_sparse_test_matrix(matrix_size, 5);

        // Dense storage benchmark
        let start = Instant::now();
        let mut ilu_dense = IluBuilder::new().ilu_type(IluType::ILU0).build().unwrap();
        ilu_dense.setup(&matrix).unwrap();
        let dense_time = start.elapsed().as_secs_f64();

        let dense_memory = matrix_size * matrix_size * 8 * 2; // L and U matrices
        let sparse_memory = ilu_dense.nnz_l * 8 + ilu_dense.nnz_u * 8; // Actual sparse storage

        // Sparse storage is already implemented in our enhanced version
        let sparse_time = dense_time; // Same algorithm, different storage

        (dense_time, sparse_time, dense_memory, sparse_memory)
    }

    #[test]
    fn test_benchmark_small_matrix() {
        let (factorization_time, stats) = benchmark_ilu_factorization(100, 5);
        println!("Factorization time: {:.6}s", factorization_time);
        println!("Memory stats: {:?}", stats);
        assert!(factorization_time > 0.0);
    }

    #[test]
    fn test_benchmark_solve_phase() {
        let (solve_time, stats) = benchmark_ilu_solve_phase(50, 100);
        println!("Solve time for 100 solves: {:.6}s", solve_time);
        println!("Solve allocation stats: {:?}", stats);
        assert!(solve_time > 0.0);
        assert_eq!(stats.solve_allocations, 0); // Critical: no allocations during solve
    }

    #[test]
    fn test_storage_comparison() {
        let (dense_time, sparse_time, dense_mem, sparse_mem) = benchmark_storage_comparison(50);
        println!("Dense: {:.6}s, {}KB", dense_time, dense_mem / 1024);
        println!("Sparse: {:.6}s, {}KB", sparse_time, sparse_mem / 1024);
        assert!(sparse_mem < dense_mem); // Sparse should use less memory
    }
}