// kryst 3.2.1 - Docs.rs

//! SuperLU_DIST distributed sparse direct solver.
//!
//! This module provides a Rust-native implementation inspired by SuperLU_DIST for solving
//! large sparse linear systems using distributed LU factorization with partial pivoting.
//! It is designed for distributed memory parallel machines and can handle very large sparse
//! systems that would be intractable for serial direct methods.
//!
//! # Key Improvements (Based on Audit Feedback)
//!
//! ## Distributed Matrix Assembly
//! - Added proper global pattern assembly for symbolic factorization
//! - Implemented distributed MMD ordering with global graph construction
//! - Fixed block-cyclic distribution edge cases
//!
//! ## Symbolic Factorization
//! - Replaced naive nested loops with elimination tree reachability analysis
//! - Added proper fill-in estimation using graph algorithms
//! - Improved memory efficiency for large sparse patterns
//!
//! ## Residual Computation
//! - Fixed distributed residual norms with proper MPI reduction framework
//! - Added convergence checks that work correctly in parallel environments
//! - Improved error handling for singular matrices
//!
//! ## Memory Management
//! - Added vector cleanup to prevent unlimited memory growth
//! - Improved temporary vector reuse with size-based optimization
//! - Better handling of workspace allocation patterns
//!
//! # Usage
//! The solver follows the standard Kryst `LinearSolver` interface and is primarily intended
//! for use with distributed sparse matrices in MPI environments. For small to medium problems
//! or serial computation, consider using the dense direct solvers instead.
//!
//! ```rust,ignore
//! use kryst::solver::superlu_dist::{SuperLuDistSolver, SuperLuDistBuilder};
//! use kryst::parallel::UniverseComm;
//!
//! // Create solver with improved options
//! let solver = SuperLuDistBuilder::new()
//!     .with_column_permutation(ColumnPermutation::MmdAta)
//!     .with_distributed_ordering(true)  // Use distributed MMD
//!     .with_diagonal_pivot_threshold(0.01)
//!     .with_iterative_refinement(IterativeRefinement::Double)
//!     .build();
//!
//! // Solve distributed system
//! let mut x = vec![0.0; matrix.nrows()];
//! solver.solve(&matrix, None, &rhs, &mut x, &comm, None, None)?;
//! ```
//!
//! # Implementation Notes
//! This implementation is inspired by HYPRE's SuperLU_DIST wrapper and PETSc's distributed
//! solvers, but adapted for Rust and the Kryst ecosystem. It uses 2D process grids for
//! optimal data distribution and supports various factorization options for different
//! problem types.
//!
//! ## Current Limitations
//! - Panel factorization uses Rust implementation (can be accelerated with BLAS/LAPACK)
//! - 3D factorization and look-ahead algorithms are not yet implemented
//! - ParMETIS integration is stubbed (ready for C library binding)
//!
//! # References
//! - Li, X.S., & Demmel, J.W. (2003). SuperLU_DIST: A scalable distributed-memory sparse direct solver for unsymmetric linear systems. ACM Trans. Math. Softw., 29(2), 110-140.
//! - HYPRE SuperLU_DIST interface: hypre_SLUDistSetup, hypre_SLUDistSolve, hypre_SLUDistDestroy
//! - PETSc MATSUPERLU_DIST implementation

use crate::error::KError;
use crate::solver::MonitorCallback;
use crate::matrix::sparse::CsrMatrix;
use crate::parallel::{Comm, UniverseComm, contiguous_partition};
use crate::solver::api::Solver;
use crate::solver::legacy::LinearSolver;
use crate::utils::convergence::{ConvergedReason, SolveStats};
use faer::MatMut;
use faer::prelude::*;
use std::cmp::Ordering;
use std::collections::HashMap;
use mpi::collective::CommunicatorCollectives;

#[cfg(feature = "mpi")]
use mpi::raw::AsRaw;
#[cfg(feature = "mpi")]
use std::ffi::c_void;
#[cfg(feature = "mpi")]
use std::mem::MaybeUninit;

#[cfg(feature = "logging")]
use crate::utils::profiling::StageGuard;

/// Checks the structural invariants of a local CSR matrix.
///
/// The checks run in a fixed order and only the first violation is reported:
/// 1. `row_ptr` has exactly `nrows + 1` entries,
/// 2. `row_ptr[0]` is zero,
/// 3. `row_ptr` is nondecreasing,
/// 4. the last `row_ptr` entry equals both `col_idx.len()` and `values.len()`,
/// 5. every stored column index is smaller than `ncols`.
///
/// # Errors
/// Returns [`KError::InvalidInput`] describing the first failed check.
#[allow(dead_code)]
fn validate_local_csr(m: &CsrMatrix<f64>) -> Result<(), KError> {
    let nrows = m.nrows();
    let row_ptr = m.row_ptr();
    let col_idx = m.col_idx();
    let values = m.values();

    let expected_len = nrows + 1;
    if row_ptr.len() != expected_len {
        return Err(KError::InvalidInput(format!(
            "CSR row_ptr length {} != nrows+1 = {}",
            row_ptr.len(),
            expected_len
        )));
    }

    // The length check above guarantees at least one entry.
    if row_ptr[0] != 0 {
        return Err(KError::InvalidInput("CSR row_ptr[0] must be 0".into()));
    }

    // Each adjacent pair (row_ptr[k], row_ptr[k + 1]) delimits row k.
    if let Some(k) = row_ptr.windows(2).position(|pair| pair[0] > pair[1]) {
        return Err(KError::InvalidInput(format!(
            "CSR row_ptr not nondecreasing at row {k}"
        )));
    }

    let nnz = row_ptr[nrows];
    if nnz != col_idx.len() || nnz != values.len() {
        return Err(KError::InvalidInput(format!(
            "CSR nnz mismatch: row_ptr last={}, col_idx={}, values={}",
            nnz,
            col_idx.len(),
            values.len()
        )));
    }

    let ncols = m.ncols();
    for (row, pair) in row_ptr.windows(2).enumerate() {
        for pos in pair[0]..pair[1] {
            let col = col_idx[pos];
            if col >= ncols {
                return Err(KError::InvalidInput(format!(
                    "CSR col index {} out of range (ncols={}) at local row {} pos {}",
                    col, ncols, row, pos
                )));
            }
        }
    }
    Ok(())
}

/// 2D process grid for distributed SuperLU operations
///
/// Describes the shape of the grid (`prows` x `pcols`) together with the
/// position of the calling process inside it. The type is a plain value: it
/// holds no communicator and cloning it copies six integers.
#[derive(Debug, Clone)]
pub struct ProcessGrid {
    /// Number of process rows in the grid
    pub prows: usize,
    /// Number of process columns in the grid
    pub pcols: usize,
    /// Current process row index (0 to prows-1)
    pub my_prow: usize,
    /// Current process column index (0 to pcols-1)
    pub my_pcol: usize,
    /// Global rank of this process
    pub my_rank: usize,
    /// Total number of processes
    pub total_procs: usize,
}

impl ProcessGrid {
    /// Builds a grid over every rank of `comm`, picking the most nearly square
    /// shape whose area equals the communicator size.
    ///
    /// # Errors
    /// Returns [`KError::InvalidInput`] when the communicator reports zero
    /// processes, or when [`Self::new_with_dims`] rejects the chosen shape.
    pub fn new_auto(comm: &UniverseComm) -> Result<Self, KError> {
        match comm.size() {
            0 => Err(KError::InvalidInput(
                "Process grid requires at least one process".into(),
            )),
            nprocs => {
                let (prows, pcols) = Self::determine_optimal_grid(nprocs);
                Self::new_with_dims(comm, prows, pcols)
            }
        }
    }

    /// Builds a `prows` x `pcols` grid and locates the calling rank in it.
    ///
    /// Ranks are placed row by row: rank `r` sits at row `r / pcols`,
    /// column `r % pcols`.
    ///
    /// # Errors
    /// Returns [`KError::InvalidInput`] when `prows * pcols` differs from the
    /// communicator size.
    pub fn new_with_dims(comm: &UniverseComm, prows: usize, pcols: usize) -> Result<Self, KError> {
        let total_procs = comm.size();
        let my_rank = comm.rank();

        let shape_matches = prows * pcols == total_procs;
        if !shape_matches {
            return Err(KError::InvalidInput(format!(
                "Process grid {prows}x{pcols} doesn't match MPI size {total_procs}"
            )));
        }

        Ok(Self {
            prows,
            pcols,
            my_prow: my_rank / pcols,
            my_pcol: my_rank % pcols,
            my_rank,
            total_procs,
        })
    }

    /// Returns `(prows, pcols)` with `prows * pcols == size`, where `prows` is
    /// the largest divisor of `size` not exceeding `floor(sqrt(size))`.
    ///
    /// # Panics
    /// Panics with a division by zero when `size` is 0.
    #[allow(dead_code)]
    fn determine_optimal_grid(size: usize) -> (usize, usize) {
        let root = (size as f64).sqrt().floor() as usize;
        // Walk downwards from the square root; 1 always divides, so the search
        // only comes back empty when `size` (and therefore `root`) is zero.
        let prows = (1..=root)
            .rev()
            .find(|candidate| size % candidate == 0)
            .unwrap_or(0);
        (prows, size / prows)
    }

    /// Grid coordinates `(prow, pcol)` of a linear rank.
    pub fn rank_to_coords(&self, rank: usize) -> (usize, usize) {
        let prow = rank / self.pcols;
        let pcol = rank % self.pcols;
        (prow, pcol)
    }

    /// Linear rank of the process at `(prow, pcol)`.
    pub fn coords_to_rank(&self, prow: usize, pcol: usize) -> usize {
        let row_offset = prow * self.pcols;
        row_offset + pcol
    }

    /// Whether this process row owns `global_row` when rows are dealt out
    /// cyclically in blocks of `block_size`.
    pub fn owns_global_row(&self, global_row: usize, block_size: usize) -> bool {
        let owner_prow = (global_row / block_size) % self.prows;
        owner_prow == self.my_prow
    }

    /// Whether this process column owns `global_col` when columns are dealt
    /// out cyclically in blocks of `block_size`.
    pub fn owns_global_col(&self, global_col: usize, block_size: usize) -> bool {
        let owner_pcol = (global_col / block_size) % self.pcols;
        owner_pcol == self.my_pcol
    }
}

#[cfg(feature = "superlu3d")]
/// Simple 3D process grid built from stacking 2D grids
///
/// The ranks of an existing 2D grid are cut into `pdepth` consecutive layers
/// of `total_procs / pdepth` ranks each. `prows` and `pcols` are copied from
/// the 2D grid unchanged, so `prows * pcols` equals `total_procs`, not the
/// size of one layer.
#[derive(Debug, Clone)]
pub struct ProcessGrid3D {
    /// Process rows of the originating 2D grid
    pub prows: usize,
    /// Process columns of the originating 2D grid (also the row stride inside a layer)
    pub pcols: usize,
    /// Number of layers
    pub pdepth: usize,
    /// Row of this process inside its layer
    pub my_prow: usize,
    /// Column of this process inside its layer
    pub my_pcol: usize,
    /// Layer index of this process (0 to pdepth-1)
    pub my_pdepth: usize,
    /// Global rank of this process
    pub my_rank: usize,
    /// Total number of processes across all layers
    pub total_procs: usize,
}

#[cfg(feature = "superlu3d")]
impl ProcessGrid3D {
    /// Splits the ranks of `g2d` into `depth` equally sized layers and locates
    /// the calling rank: layer `rank / layer_size`, then row-major coordinates
    /// (with `g2d.pcols` columns) of `rank % layer_size` inside that layer.
    ///
    /// # Errors
    /// Returns [`KError::InvalidInput`] when `depth` is zero or does not divide
    /// the total number of processes.
    pub fn from_2d_with_depth(g2d: &ProcessGrid, depth: usize) -> Result<Self, KError> {
        let total = g2d.total_procs;
        if depth == 0 || total % depth != 0 {
            return Err(KError::InvalidInput(format!("invalid 3D depth {depth}")));
        }
        let layer_size = total / depth;
        let my_layer = g2d.my_rank / layer_size;
        let my_inlayer = g2d.my_rank % layer_size;
        let prow = my_inlayer / g2d.pcols;
        let pcol = my_inlayer % g2d.pcols;
        Ok(Self {
            prows: g2d.prows,
            pcols: g2d.pcols,
            pdepth: depth,
            my_prow: prow,
            my_pcol: pcol,
            my_pdepth: my_layer,
            my_rank: g2d.my_rank,
            total_procs: total,
        })
    }

    /// Linear rank of the process at `(prow, pcol)` in layer `pdepth`.
    ///
    /// This is the inverse of the decomposition done by
    /// [`Self::from_2d_with_depth`]: a layer holds `total_procs / pdepth`
    /// ranks. Using `prows * pcols` as the layer size would be wrong here
    /// because those fields describe the whole 2D grid, which made every
    /// layer beyond the first map to ranks outside `0..total_procs`.
    #[inline]
    pub fn coords_to_rank(&self, prow: usize, pcol: usize, pdepth: usize) -> usize {
        // `max(1)` only guards hand-built values with a zero depth.
        let layer_size = self.total_procs / self.pdepth.max(1);
        pdepth * layer_size + (prow * self.pcols + pcol)
    }
}

/// Block-cyclic matrix distribution for SuperLU_DIST
///
/// Rows are grouped into blocks of `row_block_size` and columns into blocks of
/// `col_block_size`; the index-mapping methods assign block `b` to process
/// row/column `b % prows` / `b % pcols` of `grid`.
#[derive(Debug, Clone)]
pub struct BlockCyclicDistribution {
    /// Process grid
    pub grid: ProcessGrid,
    /// Block size for row distribution
    pub row_block_size: usize,
    /// Block size for column distribution
    pub col_block_size: usize,
    /// Global matrix dimensions
    /// (number of rows of the distributed matrix)
    pub global_rows: usize,
    /// Number of columns of the distributed matrix
    pub global_cols: usize,
    /// Local matrix dimensions on this process
    /// (row count, computed once by `new` from the grid position)
    pub local_rows: usize,
    /// Local column count, computed once by `new` from the grid position
    pub local_cols: usize,
}

impl BlockCyclicDistribution {
    /// Create new block-cyclic distribution
    ///
    /// The local row/column counts are derived from the position of the
    /// calling process in `grid`.
    ///
    /// # Panics
    /// Panics (division by zero) if a global dimension is nonzero while its
    /// block size or the matching grid dimension is zero.
    pub fn new(
        grid: ProcessGrid,
        global_rows: usize,
        global_cols: usize,
        row_block_size: usize,
        col_block_size: usize,
    ) -> Self {
        let local_rows =
            Self::calculate_local_dimension(global_rows, row_block_size, grid.prows, grid.my_prow);
        let local_cols =
            Self::calculate_local_dimension(global_cols, col_block_size, grid.pcols, grid.my_pcol);

        Self {
            grid,
            row_block_size,
            col_block_size,
            global_rows,
            global_cols,
            local_rows,
            local_cols,
        }
    }

    /// Number of row blocks for an external block size
    pub fn n_row_blocks(&self, block_size: usize) -> usize {
        self.global_rows.div_ceil(block_size)
    }

    /// Number of column blocks for an external block size
    pub fn n_col_blocks(&self, block_size: usize) -> usize {
        self.global_cols.div_ceil(block_size)
    }

    /// Owner rank of a given (row-block, col-block)
    pub fn owner_rank_of_block(&self, brow: usize, bcol: usize) -> usize {
        let prow = brow % self.grid.prows;
        let pcol = bcol % self.grid.pcols;
        self.grid.coords_to_rank(prow, pcol)
    }

    /// Owner rank of diagonal block (k, k)
    pub fn owner_rank_of_diag_block(&self, k: usize) -> usize {
        self.owner_rank_of_block(k, k)
    }

    /// Local row count for a given rank in the 2D grid.
    pub fn local_rows_for_rank(&self, rank: usize) -> usize {
        let (prow, _) = self.grid.rank_to_coords(rank);
        Self::calculate_local_dimension(
            self.global_rows,
            self.row_block_size,
            self.grid.prows,
            prow,
        )
    }

    /// Local column count for a given rank in the 2D grid.
    pub fn local_cols_for_rank(&self, rank: usize) -> usize {
        let (_, pcol) = self.grid.rank_to_coords(rank);
        Self::calculate_local_dimension(
            self.global_cols,
            self.col_block_size,
            self.grid.pcols,
            pcol,
        )
    }

    /// Number of indices of a `global_dim`-long axis that land on process
    /// `my_proc` (out of `proc_dim`) under block-cyclic distribution.
    ///
    /// Block `b` belongs to process `b % proc_dim`, matching the index-mapping
    /// methods of this type. Every owned block contributes `block_size`
    /// indices except the very last global block, which may be partial and is
    /// counted only for the process that actually owns it.
    ///
    /// # Panics
    /// Panics (division by zero) if `global_dim` is nonzero and `block_size`
    /// or `proc_dim` is zero.
    fn calculate_local_dimension(
        global_dim: usize,
        block_size: usize,
        proc_dim: usize,
        my_proc: usize,
    ) -> usize {
        if global_dim == 0 {
            return 0;
        }

        let num_blocks = global_dim.div_ceil(block_size);
        // Dealing blocks round-robin gives the first `num_blocks % proc_dim`
        // processes one block more than the rest.
        let my_blocks = num_blocks / proc_dim + usize::from(my_proc < num_blocks % proc_dim);
        if my_blocks == 0 {
            return 0;
        }

        let tail_len = global_dim % block_size;
        let owns_partial_tail = tail_len != 0 && (num_blocks - 1) % proc_dim == my_proc;
        if owns_partial_tail {
            (my_blocks - 1) * block_size + tail_len
        } else {
            my_blocks * block_size
        }
    }

    /// Convert global row index to local row index
    ///
    /// Returns `None` when the row's block belongs to another process row.
    pub fn global_to_local_row(&self, global_row: usize) -> Option<usize> {
        let block_id = self.row_block_of(global_row);
        if block_id % self.grid.prows != self.grid.my_prow {
            return None;
        }
        // Owned blocks are packed back to back in local storage.
        let local_block_id = block_id / self.grid.prows;
        Some(local_block_id * self.row_block_size + global_row % self.row_block_size)
    }

    /// Convert global column index to local column index
    ///
    /// Returns `None` when the column's block belongs to another process column.
    pub fn global_to_local_col(&self, global_col: usize) -> Option<usize> {
        let block_id = self.col_block_of(global_col);
        if block_id % self.grid.pcols != self.grid.my_pcol {
            return None;
        }
        let local_block_id = block_id / self.grid.pcols;
        Some(local_block_id * self.col_block_size + global_col % self.col_block_size)
    }

    /// Convert local row index to global row index
    ///
    /// Inverse of [`Self::global_to_local_row`] for rows owned by this process.
    pub fn local_to_global_row(&self, local_row: usize) -> usize {
        let local_block_id = local_row / self.row_block_size;
        let block_offset = local_row % self.row_block_size;
        let global_block_id = local_block_id * self.grid.prows + self.grid.my_prow;
        global_block_id * self.row_block_size + block_offset
    }

    /// Convert local column index to global column index
    ///
    /// Inverse of [`Self::global_to_local_col`] for columns owned by this process.
    pub fn local_to_global_col(&self, local_col: usize) -> usize {
        let local_block_id = local_col / self.col_block_size;
        let block_offset = local_col % self.col_block_size;
        let global_block_id = local_block_id * self.grid.pcols + self.grid.my_pcol;
        global_block_id * self.col_block_size + block_offset
    }

    /// Index of the row block containing `global_row`
    #[inline]
    pub fn row_block_of(&self, global_row: usize) -> usize {
        global_row / self.row_block_size
    }

    /// Index of the column block containing `global_col`
    #[inline]
    pub fn col_block_of(&self, global_col: usize) -> usize {
        global_col / self.col_block_size
    }

    /// (prow, pcol) that owns (i,j) in block-cyclic sense
    #[inline]
    pub fn owner_coords_of(&self, global_row: usize, global_col: usize) -> (usize, usize) {
        let br = self.row_block_of(global_row) % self.grid.prows;
        let bc = self.col_block_of(global_col) % self.grid.pcols;
        (br, bc)
    }

    /// Global rank that owns (i,j)
    #[inline]
    pub fn owner_of(&self, global_row: usize, global_col: usize) -> usize {
        let (pr, pc) = self.owner_coords_of(global_row, global_col);
        self.grid.coords_to_rank(pr, pc)
    }

    /// Local row index if this proc owns `global_row`, else None
    ///
    /// Same mapping as [`Self::global_to_local_row`]; kept as a separate name
    /// for existing callers.
    #[inline]
    pub fn local_row_from_global(&self, global_row: usize) -> Option<usize> {
        self.global_to_local_row(global_row)
    }

    /// Local col index if this proc owns `global_col`, else None
    ///
    /// Same mapping as [`Self::global_to_local_col`]; kept as a separate name
    /// for existing callers.
    #[inline]
    pub fn local_col_from_global(&self, global_col: usize) -> Option<usize> {
        self.global_to_local_col(global_col)
    }

    /// Whether this process row owns `global_row` for the given block size
    #[inline]
    pub fn owns_global_row(&self, global_row: usize, block_size: usize) -> bool {
        self.grid.owns_global_row(global_row, block_size)
    }

    /// Whether this process column owns `global_col` for the given block size
    #[inline]
    pub fn owns_global_col(&self, global_col: usize, block_size: usize) -> bool {
        self.grid.owns_global_col(global_col, block_size)
    }
}

#[cfg(test)]
mod dist_tests {
    use super::*;

    /// Grid descriptor for `my_rank`, with ranks placed row by row.
    fn make_grid(total: usize, prows: usize, pcols: usize, my_rank: usize) -> ProcessGrid {
        let (my_prow, my_pcol) = (my_rank / pcols, my_rank % pcols);
        ProcessGrid {
            prows,
            pcols,
            my_prow,
            my_pcol,
            my_rank,
            total_procs: total,
        }
    }

    /// Every owned row must map to a local index that maps back to itself;
    /// every foreign row must have no local index.
    fn check_rows(dist: &BlockCyclicDistribution, nr: usize, rank: usize) {
        for i in 0..nr {
            let owns = dist.owns_global_row(i, dist.row_block_size);
            let local = dist.local_row_from_global(i);
            match (owns, local) {
                (false, None) => {}
                (true, Some(loc)) => {
                    let back = dist.local_to_global_row(loc);
                    assert_eq!(back, i, "row round-trip failed (i={i}, rank={rank:?})");
                }
                other => panic!("inconsistent row ownership: {other:?}"),
            }
        }
    }

    /// Column counterpart of `check_rows`.
    fn check_cols(dist: &BlockCyclicDistribution, nc: usize, rank: usize) {
        for j in 0..nc {
            let owns = dist.owns_global_col(j, dist.col_block_size);
            let local = dist.local_col_from_global(j);
            match (owns, local) {
                (false, None) => {}
                (true, Some(loc)) => {
                    let back = dist.local_to_global_col(loc);
                    assert_eq!(back, j, "col round-trip failed (j={j}, rank={rank:?})");
                }
                other => panic!("inconsistent col ownership: {other:?}"),
            }
        }
    }

    #[test]
    fn roundtrip_owned_indices() {
        // (rows, cols, (prows, pcols), (row block, col block))
        let cases = [
            (5, 7, (2, 2), (2, 3)),  // non-square, block < n
            (0, 0, (1, 1), (2, 2)),  // empty
            (8, 8, (2, 3), (3, 2)),  // wider than tall
            (17, 9, (3, 1), (4, 4)), // tall, skinny grid
        ];
        for &(nr, nc, (pr, pc), (br, bc)) in &cases {
            let total = pr * pc;
            for rank in 0..total {
                let dist = BlockCyclicDistribution::new(
                    make_grid(total, pr, pc, rank),
                    nr,
                    nc,
                    br.max(1),
                    bc.max(1),
                );
                check_rows(&dist, nr, rank);
                check_cols(&dist, nc, rank);
            }
        }
    }

    #[test]
    fn owner_rank_agrees_with_coords() {
        let dist = BlockCyclicDistribution::new(make_grid(6, 2, 3, 0), 10, 11, 4, 3);
        for i in 0..10 {
            for j in 0..11 {
                let (pr, pc) = dist.owner_coords_of(i, j);
                let expected_rank = pr * dist.grid.pcols + pc;
                assert_eq!(dist.owner_of(i, j), expected_rank);
            }
        }
    }
}

/// Pivoting strategies for SuperLU_DIST
///
/// Selects how `Panel::factorize_lu` picks the pivot row of each column.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PivotingStrategy {
    /// Dynamic pivoting: every column is searched for its largest-magnitude
    /// entry on or below the diagonal, and that row is swapped into place.
    Dynamic,
    /// Static pivoting (no row interchanges); the diagonal entry is always the pivot.
    Static,
    /// Threshold pivoting with fallback: the diagonal entry is kept as pivot
    /// while its magnitude is at least the threshold; the first column that
    /// falls below it switches the remainder of the panel to `Dynamic`.
    ThresholdWithFallback,
}

/// Panel structure for local dense factorization
///
/// A dense `height` x `width` window of a sparse matrix, stored column-major
/// in `data`.
#[derive(Debug, Clone)]
pub struct Panel {
    /// Panel width (number of columns)
    pub width: usize,
    /// Panel height (number of rows)
    pub height: usize,
    /// Dense matrix data (column-major): entry `(r, c)` lives at `c * height + r`
    pub data: Vec<f64>,
    /// Row indices for sparse structure: panel row `r` was gathered from
    /// matrix row `row_indices[r]`
    pub row_indices: Vec<usize>,
    /// Column start positions: matrix column that panel column 0 corresponds to
    pub col_start: usize,
}

impl Panel {
    /// Create a new panel from sparse matrix columns
    ///
    /// Gathers the entries of `matrix` lying in rows `row_indices` and columns
    /// `col_start..col_end` into a dense column-major buffer; positions without
    /// a stored entry are zero. Panel row `r` corresponds to matrix row
    /// `row_indices[r]`.
    ///
    /// Callers must pass `col_start <= col_end`.
    ///
    /// # Panics
    /// Panics if an entry of `row_indices` is not a valid row of `matrix`.
    pub fn from_sparse_columns(
        matrix: &CsrMatrix<f64>,
        col_start: usize,
        col_end: usize,
        row_indices: Vec<usize>,
    ) -> Self {
        let width = col_end - col_start;
        let height = row_indices.len();
        let mut data = vec![0.0; width * height];

        let row_ptrs = matrix.row_ptr();
        let col_indices = matrix.col_idx();
        let values = matrix.values();

        for (local_row, &global_row) in row_indices.iter().enumerate() {
            let start = row_ptrs[global_row];
            let end = row_ptrs[global_row + 1];

            for idx in start..end {
                let col = col_indices[idx];
                if (col_start..col_end).contains(&col) {
                    // Column-major storage
                    data[(col - col_start) * height + local_row] = values[idx];
                }
            }
        }

        Self {
            width,
            height,
            data,
            row_indices,
            col_start,
        }
    }

    /// Get mutable view as faer matrix
    pub fn as_faer_mut(&mut self) -> MatMut<'_, f64> {
        MatMut::from_column_major_slice_mut(&mut self.data, self.height, self.width)
    }

    /// Get view as faer matrix (using reference)
    pub fn as_faer(&self) -> faer::MatRef<'_, f64> {
        faer::MatRef::from_column_major_slice(&self.data, self.height, self.width)
    }

    /// Apply LU factorization to the panel using blocked Gaussian elimination with BLAS updates
    ///
    /// On return `data` holds the unit-lower factor L strictly below the
    /// diagonal and the upper factor U on and above it, such that the panel
    /// with its rows reordered by `row_permutation` equals `L * U`
    /// (`row_permutation[i]` is the original panel row now stored at row `i`).
    ///
    /// Columns are processed in blocks of at most 64:
    /// 1. the block is factored column by column (pivot selection, row swap,
    ///    multipliers, elimination of the remaining block columns),
    /// 2. the rows of U to the right of the block are obtained with a
    ///    unit-lower triangular solve against the block's diagonal part,
    /// 3. the trailing submatrix receives the rank-`jb` update `-= L21 * U12`.
    ///
    /// Pivot handling per column:
    /// * `Static` keeps the diagonal entry, `Dynamic` swaps in the
    ///   largest-magnitude entry on or below the diagonal, and
    ///   `ThresholdWithFallback` behaves like `Static` until a diagonal entry
    ///   falls below `threshold`, after which the rest of the panel (and the
    ///   strategy reported in the result) is `Dynamic`.
    /// * A pivot whose magnitude is below `threshold` is replaced by
    ///   `threshold` carrying the pivot's sign; this is counted in
    ///   `tiny_pivots_replaced` and sets `is_singular`.
    /// * A pivot that is exactly zero after that step also sets `is_singular`
    ///   and the column is left unscaled.
    ///
    /// Only `min(height, width)` pivots exist, so a panel that is wider than
    /// tall is handled without indexing past its last row.
    ///
    /// # Errors
    /// The current implementation always returns `Ok`; numerical trouble is
    /// reported through the fields of [`PanelFactorization`].
    pub fn factorize_lu(
        &mut self,
        threshold: f64,
        pivot_strategy: PivotingStrategy,
    ) -> Result<PanelFactorization, KError> {
        let m = self.height;
        let n = self.width;
        let num_pivots = core::cmp::min(m, n);

        // Column blocking for cache-friendly updates
        let kb = core::cmp::min(64, n.max(1));

        let mut row_perm: Vec<usize> = (0..m).collect();
        let mut num_row_swaps = 0usize;
        let mut tiny_pivots_replaced = 0usize;
        let mut is_singular = false;
        let mut pivot_strategy = pivot_strategy; // may change if fallback occurs

        let mut a = self.as_faer_mut();
        let mut j = 0;
        while j < num_pivots {
            let jb = core::cmp::min(kb, num_pivots - j);
            let block_end = j + jb;

            // 1) Factor the current block column by column.
            for gcol in j..block_end {
                // --- pivot search based on strategy ---
                let mut piv = gcol;
                let mut max_val = a[(gcol, gcol)].abs();
                if pivot_strategy == PivotingStrategy::ThresholdWithFallback && max_val < threshold
                {
                    // Fallback to dynamic pivoting for the rest of the panel
                    pivot_strategy = PivotingStrategy::Dynamic;
                }
                if pivot_strategy == PivotingStrategy::Dynamic {
                    for r in (gcol + 1)..m {
                        let val = a[(r, gcol)].abs();
                        if val > max_val {
                            max_val = val;
                            piv = r;
                        }
                    }
                }

                // --- row interchange if needed ---
                // Whole rows are exchanged so that the multipliers stored in
                // earlier columns stay attached to the row they belong to.
                if piv != gcol {
                    for c in 0..n {
                        let t = a[(gcol, c)];
                        a[(gcol, c)] = a[(piv, c)];
                        a[(piv, c)] = t;
                    }
                    row_perm.swap(gcol, piv);
                    num_row_swaps += 1;
                }

                // --- tiny pivot replacement ---
                // Done after the swap so the replacement sits on the diagonal.
                if max_val < threshold {
                    let old = a[(gcol, gcol)];
                    a[(gcol, gcol)] = if old == 0.0 {
                        threshold
                    } else {
                        threshold.copysign(old)
                    };
                    tiny_pivots_replaced += 1;
                    is_singular = true;
                }

                let diag = a[(gcol, gcol)];
                if diag == 0.0 {
                    // Nothing to eliminate with; flag it and move on.
                    is_singular = true;
                    continue;
                }

                // --- compute multipliers below the diagonal ---
                for r in (gcol + 1)..m {
                    a[(r, gcol)] /= diag;
                }

                // --- eliminate within the remaining columns of this block ---
                for c in (gcol + 1)..block_end {
                    let u = a[(gcol, c)];
                    if u != 0.0 {
                        for r in (gcol + 1)..m {
                            let l = a[(r, gcol)];
                            a[(r, c)] -= l * u;
                        }
                    }
                }
            }

            if n > block_end {
                let sub = a.rb_mut().submatrix_mut(j, j, m - j, n - j);
                let (left, right) = sub.split_at_col_mut(jb);
                let (l11, l21) = left.split_at_row_mut(jb);
                let (mut u12, mut trailing) = right.split_at_row_mut(jb);

                // 2) TRSM: U12 <- L11^{-1} * A12 (L11 is jb x jb, unit lower)
                faer::linalg::triangular_solve::solve_unit_lower_triangular_in_place(
                    l11.rb(),
                    u12.rb_mut(),
                    faer::Par::Seq,
                );

                // 3) GEMM: trailing update A22 <- A22 - L21 * U12
                if m > block_end {
                    faer::linalg::matmul::matmul(
                        trailing.rb_mut(),
                        faer::Accum::Add,
                        l21.rb(),
                        u12.rb(),
                        -1.0,
                        faer::Par::Seq,
                    );
                }
            }

            j = block_end;
        }

        Ok(PanelFactorization {
            row_permutation: row_perm,
            pivot_strategy,
            diagonal_threshold: threshold,
            num_row_swaps,
            is_singular,
            tiny_pivots_replaced,
        })
    }
}

/// Result of panel factorization
///
/// Bookkeeping returned by `Panel::factorize_lu`; the factors themselves stay
/// in the panel's `data` buffer.
#[derive(Debug, Clone)]
pub struct PanelFactorization {
    /// Row permutation from pivoting: starts as the identity and is swapped
    /// together with the panel rows, so entry `i` names the original panel row
    /// that ended up at position `i`
    pub row_permutation: Vec<usize>,
    /// Pivoting strategy used (the one in effect at the end of the
    /// factorization, which differs from the requested one after a fallback)
    pub pivot_strategy: PivotingStrategy,
    /// Diagonal threshold used
    pub diagonal_threshold: f64,
    /// Number of row swaps performed
    pub num_row_swaps: usize,
    /// Whether matrix was detected as singular
    /// (set whenever a too-small pivot was encountered)
    pub is_singular: bool,
    /// Number of tiny pivots that were replaced during factorization
    pub tiny_pivots_replaced: usize,
}

/// Enhanced numerical factorization data
///
/// Plain data container for the outcome of a numeric factorization. Nothing
/// in this type enforces consistency between its fields.
/// NOTE(review): `panels` and `panel_factors` look like parallel vectors
/// (one entry per panel) — confirm against the code that fills them.
#[derive(Debug, Clone)]
pub struct NumericFactorization {
    /// Matrix dimension
    pub n: usize,
    /// Total number of nonzeros in L and U
    pub nnz: usize,
    /// Factorized panels
    pub panels: Vec<Panel>,
    /// Panel factorization results
    pub panel_factors: Vec<PanelFactorization>,
    /// Global row permutation from pivoting
    pub global_row_perm: Vec<usize>,
    /// Global column permutation
    pub global_col_perm: Vec<usize>,
    /// Row scaling factors
    pub row_scale: Vec<f64>,
    /// Column scaling factors
    pub col_scale: Vec<f64>,
    /// Pivoting strategy used
    pub pivot_strategy: PivotingStrategy,
    /// Diagonal pivot threshold
    pub pivot_threshold: f64,
    /// Whether tiny pivots were replaced
    pub replaced_tiny_pivots: bool,
    /// Factorization statistics
    pub factor_stats: FactorizationStats,
    /// Block dependency graph for forward solve (L)
    /// (one list of block indices per block; the direction of the edges is
    /// not visible from this definition)
    pub l_block_graph: Vec<Vec<usize>>,
    /// Block dependency graph for backward solve (U)
    /// (same layout as `l_block_graph`)
    pub u_block_graph: Vec<Vec<usize>>,
}

/// Statistics from numerical factorization
///
/// Plain counters and estimates; this type has no behaviour of its own.
#[derive(Debug, Clone)]
pub struct FactorizationStats {
    /// Number of panels processed
    pub num_panels: usize,
    /// Total number of row swaps
    pub total_row_swaps: usize,
    /// Number of tiny pivots replaced
    pub tiny_pivots_replaced: usize,
    /// Maximum pivot growth factor
    pub max_pivot_growth: f64,
    /// Condition number estimate (`None` when no estimate is stored)
    pub condition_estimate: Option<f64>,
    /// Memory usage in bytes
    pub memory_usage: usize,
}

/// Communication pattern for distributed triangular solve
///
/// A plain selector; the code that acts on each variant lives elsewhere.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CommPattern {
    /// Point-to-point communication
    PointToPoint,
    /// Binary tree reduction
    BinaryTree,
    /// Ring communication
    Ring,
    /// Butterfly pattern
    Butterfly,
}

/// Metadata describing one communication operation of the distributed solver
///
/// The struct records who talks to whom and whether the operation has
/// finished; it carries no MPI handle and no payload itself.
#[derive(Debug)]
pub struct CommRequest {
    /// Request ID for tracking
    pub request_id: usize,
    /// Source process rank
    pub source_rank: usize,
    /// Destination process rank
    pub dest_rank: usize,
    /// Message tag
    pub tag: usize,
    /// Communication type
    pub comm_type: CommType,
    /// Data buffer size
    /// NOTE(review): the unit (elements vs. bytes) is not evident here — confirm with callers
    pub buffer_size: usize,
    /// Completion status (`false` on creation; set by `mark_completed` and by `set_error`)
    pub completed: bool,
    /// Error status (`None` on creation; filled in by `set_error`)
    pub error: Option<String>,
}

/// Internal bookkeeping entry for a request that has been issued but not yet
/// retired: the request metadata, an optional `f64` buffer and, when the `mpi`
/// feature is enabled, the raw MPI request handle.
#[derive(Debug)]
struct PendingComm {
    /// Descriptive metadata of the request
    meta: CommRequest,
    /// Raw MPI request handle (only present with the `mpi` feature)
    #[cfg(feature = "mpi")]
    handle: mpi::ffi::MPI_Request,
    /// Buffer associated with the request, if any
    /// NOTE(review): presumably kept here so it outlives the nonblocking
    /// operation — confirm against the code that waits on `handle`
    buffer: Option<Vec<f64>>,
}

impl CommRequest {
    /// Create a new communication request
    pub fn new(
        request_id: usize,
        source_rank: usize,
        dest_rank: usize,
        tag: usize,
        comm_type: CommType,
        buffer_size: usize,
    ) -> Self {
        Self {
            request_id,
            source_rank,
            dest_rank,
            tag,
            comm_type,
            buffer_size,
            completed: false,
            error: None,
        }
    }

    /// Check if the request is completed
    pub fn is_completed(&self) -> bool {
        self.completed
    }

    /// Mark the request as completed
    pub fn mark_completed(&mut self) {
        self.completed = true;
    }

    /// Set error status
    pub fn set_error(&mut self, error: String) {
        self.error = Some(error);
        self.completed = true;
    }
}

/// Type of communication operation
///
/// Stored in `CommRequest::comm_type` to label what kind of operation a
/// request describes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CommType {
    /// Send operation
    Send,
    /// Receive operation
    Recv,
    /// Broadcast operation
    Broadcast,
    /// Reduce operation
    Reduce,
    /// All-reduce operation
    AllReduce,
}

/// Working state for one block-cyclic distributed triangular solve.
///
/// Holds the per-block layout (sizes, owners, dependencies), copies of the
/// factor panels, and the list of nonblocking requests still in flight.
#[derive(Debug)]
pub struct TriangularSolveData {
    /// Zero-initialised storage, one vector per diagonal block owned by this rank
    pub local_solution_blocks: Vec<Vec<f64>>,
    /// Scratch buffer; overwritten with the payload of the most recently
    /// completed receive
    pub comm_buffer: Vec<f64>,
    /// Nonblocking requests that have been started but not yet waited on
    pending_requests: Vec<PendingComm>,
    /// Owning rank of each diagonal block, indexed by block id
    pub block_owners: Vec<usize>,
    /// Exact number of rows in each diagonal block (the last block may be shorter)
    pub block_sizes: Vec<usize>,
    /// Panels used as the L factor during forward substitution
    pub local_l_factors: Vec<Panel>,
    /// Panels used as the U factor during backward substitution
    pub local_u_factors: Vec<Panel>,
    /// For each block, the ids of the blocks it depends on
    pub dependency_graph: Vec<Vec<usize>>,
}

impl TriangularSolveData {
    /// Create new triangular solve data structure
    /// Build the solve state for a system of order `n` split into diagonal
    /// blocks of `block_size` rows.
    ///
    /// * `distribution` supplies the block count, the owner of every diagonal
    ///   block and this process's rank.
    /// * `numeric_factor` supplies the panels, which are cloned for both the
    ///   L and the U side.
    /// * `deps` is used as the dependency graph only if it has exactly one
    ///   entry per block; otherwise every block starts with no dependencies.
    pub fn new(
        n: usize,
        block_size: usize,
        distribution: &BlockCyclicDistribution,
        numeric_factor: &NumericFactorization,
        deps: Vec<Vec<usize>>,
    ) -> Self {
        let num_blocks = distribution.n_col_blocks(block_size);

        // Every block is full-sized except possibly the trailing one.
        let mut block_sizes = vec![block_size; num_blocks];
        if let Some(last) = block_sizes.last_mut() {
            let tail = n % block_size;
            if tail != 0 {
                *last = tail;
            }
        }

        // Which rank owns each diagonal block.
        let block_owners: Vec<usize> = (0..num_blocks)
            .map(|k| distribution.owner_rank_of_diag_block(k))
            .collect();

        // Zeroed solution storage for the blocks this rank owns.
        let my_rank = distribution.grid.my_rank;
        let local_solution_blocks = block_owners
            .iter()
            .zip(&block_sizes)
            .filter(|(owner, _)| **owner == my_rank)
            .map(|(_, &len)| vec![0.0; len])
            .collect();

        // A dependency list of the wrong length is discarded entirely.
        let dependency_graph = if deps.len() != num_blocks {
            vec![Vec::new(); num_blocks]
        } else {
            deps
        };

        let comm_buffer = vec![0.0; block_size * distribution.grid.total_procs];

        Self {
            local_solution_blocks,
            comm_buffer,
            pending_requests: Vec::new(),
            block_owners,
            block_sizes,
            local_l_factors: numeric_factor.panels.clone(),
            local_u_factors: numeric_factor.panels.clone(), // Simplified - would separate L and U
            dependency_graph,
        }
    }

    /// Start nonblocking send operation
    /// Start a nonblocking send of `data` to `dest_rank`.
    ///
    /// The request is recorded under `request_id` and must later be completed
    /// with [`wait`](Self::wait) or [`wait_all`](Self::wait_all). `data` is
    /// copied, so the caller may reuse its slice immediately.
    ///
    /// With a single-process communicator nothing is transferred and the
    /// request is recorded as already completed.
    ///
    /// # Errors
    /// * `KError::InvalidInput` if the message length, destination rank or tag
    ///   does not fit in an MPI `int`, or if no MPI communicator is available
    ///   for a multi-process run.
    /// * `KError::SolveError` if `MPI_Isend` reports a failure.
    pub fn isend(
        &mut self,
        data: &[f64],
        dest_rank: usize,
        tag: usize,
        request_id: usize,
        comm: &UniverseComm,
    ) -> Result<(), KError> {
        let mut request = CommRequest::new(
            request_id,
            comm.rank(), // source is this process
            dest_rank,
            tag,
            CommType::Send,
            data.len(),
        );

        if comm.size() <= 1 {
            request.mark_completed();
            self.pending_requests.push(PendingComm {
                meta: request,
                // SAFETY: only reads the constant null-request handle.
                #[cfg(feature = "mpi")]
                handle: unsafe { mpi::ffi::RSMPI_REQUEST_NULL },
                buffer: None,
            });
            return Ok(());
        }

        #[cfg(feature = "mpi")]
        if let Some(world) = comm.as_mpi() {
            // MPI takes C `int` arguments; reject values that would be
            // silently truncated by an `as` cast.
            let to_c_int = |value: usize, what: &str| -> Result<i32, KError> {
                i32::try_from(value).map_err(|_| {
                    KError::InvalidInput(format!("{what} {value} does not fit in an MPI int"))
                })
            };
            let count = to_c_int(data.len(), "send count")?;
            let dest = to_c_int(dest_rank, "destination rank")?;
            let mpi_tag = to_c_int(tag, "message tag")?;

            let buffer = data.to_vec();
            let mut handle = MaybeUninit::<mpi::ffi::MPI_Request>::uninit();
            // SAFETY: `buffer` holds exactly `count` doubles and is stored in
            // the pending request below, so its heap allocation outlives the
            // send; `handle` points to writable storage for the request.
            let rc = unsafe {
                mpi::ffi::MPI_Isend(
                    buffer.as_ptr() as *const c_void,
                    count,
                    mpi::ffi::RSMPI_DOUBLE,
                    dest,
                    mpi_tag,
                    world.as_raw(),
                    handle.as_mut_ptr(),
                )
            };
            if rc != 0 {
                request.set_error(format!("MPI_Isend failed with code {rc}"));
                self.pending_requests.push(PendingComm {
                    meta: request,
                    // SAFETY: only reads the constant null-request handle.
                    handle: unsafe { mpi::ffi::RSMPI_REQUEST_NULL },
                    buffer: None,
                });
                return Err(KError::SolveError(format!(
                    "MPI_Isend failed with code {rc}"
                )));
            }
            // SAFETY: MPI_Isend returned success, so it wrote the handle.
            let handle = unsafe { handle.assume_init() };
            self.pending_requests.push(PendingComm {
                meta: request,
                handle,
                buffer: Some(buffer),
            });
            return Ok(());
        }

        Err(KError::InvalidInput(
            "MPI communicator required for nonblocking send".into(),
        ))
    }

    /// Start nonblocking receive operation
    /// Start a nonblocking receive of `buffer_size` values from `source_rank`.
    ///
    /// The request is recorded under `request_id`; the received values become
    /// available once it is completed with [`wait`](Self::wait) or
    /// [`wait_all`](Self::wait_all).
    ///
    /// With a single-process communicator nothing is transferred and the
    /// request is recorded as already completed.
    ///
    /// # Errors
    /// * `KError::InvalidInput` if the buffer size, source rank or tag does
    ///   not fit in an MPI `int`, or if no MPI communicator is available for
    ///   a multi-process run.
    /// * `KError::SolveError` if `MPI_Irecv` reports a failure.
    pub fn irecv(
        &mut self,
        buffer_size: usize,
        source_rank: usize,
        tag: usize,
        request_id: usize,
        comm: &UniverseComm,
    ) -> Result<(), KError> {
        let mut request = CommRequest::new(
            request_id,
            source_rank,
            comm.rank(), // dest is this process
            tag,
            CommType::Recv,
            buffer_size,
        );

        if comm.size() <= 1 {
            request.mark_completed();
            self.pending_requests.push(PendingComm {
                meta: request,
                // SAFETY: only reads the constant null-request handle.
                #[cfg(feature = "mpi")]
                handle: unsafe { mpi::ffi::RSMPI_REQUEST_NULL },
                buffer: None,
            });
            return Ok(());
        }

        #[cfg(feature = "mpi")]
        if let Some(world) = comm.as_mpi() {
            // MPI takes C `int` arguments; reject values that would be
            // silently truncated by an `as` cast.
            let to_c_int = |value: usize, what: &str| -> Result<i32, KError> {
                i32::try_from(value).map_err(|_| {
                    KError::InvalidInput(format!("{what} {value} does not fit in an MPI int"))
                })
            };
            let count = to_c_int(buffer_size, "receive count")?;
            let source = to_c_int(source_rank, "source rank")?;
            let mpi_tag = to_c_int(tag, "message tag")?;

            let mut buffer = vec![0.0f64; buffer_size];
            let mut handle = MaybeUninit::<mpi::ffi::MPI_Request>::uninit();
            // SAFETY: `buffer` has room for exactly `count` doubles and is
            // stored in the pending request below, so its heap allocation
            // outlives the receive; `handle` points to writable storage.
            let rc = unsafe {
                mpi::ffi::MPI_Irecv(
                    buffer.as_mut_ptr() as *mut c_void,
                    count,
                    mpi::ffi::RSMPI_DOUBLE,
                    source,
                    mpi_tag,
                    world.as_raw(),
                    handle.as_mut_ptr(),
                )
            };
            if rc != 0 {
                request.set_error(format!("MPI_Irecv failed with code {rc}"));
                self.pending_requests.push(PendingComm {
                    meta: request,
                    // SAFETY: only reads the constant null-request handle.
                    handle: unsafe { mpi::ffi::RSMPI_REQUEST_NULL },
                    buffer: None,
                });
                return Err(KError::SolveError(format!(
                    "MPI_Irecv failed with code {rc}"
                )));
            }
            // SAFETY: MPI_Irecv returned success, so it wrote the handle.
            let handle = unsafe { handle.assume_init() };
            self.pending_requests.push(PendingComm {
                meta: request,
                handle,
                buffer: Some(buffer),
            });
            return Ok(());
        }

        Err(KError::InvalidInput(
            "MPI communicator required for nonblocking receive".into(),
        ))
    }

    /// Wait for completion of specific request
    pub fn wait(
        &mut self,
        request_id: usize,
        target: Option<&mut [f64]>,
    ) -> Result<(), KError> {
        if let Some(index) = self
            .pending_requests
            .iter()
            .position(|req| req.meta.request_id == request_id)
        {
            let mut req = self.pending_requests.remove(index);
            #[cfg(feature = "mpi")]
            {
                if req.meta.completed {
                    return Ok(());
                }
                let rc = unsafe { mpi::ffi::MPI_Wait(&mut req.handle, mpi::ffi::RSMPI_STATUS_IGNORE) };
                if rc != 0 {
                    return Err(KError::SolveError(format!(
                        "MPI_Wait failed with code {rc}"
                    )));
                }
            }
            if req.meta.comm_type == CommType::Recv {
                if let Some(buffer) = req.buffer.take() {
                    self.comm_buffer.resize(buffer.len(), 0.0);
                    self.comm_buffer.copy_from_slice(&buffer);
                    if let Some(target) = target {
                        target.copy_from_slice(&buffer);
                    }
                }
            }
            req.meta.mark_completed();
        }

        #[cfg(feature = "logging")]
        log::debug!("Completed communication request {request_id}");

        Ok(())
    }

    /// Test for completion of specific request without blocking
    /// Non-blocking completion check for a request.
    ///
    /// NOTE: this is currently a placeholder. The request id is ignored, the
    /// pending list is not consulted, and the method always returns `true`.
    pub fn test(&self, _request_id: usize) -> bool {
        // In real implementation, this would call MPI_Test
        // For simulation, assume requests complete quickly
        true
    }

    /// Wait for all pending requests to complete
    /// Complete every pending request and clear the pending list.
    ///
    /// For each completed receive the payload replaces `comm_buffer`, so after
    /// the call the buffer holds the payload of the last receive in the list.
    ///
    /// Every request is waited on even if an earlier one fails, so no buffer
    /// is released while its transfer may still be in flight.
    ///
    /// # Errors
    /// Returns `KError::SolveError` for the first `MPI_Wait` failure, after
    /// all remaining requests have been waited on.
    pub fn wait_all(&mut self) -> Result<(), KError> {
        #[cfg(feature = "logging")]
        log::debug!(
            "Waiting for {} pending requests",
            self.pending_requests.len()
        );

        // Only the MPI build ever assigns an error here.
        #[cfg_attr(not(feature = "mpi"), allow(unused_mut))]
        let mut first_error: Option<KError> = None;

        for mut req in std::mem::take(&mut self.pending_requests) {
            #[cfg(feature = "mpi")]
            {
                if !req.meta.completed {
                    // SAFETY: `req.handle` was initialised by a successful
                    // MPI_Isend/MPI_Irecv and its buffer is still owned by `req`.
                    let rc =
                        unsafe { mpi::ffi::MPI_Wait(&mut req.handle, mpi::ffi::RSMPI_STATUS_IGNORE) };
                    if rc != 0 {
                        if first_error.is_none() {
                            first_error = Some(KError::SolveError(format!(
                                "MPI_Wait failed with code {rc}"
                            )));
                        }
                        // Keep draining so the remaining transfers finish
                        // before their buffers are dropped.
                        continue;
                    }
                }
            }
            if req.meta.comm_type == CommType::Recv {
                if let Some(buffer) = req.buffer.take() {
                    // Take ownership of the payload instead of copying it.
                    self.comm_buffer = buffer;
                }
            }
        }

        match first_error {
            Some(err) => Err(err),
            None => Ok(()),
        }
    }
}

/// Distributed triangular solver implementation
pub struct DistributedTriangularSolver;

impl DistributedTriangularSolver {
    /// Perform distributed forward substitution (solve Ly = b)
    pub fn forward_solve(
        b: &[f64],
        x: &mut [f64],
        numeric_factor: &NumericFactorization,
        distribution: &BlockCyclicDistribution,
        comm: &UniverseComm,
        comm_pattern: CommPattern,
        overlap_comm: bool,
        #[cfg(feature = "superlu3d")] grid3d: Option<&ProcessGrid3D>,
    ) -> Result<(), KError> {
        #[cfg(feature = "logging")]
        let _guard = StageGuard::new("DistributedForwardSolve");

        let n = b.len();
        if n == 0 {
            return Ok(());
        }
        let block_size = 64; // Could be made configurable
        let num_blocks = n.div_ceil(block_size);

        #[cfg(feature = "logging")]
        log::debug!("Starting forward solve: n={n}, blocks={num_blocks}, pattern={comm_pattern:?}");

        // Initialize solve data structure
        let mut solve_data = TriangularSolveData::new(
            n,
            block_size,
            distribution,
            numeric_factor,
            numeric_factor.l_block_graph.clone(),
        );

        // Copy RHS to solution vector
        x.copy_from_slice(b);

        // Process blocks in dependency order
        for block_id in 0..num_blocks {
            let block_start = block_id * block_size;
            let current_block_size = solve_data.block_sizes[block_id];
            let block_end = block_start + current_block_size;

            // Check if this process owns the current block
            if solve_data.block_owners[block_id] == distribution.grid.my_rank {
                // Perform local dense triangular solve for this block
                Self::solve_local_l_block(
                    &mut x[block_start..block_end],
                    &solve_data.local_l_factors,
                    block_id,
                )?;

                if overlap_comm {
                    // Start nonblocking broadcasts to other processes that need this block
                    Self::start_nonblocking_broadcast(
                        &mut solve_data,
                        &x[block_start..block_end],
                        block_id,
                        distribution,
                        comm_pattern,
                        comm,
                        #[cfg(feature = "superlu3d")]
                        grid3d,
                    )?;
                }
            } else if overlap_comm {
                // Start nonblocking receive for this block
                let owner_rank = solve_data.block_owners[block_id];
                solve_data.irecv(current_block_size, owner_rank, block_id, block_id, comm)?;
                solve_data.wait(block_id, Some(&mut x[block_start..block_end]))?;
            }

            // Apply updates from previously solved blocks
            let dependency_blocks = solve_data.dependency_graph[block_id].clone();
            for dep_block in dependency_blocks {
                if solve_data.block_owners[dep_block] != distribution.grid.my_rank {
                    // Wait for dependency to arrive
                    if overlap_comm {
                        solve_data.wait(dep_block, None)?;
                    }

                    // Apply update from dependency block
                    Self::apply_block_update(
                        &mut x[block_start..block_end],
                        &solve_data.comm_buffer,
                        dep_block,
                        block_id,
                        &solve_data.local_l_factors,
                    )?;
                }
            }

            if !overlap_comm {
                // Synchronous broadcast of solution block
                Self::synchronous_broadcast(
                    &mut x[block_start..block_end],
                    solve_data.block_owners[block_id],
                    block_id,
                    comm,
                    comm_pattern,
                    #[cfg(feature = "superlu3d")]
                    grid3d,
                )?;
            }
        }

        // Wait for all pending communications to complete
        if overlap_comm {
            solve_data.wait_all()?;
        }

        #[cfg(feature = "logging")]
        log::debug!("Forward solve completed successfully");

        Ok(())
    }

    /// Perform distributed backward substitution (solve Ux = y)
    pub fn backward_solve(
        y: &[f64],
        x: &mut [f64],
        numeric_factor: &NumericFactorization,
        distribution: &BlockCyclicDistribution,
        comm: &UniverseComm,
        comm_pattern: CommPattern,
        overlap_comm: bool,
        #[cfg(feature = "superlu3d")] grid3d: Option<&ProcessGrid3D>,
    ) -> Result<(), KError> {
        #[cfg(feature = "logging")]
        let _guard = StageGuard::new("DistributedBackwardSolve");

        let n = y.len();
        if n == 0 {
            return Ok(());
        }
        let block_size = 64; // Could be made configurable
        let num_blocks = n.div_ceil(block_size);

        #[cfg(feature = "logging")]
        log::debug!(
            "Starting backward solve: n={n}, blocks={num_blocks}, pattern={comm_pattern:?}"
        );

        // Initialize solve data structure
        let mut solve_data = TriangularSolveData::new(
            n,
            block_size,
            distribution,
            numeric_factor,
            numeric_factor.u_block_graph.clone(),
        );

        // Copy intermediate result to solution vector
        x.copy_from_slice(y);

        // Process blocks in reverse dependency order (backward substitution)
        for block_id in (0..num_blocks).rev() {
            let block_start = block_id * block_size;
            let current_block_size = solve_data.block_sizes[block_id];
            let block_end = block_start + current_block_size;

            // Apply updates from later blocks first
            let dependency_blocks = solve_data.dependency_graph[block_id].clone();
            for dep_block in dependency_blocks {
                if solve_data.block_owners[dep_block] != distribution.grid.my_rank {
                    if overlap_comm {
                        solve_data.wait(dep_block, None)?;
                    }
                    Self::apply_block_update_backward(
                        &mut x[block_start..block_end],
                        &solve_data.comm_buffer,
                        dep_block,
                        block_id,
                        &solve_data.local_u_factors,
                    )?;
                }
            }

            // Check if this process owns the current block
            if solve_data.block_owners[block_id] == distribution.grid.my_rank {
                // Perform local dense triangular solve for this block
                Self::solve_local_u_block(
                    &mut x[block_start..block_end],
                    &solve_data.local_u_factors,
                    block_id,
                )?;

                if overlap_comm {
                    // Start nonblocking broadcasts to other processes that need this block
                    Self::start_nonblocking_broadcast(
                        &mut solve_data,
                        &x[block_start..block_end],
                        block_id,
                        distribution,
                        comm_pattern,
                        comm,
                        #[cfg(feature = "superlu3d")]
                        grid3d,
                    )?;
                }
            } else if overlap_comm {
                // Start nonblocking receive for this block
                let owner_rank = solve_data.block_owners[block_id];
                solve_data.irecv(current_block_size, owner_rank, block_id, block_id, comm)?;
                solve_data.wait(block_id, Some(&mut x[block_start..block_end]))?;
            }

            if !overlap_comm {
                // Synchronous broadcast of solution block
                Self::synchronous_broadcast(
                    &mut x[block_start..block_end],
                    solve_data.block_owners[block_id],
                    block_id,
                    comm,
                    comm_pattern,
                    #[cfg(feature = "superlu3d")]
                    grid3d,
                )?;
            }
        }

        // Wait for all pending communications to complete
        if overlap_comm {
            solve_data.wait_all()?;
        }

        #[cfg(feature = "logging")]
        log::debug!("Backward solve completed successfully");

        Ok(())
    }

    /// Solve local dense L block using optimized triangular solve
    /// In-place unit-lower-triangular solve of one block of the solution.
    ///
    /// Uses the leading square part of panel `block_id`, clamped to the
    /// smallest of the block length and the panel's height and width; only
    /// that many leading entries of `x_block` are modified. If there is no
    /// panel for `block_id`, `x_block` is left untouched.
    fn solve_local_l_block(
        x_block: &mut [f64],
        l_factors: &[Panel],
        block_id: usize,
    ) -> Result<(), KError> {
        let Some(panel) = l_factors.get(block_id) else {
            return Ok(());
        };

        let rows = x_block.len();
        // Order of the triangular system that is actually solved.
        let order = rows.min(panel.width).min(panel.height);

        let mut rhs = faer::MatMut::from_column_major_slice_mut(x_block, rows, 1);
        let l = panel.as_faer();
        let lower = l.submatrix(0, 0, order, order);
        let mut rhs_head = rhs.rb_mut().submatrix_mut(0, 0, order, 1);
        faer::linalg::triangular_solve::solve_unit_lower_triangular_in_place(
            lower,
            rhs_head.rb_mut(),
            faer::Par::Seq,
        );

        Ok(())
    }

    /// Solve local dense U block using optimized triangular solve
    /// In-place upper-triangular solve of one block of the solution.
    ///
    /// Uses the leading square part of panel `block_id`, clamped to the
    /// smallest of the block length and the panel's height and width; only
    /// that many leading entries of `x_block` are modified. If there is no
    /// panel for `block_id`, `x_block` is left untouched.
    fn solve_local_u_block(
        x_block: &mut [f64],
        u_factors: &[Panel],
        block_id: usize,
    ) -> Result<(), KError> {
        let Some(panel) = u_factors.get(block_id) else {
            return Ok(());
        };

        let rows = x_block.len();
        // Order of the triangular system that is actually solved.
        let order = rows.min(panel.width).min(panel.height);

        let mut rhs = faer::MatMut::from_column_major_slice_mut(x_block, rows, 1);
        let u = panel.as_faer();
        let upper = u.submatrix(0, 0, order, order);
        let mut rhs_head = rhs.rb_mut().submatrix_mut(0, 0, order, 1);
        faer::linalg::triangular_solve::solve_upper_triangular_in_place(
            upper,
            rhs_head.rb_mut(),
            faer::Par::Seq,
        );

        Ok(())
    }

    /// Start nonblocking broadcast operation
    /// Post the nonblocking sends that distribute block `block_id`.
    ///
    /// The root is the recorded owner of the block (falling back to this rank
    /// if `block_id` is out of range). Only the root posts sends, one to every
    /// other rank in the grid, using `block_id` as both tag and request id.
    /// With the `superlu3d` feature and a 3D grid, additional sends go to this
    /// rank's children in a binary tree over the depth dimension.
    fn start_nonblocking_broadcast(
        solve_data: &mut TriangularSolveData,
        data: &[f64],
        block_id: usize,
        distribution: &BlockCyclicDistribution,
        comm_pattern: CommPattern,
        comm: &UniverseComm,
        #[cfg(feature = "superlu3d")] grid3d: Option<&ProcessGrid3D>,
    ) -> Result<(), KError> {
        let my_rank = distribution.grid.my_rank;
        let root_rank = match solve_data.block_owners.get(block_id) {
            Some(&owner) => owner,
            None => my_rank,
        };

        #[cfg(feature = "logging")]
        log::debug!(
            "Starting nonblocking broadcast from rank {root_rank} for block {block_id} using {comm_pattern:?}"
        );

        // Simplified broadcast: root sends to all other ranks
        if my_rank == root_rank {
            let receivers = (0..distribution.grid.total_procs).filter(|&rank| rank != root_rank);
            for rank in receivers {
                solve_data.isend(data, rank, block_id, block_id, comm)?;
            }
        }

        #[cfg(feature = "superlu3d")]
        if let Some(g3) = grid3d {
            // Additional tree along depth dimension: forward to both children
            // of this layer, if they exist.
            let layers = g3.pdepth;
            for child in [2 * g3.my_pdepth + 1, 2 * g3.my_pdepth + 2] {
                if child < layers {
                    let r = g3.coords_to_rank(g3.my_prow, g3.my_pcol, child);
                    solve_data.isend(data, r, (block_id << 8) + child, block_id, comm)?;
                }
            }
        }

        Ok(())
    }

    /// Perform synchronous broadcast operation
    /// Blocking exchange of one solution block between all ranks.
    ///
    /// On a multi-process communicator the root posts one send of `data` to
    /// every other rank, every other rank posts one receive into `data` from
    /// the root, and the call returns after `comm.wait_all` on those requests.
    /// With a single process no communication takes place.
    ///
    /// `block_id` and `comm_pattern` are only used for the debug log line.
    fn synchronous_broadcast(
        data: &mut [f64],
        root_rank: usize,
        block_id: usize,
        comm: &UniverseComm,
        comm_pattern: CommPattern,
        #[cfg(feature = "superlu3d")] grid3d: Option<&ProcessGrid3D>,
    ) -> Result<(), KError> {
        #[cfg(feature = "logging")]
        log::debug!(
            "Synchronous broadcast from rank {root_rank} for block {block_id} using {comm_pattern:?}"
        );

        let size = comm.size();
        if size > 1 {
            let mut reqs = Vec::new();
            if comm.rank() == root_rank {
                // Root: one send per remote rank.
                for rank in 0..size {
                    if rank != root_rank {
                        reqs.push(comm.isend_to(data, rank as i32));
                    }
                }
            } else {
                // Non-root: a single receive from the root.
                reqs.push(comm.irecv_from(data, root_rank as i32));
            }
            comm.wait_all(&mut reqs);
        }

        // NOTE(review): the 3D branch below only computes the child ranks in
        // the depth-dimension tree and discards them; no data is exchanged.
        // It looks like a placeholder for a depth-wise broadcast.
        #[cfg(feature = "superlu3d")]
        if let Some(g3) = grid3d {
            let layers = g3.pdepth;
            let left = 2 * g3.my_pdepth + 1;
            let right = 2 * g3.my_pdepth + 2;
            if left < layers {
                let _ = g3.coords_to_rank(g3.my_prow, g3.my_pcol, left);
            }
            if right < layers {
                let _ = g3.coords_to_rank(g3.my_prow, g3.my_pcol, right);
            }
        }

        Ok(())
    }

    /// Apply block update during triangular solve
    /// Subtract one scaled panel column from a solution block (forward solve).
    ///
    /// Computes `x_block -= L[.., source_block] * update_data[source_block]`
    /// using the panel stored for `target_block`. Nothing happens if that
    /// panel is missing, or if `source_block` is not a valid column of the
    /// panel or a valid index into `update_data`.
    fn apply_block_update(
        x_block: &mut [f64],
        update_data: &[f64],
        source_block: usize,
        target_block: usize,
        l_factors: &[Panel],
    ) -> Result<(), KError> {
        let Some(l_panel) = l_factors.get(target_block) else {
            return Ok(());
        };

        let rows = x_block.len();
        let l = l_panel.as_faer();
        if source_block >= l.ncols() || source_block >= update_data.len() {
            return Ok(());
        }

        // Column of the panel and the 1x1 multiplier taken from the update data.
        let col = l.submatrix(0, source_block, rows, 1);
        let scalar = faer::MatRef::from_column_major_slice(
            std::slice::from_ref(&update_data[source_block]),
            1,
            1,
        );
        let mut x = MatMut::from_column_major_slice_mut(x_block, rows, 1);
        // x += -1.0 * col * scalar
        faer::linalg::matmul::matmul(
            x.rb_mut(),
            faer::Accum::Add,
            col,
            scalar,
            -1.0,
            faer::Par::Seq,
        );

        Ok(())
    }

    /// Apply block update during backward triangular solve
    /// Subtract one scaled panel column from a solution block (backward solve).
    ///
    /// Computes `x_block -= U[.., source_block] * update_data[source_block]`
    /// using the panel stored for `target_block`. Nothing happens if that
    /// panel is missing, or if `source_block` is not a valid column of the
    /// panel or a valid index into `update_data`.
    fn apply_block_update_backward(
        x_block: &mut [f64],
        update_data: &[f64],
        source_block: usize,
        target_block: usize,
        u_factors: &[Panel],
    ) -> Result<(), KError> {
        let Some(u_panel) = u_factors.get(target_block) else {
            return Ok(());
        };

        let rows = x_block.len();
        let u = u_panel.as_faer();
        if source_block >= u.ncols() || source_block >= update_data.len() {
            return Ok(());
        }

        // Column of the panel and the 1x1 multiplier taken from the update data.
        let col = u.submatrix(0, source_block, rows, 1);
        let scalar = faer::MatRef::from_column_major_slice(
            std::slice::from_ref(&update_data[source_block]),
            1,
            1,
        );
        let mut x = MatMut::from_column_major_slice_mut(x_block, rows, 1);
        // x += -1.0 * col * scalar
        faer::linalg::matmul::matmul(
            x.rb_mut(),
            faer::Accum::Add,
            col,
            scalar,
            -1.0,
            faer::Par::Seq,
        );

        Ok(())
    }
}
/// Configuration options for the SuperLU_DIST-style solver.
///
/// `Default` provides a baseline configuration; `validate` checks the numeric
/// ranges and the process-grid shape.
#[derive(Debug, Clone)]
pub struct SuperLuDistOptions {
    /// Process grid dimensions (rows, cols). If None, automatically determined
    pub process_grid: Option<(usize, usize)>,
    /// Column permutation strategy
    pub column_permutation: ColumnPermutation,
    /// Diagonal pivot threshold for numerical stability (0.0 to 1.0)
    pub diagonal_pivot_threshold: f64,
    /// Whether to replace tiny pivots to avoid breakdown
    pub replace_tiny_pivots: bool,
    /// Iterative refinement method
    pub iterative_refinement: IterativeRefinement,
    /// Print level for SuperLU_DIST diagnostics (0=none, 1=basic, 2=detailed)
    pub print_level: u8,
    /// Whether to use static pivoting
    pub static_pivoting: bool,
    /// Row permutation for load balancing
    pub row_permutation: RowPermutation,
    /// Panel size for local dense factorization (None = auto-determined);
    /// an explicit size must be greater than zero
    pub panel_size: Option<usize>,
    /// Enable 3D communication-avoiding extension
    pub enable_3d_factorization: bool,
    /// 3D process grid depth (used if enable_3d_factorization is true);
    /// an explicit depth must be greater than zero
    pub process_grid_3d_depth: Option<usize>,
    /// Memory trade-off factor for 3D algorithm (higher = more memory, less communication);
    /// must be at least 0.1
    pub memory_tradeoff_factor: f64,
    /// Maximum number of panels to process concurrently
    pub max_concurrent_panels: usize,
    /// Enable asynchronous panel updates
    pub async_panel_updates: bool,
}

/// Column permutation strategies for SuperLU_DIST.
///
/// `SuperLuDistOptions::default()` selects [`ColumnPermutation::MmdAta`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ColumnPermutation {
    /// Natural ordering (no permutation)
    Natural,
    /// Minimum degree ordering of A^T + A
    // NOTE(review): the variant name reads like "A^T * A" while this doc says
    // "A^T + A" - confirm which pattern the ordering is meant to use.
    MmdAta,
    /// METIS ordering for graph partitioning
    Metis,
    /// ParMETIS for distributed graph partitioning
    ParMetis,
    /// User-provided permutation
    User,
}

/// Row permutation strategies.
///
/// `SuperLuDistOptions::default()` selects [`RowPermutation::LargeDiag`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RowPermutation {
    /// No row permutation
    NoRowPerm,
    /// Large diagonal elements first
    LargeDiag,
    /// User-provided permutation
    User,
}

/// Iterative refinement options.
///
/// `SuperLuDistOptions::default()` selects [`IterativeRefinement::Double`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IterativeRefinement {
    /// No iterative refinement
    NoRefine,
    /// Single precision refinement
    Single,
    /// Double precision refinement
    Double,
    /// Extra precision refinement
    Extra,
}

impl Default for SuperLuDistOptions {
    /// Baseline configuration: automatic 2D grid and panel size, MMD column
    /// ordering, large-diagonal row permutation, full pivot threshold,
    /// double-precision refinement, no diagnostics, and the 3D and
    /// asynchronous extensions switched off.
    fn default() -> Self {
        Self {
            // Process layout
            process_grid: None,
            enable_3d_factorization: false,
            process_grid_3d_depth: None,
            memory_tradeoff_factor: 1.0,

            // Orderings
            column_permutation: ColumnPermutation::MmdAta,
            row_permutation: RowPermutation::LargeDiag,

            // Pivoting and refinement
            diagonal_pivot_threshold: 1.0,
            replace_tiny_pivots: false,
            static_pivoting: false,
            iterative_refinement: IterativeRefinement::Double,

            // Panels
            panel_size: None,
            max_concurrent_panels: 1,
            async_panel_updates: false,

            // Diagnostics
            print_level: 0,
        }
    }
}

impl SuperLuDistOptions {
    /// Decide whether a diagnostic message should be emitted.
    ///
    /// Returns `true` when the configured `print_level` is at least
    /// `required` and at least `level`.
    #[inline]
    pub fn enabled(&self, level: u8, required: u8) -> bool {
        self.print_level >= required && level <= self.print_level
    }

    /// Check the options for internal consistency.
    ///
    /// When `comm` is given and an explicit process grid is set, the grid
    /// must contain exactly `comm.size()` processes.
    ///
    /// # Errors
    /// Returns `KError::InvalidInput` if
    /// * `diagonal_pivot_threshold` is outside `[0, 1]` (including NaN),
    /// * `panel_size` is `Some(0)`,
    /// * 3D factorization is enabled with a depth of `Some(0)`,
    /// * a process-grid dimension is zero or the grid does not match the
    ///   communicator size,
    /// * `memory_tradeoff_factor` is NaN or below `0.1`.
    pub fn validate(&self, comm: Option<&UniverseComm>) -> Result<(), KError> {
        if !(0.0..=1.0).contains(&self.diagonal_pivot_threshold) {
            return Err(KError::InvalidInput(format!(
                "diagonal_pivot_threshold={} must be in [0,1]",
                self.diagonal_pivot_threshold
            )));
        }
        if self.panel_size == Some(0) {
            return Err(KError::InvalidInput("panel_size must be > 0".into()));
        }
        if self.enable_3d_factorization && self.process_grid_3d_depth == Some(0) {
            return Err(KError::InvalidInput("3D depth must be > 0".into()));
        }
        if let Some((r, c)) = self.process_grid {
            if r == 0 || c == 0 {
                return Err(KError::InvalidInput("process_grid dims must be > 0".into()));
            }
            if let Some(comm) = comm {
                let sz = comm.size();
                // `checked_mul` keeps an overflowing product from panicking
                // (debug) or wrapping around to a value that happens to match.
                if r.checked_mul(c) != Some(sz) {
                    return Err(KError::InvalidInput(format!(
                        "process_grid {r}x{c} does not match comm size {sz}"
                    )));
                }
            }
        }
        if self.memory_tradeoff_factor.is_nan() || self.memory_tradeoff_factor < 0.1 {
            return Err(KError::InvalidInput(
                "memory_tradeoff_factor must be >= 0.1".into(),
            ));
        }
        Ok(())
    }
}

/// Undirected adjacency-list graph used by the ordering algorithms.
#[derive(Debug, Clone)]
#[allow(dead_code)]
struct Graph {
    /// Number of vertices
    #[allow(dead_code)]
    n: usize,
    /// Adjacency list of each vertex, indexed by vertex id
    adj: Vec<Vec<usize>>,
}

#[allow(dead_code)]
impl Graph {
    /// Build the graph of the symmetrised sparsity pattern (A + A^T).
    ///
    /// Diagonal entries are skipped. Each adjacency list is returned sorted
    /// and without duplicates.
    fn from_matrix_pattern(matrix: &CsrMatrix<f64>) -> Self {
        let n = matrix.nrows();
        let mut adj = vec![Vec::new(); n];

        // Get matrix pattern
        let row_ptrs = matrix.row_ptr();
        let col_indices = matrix.col_idx();

        // Add edges from A
        for i in 0..n {
            for idx in row_ptrs[i]..row_ptrs[i + 1] {
                let j = col_indices[idx];
                if i != j {
                    // Skip diagonal
                    adj[i].push(j);
                }
            }
        }

        // Add edges from A^T (make symmetric)
        let mut transpose_edges = vec![Vec::new(); n];
        for i in 0..n {
            for &j in &adj[i] {
                transpose_edges[j].push(i);
            }
        }

        // Merge and sort adjacency lists
        for i in 0..n {
            adj[i].extend(&transpose_edges[i]);
            adj[i].sort_unstable();
            adj[i].dedup();
        }

        Self { n, adj }
    }

    /// Number of entries in the adjacency list of `v`.
    fn degree(&self, v: usize) -> usize {
        self.adj[v].len()
    }

    /// Adjacency list of `v`.
    fn neighbors(&self, v: usize) -> &[usize] {
        &self.adj[v]
    }

    /// Add the fill edges created by eliminating `v`.
    ///
    /// The not-yet-eliminated neighbours of `v` (per `eliminated`) are turned
    /// into a clique: every missing edge between two of them is inserted in
    /// both directions. The adjacency list of `v` itself and the entries that
    /// point to `v` are left in place; callers track elimination through the
    /// `eliminated` mask.
    ///
    /// Returns the inserted edges as directed pairs, two per undirected edge.
    fn eliminate_vertex(&mut self, v: usize, eliminated: &[bool]) -> Vec<(usize, usize)> {
        let mut new_edges = Vec::new();
        let neighbors: Vec<usize> = self.adj[v]
            .iter()
            .filter(|&&u| !eliminated[u])
            .copied()
            .collect();

        // Add clique edges between neighbors
        for i in 0..neighbors.len() {
            for j in i + 1..neighbors.len() {
                let u = neighbors[i];
                let w = neighbors[j];

                // Add edge if not already present
                if !self.adj[u].contains(&w) {
                    new_edges.push((u, w));
                    new_edges.push((w, u));
                }
            }
        }

        // Apply new edges, then restore sorted order once per modified list
        // (rather than re-sorting after every single insertion).
        for &(from, to) in &new_edges {
            self.adj[from].push(to);
        }
        let mut touched: Vec<usize> = new_edges.iter().map(|&(from, _)| from).collect();
        touched.sort_unstable();
        touched.dedup();
        for vertex in touched {
            self.adj[vertex].sort_unstable();
        }

        new_edges
    }
}

/// Elimination tree (in general a forest) over the columns `0..n` of a
/// symbolic factor.
///
/// A node without a parent is encoded by storing `n` (the number of nodes) in
/// `parent`.
#[derive(Debug, Clone)]
pub struct EliminationTree {
    /// Parent array (parent[i] = parent of node i, or n if root)
    pub parent: Vec<usize>,
    /// Children lists for each node, in increasing node order
    pub children: Vec<Vec<usize>>,
    /// Post-order traversal: every node appears after all of its descendants
    pub post_order: Vec<usize>,
}

impl EliminationTree {
    /// Create elimination tree from symbolic factorization.
    ///
    /// The parent of column `j` is the smallest row `i > j` (with `i < n`) for
    /// which `(i, j)` is a key of `l_pattern`; the boolean values are ignored.
    /// Keys outside the strictly lower triangle of the `n x n` pattern are
    /// skipped.
    ///
    /// The stored entries are scanned once (instead of probing every `(i, j)`
    /// pair), and the post-order is computed with an explicit stack so that
    /// deep trees (e.g. a chain of `n` nodes) cannot overflow the call stack.
    fn from_symbolic_factor(n: usize, l_pattern: &HashMap<(usize, usize), bool>) -> Self {
        let mut parent = vec![n; n]; // n means no parent (root)

        // parent[j] = min { i : i > j, (i, j) in pattern }
        for &(i, j) in l_pattern.keys() {
            // `j < i && i < n` implies `j < n`, so the index below is in range.
            if j < i && i < n && i < parent[j] {
                parent[j] = i;
            }
        }

        // Build children lists (ascending, because nodes are visited in order)
        let mut children = vec![Vec::new(); n];
        for (child, &par) in parent.iter().enumerate() {
            if par < n {
                children[par].push(child);
            }
        }

        // Iterative post-order traversal. Each stack frame is
        // (node, index of the next child to descend into).
        let mut post_order = Vec::with_capacity(n);
        let mut stack: Vec<(usize, usize)> = Vec::new();
        for root in (0..n).filter(|&v| parent[v] == n) {
            stack.push((root, 0));
            while let Some(frame) = stack.last_mut() {
                let (node, next_child) = *frame;
                match children[node].get(next_child) {
                    Some(&child) => {
                        frame.1 += 1;
                        stack.push((child, 0));
                    }
                    None => {
                        post_order.push(node);
                        stack.pop();
                    }
                }
            }
        }

        Self {
            parent,
            children,
            post_order,
        }
    }

    /// Get ancestors of a node in the elimination tree.
    ///
    /// Returns the chain parent, grandparent, ... up to and including the root;
    /// the result is empty when `node` is itself a root.
    ///
    /// # Panics
    /// Panics if `node` is not a valid node index.
    pub fn ancestors(&self, node: usize) -> Vec<usize> {
        let mut ancestors = Vec::new();
        let mut current = self.parent[node];

        while current < self.parent.len() {
            ancestors.push(current);
            current = self.parent[current];
        }

        ancestors
    }
}

/// Ordering algorithms implementation.
///
/// Unit struct used purely as a namespace: it has no fields and is never
/// constructed by the ordering routines themselves.
pub struct OrderingAlgorithms;

impl OrderingAlgorithms {
    /// Natural ordering: the identity permutation `[0, 1, ..., n - 1]`.
    pub fn natural_ordering(n: usize) -> Vec<usize> {
        let mut identity = Vec::with_capacity(n);
        identity.extend(0..n);
        identity
    }

    /// Approximate Minimum Degree (AMD) ordering.
    ///
    /// Greedy elimination on the graph built by `Graph::from_matrix_pattern`:
    /// each of the `n = matrix.nrows()` steps picks the not-yet-eliminated
    /// vertex with the fewest not-yet-eliminated neighbours (lowest index on
    /// ties), appends it to the permutation and adds the resulting fill edges
    /// to the graph. Degrees are recounted exactly at every step.
    pub fn amd_ordering(matrix: &CsrMatrix<f64>) -> Vec<usize> {
        let n = matrix.nrows();
        let mut graph = Graph::from_matrix_pattern(matrix);
        let mut eliminated = vec![false; n];
        let mut perm = Vec::new();

        // One vertex is eliminated per pass, so this runs exactly `n` times.
        while perm.len() < n {
            // `min_by_key` keeps the first minimum, i.e. the lowest index.
            let next = (0..n)
                .filter(|&v| !eliminated[v])
                .min_by_key(|&v| graph.adj[v].iter().filter(|&&u| !eliminated[u]).count())
                .unwrap_or(0);

            eliminated[next] = true;
            perm.push(next);
            graph.eliminate_vertex(next, &eliminated);
        }

        perm
    }

    /// Minimum Degree on A + A^T structure.
    ///
    /// Despite the `ata` in the name, the graph used here is the symmetrised
    /// pattern A + A^T produced by `build_ata_graph`, not A^T * A.
    ///
    /// The loop repeatedly selects the not-yet-eliminated vertex of minimum
    /// degree, connects its remaining neighbours into a clique and refreshes
    /// the degrees of the affected vertices.
    ///
    /// # Returns
    /// The elimination order. It is empty for a matrix with zero rows and may
    /// be shorter than `n` if vertex selection ever fails.
    pub fn mmd_ata_ordering(matrix: &CsrMatrix<f64>) -> Vec<usize> {
        let n = matrix.nrows();
        if n == 0 {
            return Vec::new();
        }

        // Only the adjacency lists of the A + A^T graph are needed, so take
        // ownership of them instead of cloning the whole structure.
        let mut adj_lists = Self::build_ata_graph(matrix).adj;

        // Initialize data structures for MMD
        let mut degree: Vec<usize> = adj_lists.iter().map(Vec::len).collect();
        let mut eliminated = vec![false; n];
        let mut ordering = Vec::with_capacity(n);

        // Main MMD elimination loop
        for step in 0..n {
            // Find minimum degree vertex among non-eliminated vertices
            let pivot = Self::select_minimum_degree_vertex(&degree, &eliminated);
            if pivot >= n {
                // Should not happen in a well-formed algorithm
                break;
            }

            ordering.push(pivot);
            eliminated[pivot] = true;

            #[cfg(feature = "logging")]
            if step % 1000 == 0 {
                log::debug!(
                    "MMD step {}/{}, pivot {} with degree {}",
                    step,
                    n,
                    pivot,
                    degree[pivot]
                );
            }

            // Suppress unused variable warning when logging is disabled
            #[cfg(not(feature = "logging"))]
            let _ = step;

            // Active neighbors of the pivot (the pivot itself is already flagged)
            let pivot_neighbors: Vec<usize> = adj_lists[pivot]
                .iter()
                .filter(|&&v| !eliminated[v])
                .copied()
                .collect();

            // Perform elimination: add edges between all pairs of neighbors
            let new_edges =
                Self::eliminate_vertex_mmd(pivot, &pivot_neighbors, &mut adj_lists, &eliminated);

            // Recompute degrees only for vertices touched by this step
            Self::update_degrees_after_elimination(
                pivot,
                &pivot_neighbors,
                &new_edges,
                &mut degree,
                &eliminated,
                &adj_lists,
            );
        }

        ordering
    }

    /// MMD ordering using A + A^T pattern with distributed assembly.
    ///
    /// With a single rank, a non-MPI communicator, or a build without the `mpi`
    /// feature, the pattern of `matrix` is used as-is. Otherwise every rank
    /// encodes the rows listed by `distribution`, the encodings are exchanged
    /// with two all-gather calls, and each rank rebuilds the gathered pattern
    /// before running the same elimination loop as `mmd_ata_ordering`.
    ///
    /// # Errors
    /// The visible code has no failing path; it always returns `Ok`.
    pub fn mmd_ata_ordering_distributed(
        matrix: &CsrMatrix<f64>,
        comm: &UniverseComm,
        distribution: &BlockCyclicDistribution,
    ) -> Result<Vec<usize>, KError> {
        // NOTE(review): `n` comes from the input matrix, while the gathered
        // pattern below has `distribution.global_rows` rows. The elimination
        // loop sizes `eliminated` with `n` but `degree` with the gathered
        // graph — presumably the two are always equal; confirm with callers.
        let n = matrix.nrows();

        // Step 1: Assemble the global sparsity pattern across all ranks.
        let global_pattern = if comm.size() <= 1 {
            matrix.clone()
        } else {
            #[cfg(feature = "mpi")]
            {
                match comm {
                    UniverseComm::Mpi(comm_impl) => {
                        let global_rows = distribution.global_rows;
                        let global_cols = distribution.global_cols;
                        let mut adjacency = vec![Vec::<usize>::new(); global_rows];

                        let rp = matrix.row_ptr();
                        let ci = matrix.col_idx();
                        let local_rows = distribution.local_rows;
                        // Flat encoding of the local rows:
                        // [global_row, nnz, col_0, ..., col_{nnz-1}] repeated.
                        let mut encoded: Vec<usize> = Vec::new();
                        for local_row in 0..local_rows {
                            let global_row = distribution.local_to_global_row(local_row);
                            // NOTE(review): the matrix is indexed by *global*
                            // row here, so it presumably stores rows under
                            // their global index — confirm.
                            let start = rp[global_row];
                            let end = rp[global_row + 1];
                            encoded.push(global_row);
                            encoded.push(end - start);
                            encoded.extend_from_slice(&ci[start..end]);
                        }

                        // First collective: exchange the encoded lengths.
                        // NOTE(review): `as i32` truncates silently if the
                        // encoding exceeds i32::MAX entries.
                        let local_len = encoded.len() as i32;
                        let mut lengths = vec![0i32; comm_impl.size];
                        comm_impl
                            .world
                            .all_gather_into(&local_len, &mut lengths[..]);
                        let lengths: Vec<usize> = lengths.iter().map(|&l| l as usize).collect();
                        let max_len = lengths.iter().copied().max().unwrap_or(0);

                        if max_len > 0 {
                            // Second collective: every rank contributes a
                            // buffer padded to `max_len` with usize::MAX; the
                            // padding is cut off again via `lengths` below.
                            let mut padded = encoded.clone();
                            padded.resize(max_len, usize::MAX);
                            let mut gathered = vec![0usize; max_len * comm_impl.size];
                            comm_impl
                                .world
                                .all_gather_into(&padded[..], &mut gathered[..]);

                            // Decode each rank's chunk back into per-row
                            // column lists; a truncated record ends decoding
                            // of that chunk.
                            for (rank_idx, &len) in lengths.iter().enumerate() {
                                let chunk = &gathered[rank_idx * max_len..rank_idx * max_len + len];
                                let mut idx = 0;
                                while idx < chunk.len() {
                                    if idx + 1 >= chunk.len() {
                                        break;
                                    }
                                    let global_row = chunk[idx];
                                    idx += 1;
                                    let nnz = chunk[idx];
                                    idx += 1;
                                    if idx + nnz > chunk.len() {
                                        break;
                                    }
                                    adjacency[global_row].extend_from_slice(&chunk[idx..idx + nnz]);
                                    idx += nnz;
                                }
                            }
                        }

                        // Rebuild a CSR pattern (sorted, de-duplicated columns;
                        // all values are zero because only the structure is used).
                        let mut row_ptr = Vec::with_capacity(global_rows + 1);
                        let mut col_idx = Vec::new();
                        row_ptr.push(0);
                        for cols in adjacency.iter_mut() {
                            cols.sort_unstable();
                            cols.dedup();
                            col_idx.extend_from_slice(cols);
                            row_ptr.push(col_idx.len());
                        }
                        let values = vec![0.0; col_idx.len()];
                        CsrMatrix::from_csr(global_rows, global_cols, row_ptr, col_idx, values)
                    }
                    // Any other communicator variant: use the matrix as given.
                    _ => matrix.clone(),
                }
            }
            #[cfg(not(feature = "mpi"))]
            {
                // Without MPI support there is nothing to gather.
                let _ = distribution;
                matrix.clone()
            }
        };

        // Step 2: Build A + A^T graph
        let graph = Self::build_ata_graph(&global_pattern);

        #[cfg(not(feature = "logging"))]
        let _ = comm;

        // Step 3: Run MMD on the global graph (replicated computation)
        let mut adj_lists = graph.adj;
        let mut ordering = Vec::with_capacity(n);
        let mut eliminated = vec![false; n];
        let mut degree: Vec<usize> = adj_lists.iter().map(|adj| adj.len()).collect();

        #[cfg(feature = "logging")]
        if comm.rank() == 0 {
            log::debug!("Starting distributed MMD ordering for matrix {n}x{n}");
        }

        // Main MMD elimination loop (same algorithm, but only log on rank 0)
        for step in 0..n {
            #[cfg(not(feature = "logging"))]
            let _ = step;
            let pivot = Self::select_minimum_degree_vertex(&degree, &eliminated);
            // usize::MAX (>= n) signals that no selectable vertex is left.
            if pivot >= n {
                break;
            }

            ordering.push(pivot);
            eliminated[pivot] = true;

            #[cfg(feature = "logging")]
            if step % 1000 == 0 && comm.rank() == 0 {
                log::debug!(
                    "Distributed MMD step {}/{}, pivot {} with degree {}",
                    step,
                    n,
                    pivot,
                    degree[pivot]
                );
            }

            // Neighbors of the pivot that are still active.
            let pivot_neighbors: Vec<usize> = adj_lists[pivot]
                .iter()
                .filter(|&&v| !eliminated[v])
                .copied()
                .collect();

            // Connect the active neighbors into a clique and drop the pivot
            // from their adjacency lists.
            let new_edges =
                Self::eliminate_vertex_mmd(pivot, &pivot_neighbors, &mut adj_lists, &eliminated);

            // Refresh degrees of the vertices touched by this step.
            Self::update_degrees_after_elimination(
                pivot,
                &pivot_neighbors,
                &new_edges,
                &mut degree,
                &eliminated,
                &adj_lists,
            );
        }

        Ok(ordering)
    }

    /// Build the symmetrised A + A^T adjacency graph for MMD ordering.
    ///
    /// Every stored off-diagonal entry `A[i, j]` with `j < nrows` contributes
    /// the undirected edge `{i, j}`. Diagonal entries and columns at or beyond
    /// `nrows` are ignored. Each adjacency list comes out sorted and free of
    /// duplicates.
    fn build_ata_graph(matrix: &CsrMatrix<f64>) -> Graph {
        let n = matrix.nrows();
        let row_ptrs = matrix.row_ptr();
        let col_indices = matrix.col_idx();

        // Ordered sets take care of both de-duplication and sorting.
        let mut neighbor_sets = vec![std::collections::BTreeSet::new(); n];
        for row in 0..n {
            for col in (row_ptrs[row]..row_ptrs[row + 1]).map(|idx| col_indices[idx]) {
                if col == row || col >= n {
                    continue;
                }
                // Record the edge in both directions in one go.
                neighbor_sets[row].insert(col);
                neighbor_sets[col].insert(row);
            }
        }

        let adj: Vec<Vec<usize>> = neighbor_sets
            .into_iter()
            .map(|neighbors| neighbors.into_iter().collect())
            .collect();

        Graph { n, adj }
    }

    /// Select vertex with minimum degree among non-eliminated vertices.
    ///
    /// Ties go to the lowest index. Returns `usize::MAX` when no vertex can be
    /// selected, i.e. when every vertex is eliminated (or the input is empty).
    /// A vertex whose recorded degree is `usize::MAX` is never selected.
    fn select_minimum_degree_vertex(degree: &[usize], eliminated: &[bool]) -> usize {
        degree
            .iter()
            .enumerate()
            .filter(|&(v, &deg)| !eliminated[v] && deg != usize::MAX)
            // `min_by_key` returns the first of several equal minima.
            .min_by_key(|&(_, &deg)| deg)
            .map_or(usize::MAX, |(v, _)| v)
    }

    /// Eliminate vertex in MMD and return new edges created.
    ///
    /// The non-eliminated entries of `neighbors` are pairwise connected (edges
    /// that already exist are left alone), then `pivot` is removed from each of
    /// their adjacency lists and those lists are re-sorted. The pivot's own
    /// list and the lists of eliminated neighbours are not modified.
    ///
    /// # Returns
    /// One `(u, v)` pair per inserted edge, with `u` preceding `v` in
    /// `neighbors`.
    fn eliminate_vertex_mmd(
        pivot: usize,
        neighbors: &[usize],
        adj_lists: &mut [Vec<usize>],
        eliminated: &[bool],
    ) -> Vec<(usize, usize)> {
        // Only neighbours that are still part of the graph take part.
        let active: Vec<usize> = neighbors
            .iter()
            .copied()
            .filter(|&u| !eliminated[u])
            .collect();

        // Turn the active neighbourhood into a clique, recording the fill.
        let mut fill = Vec::new();
        for (pos, &u) in active.iter().enumerate() {
            for &w in &active[pos + 1..] {
                if adj_lists[u].contains(&w) {
                    continue;
                }
                adj_lists[u].push(w);
                adj_lists[w].push(u);
                fill.push((u, w));
            }
        }

        // Detach the pivot and restore sorted order in one pass per list.
        for &u in &active {
            let list = &mut adj_lists[u];
            list.retain(|&x| x != pivot);
            list.sort_unstable();
        }

        fill
    }

    /// Update degrees after vertex elimination.
    ///
    /// Sets `degree[pivot]` to zero, then recounts the degree of every
    /// non-eliminated vertex that appears in `pivot_neighbors` or as an
    /// endpoint in `new_edges`. A recounted degree is the number of
    /// non-eliminated entries in that vertex's adjacency list. All other
    /// degrees are left untouched.
    fn update_degrees_after_elimination(
        pivot: usize,
        pivot_neighbors: &[usize],
        new_edges: &[(usize, usize)],
        degree: &mut [usize],
        eliminated: &[bool],
        adj_lists: &[Vec<usize>],
    ) {
        // The pivot no longer takes part in the elimination graph.
        degree[pivot] = 0;

        // Each affected vertex is recounted once, however often it is mentioned.
        let touched: std::collections::HashSet<usize> = pivot_neighbors
            .iter()
            .copied()
            .chain(new_edges.iter().flat_map(|&(u, v)| [u, v]))
            .filter(|&v| !eliminated[v])
            .collect();

        for v in touched {
            degree[v] = adj_lists[v].iter().filter(|&&u| !eliminated[u]).count();
        }
    }

    /// METIS ordering (placeholder).
    ///
    /// METIS is not called here: the result is whatever `amd_ordering` returns,
    /// wrapped in `Ok`. A warning is logged when the `logging` feature is
    /// enabled.
    pub fn metis_ordering(matrix: &CsrMatrix<f64>) -> Result<Vec<usize>, KError> {
        #[cfg(feature = "logging")]
        log::warn!("METIS ordering not implemented, falling back to AMD");

        let fallback = Self::amd_ordering(matrix);
        Ok(fallback)
    }

    /// Ordering entry point for the ParMETIS option.
    ///
    /// ParMETIS itself is not invoked. With more than one rank the call is
    /// forwarded to `mmd_ata_ordering_distributed`; otherwise the serial
    /// `mmd_ata_ordering` result is returned.
    pub fn parmetis_ordering(
        matrix: &CsrMatrix<f64>,
        comm: &UniverseComm,
        distribution: &BlockCyclicDistribution,
    ) -> Result<Vec<usize>, KError> {
        if comm.size() > 1 {
            #[cfg(feature = "logging")]
            if comm.rank() == 0 {
                log::debug!("Using distributed MMD ordering for ParMETIS path");
            }

            Self::mmd_ata_ordering_distributed(matrix, comm, distribution)
        } else {
            Ok(Self::mmd_ata_ordering(matrix))
        }
    }
}

/// Symbolic factorization implementation.
///
/// Unit struct used as a namespace for the symbolic-analysis routines; it
/// holds no state.
pub struct SymbolicFactorizer;

impl SymbolicFactorizer {
    /// Compute the symbolic pattern of the L factor.
    ///
    /// An elimination tree is built from the permuted matrix first. Then, for
    /// every column `k`, the set returned by `compute_reach_set` is filtered to
    /// rows `i >= k` and recorded as entries `(i, k)`. The diagonal `(k, k)` is
    /// always recorded.
    ///
    /// # Returns
    /// A map keyed by `(row, column)`; every stored value is `true`.
    ///
    /// # Errors
    /// Propagates any error from building the elimination tree.
    pub fn compute_symbolic_pattern(
        matrix: &CsrMatrix<f64>,
        col_perm: &[usize],
        row_perm: &[usize],
    ) -> Result<HashMap<(usize, usize), bool>, KError> {
        let n = matrix.nrows();
        let etree = Self::build_elimination_tree_from_matrix(matrix, col_perm, row_perm)?;

        let row_ptrs = matrix.row_ptr();
        let col_indices = matrix.col_idx();

        // Scratch buffers shared by all columns.
        let mut visited = vec![false; n];
        let mut reach_set = Vec::new();
        let mut l_pattern = HashMap::new();

        for k in 0..n {
            // Reset the scratch state left over from the previous column.
            for flag in visited.iter_mut() {
                *flag = false;
            }
            reach_set.clear();

            Self::compute_reach_set(
                k,
                &etree,
                row_ptrs,
                col_indices,
                row_perm,
                col_perm,
                &mut visited,
                &mut reach_set,
            );

            // Keep only the lower-triangular part of the reach set.
            l_pattern.extend(
                reach_set
                    .iter()
                    .filter(|&&i| i >= k)
                    .map(|&i| ((i, k), true)),
            );

            // The diagonal is part of L regardless of reachability.
            l_pattern.insert((k, k), true);
        }

        #[cfg(feature = "logging")]
        log::debug!(
            "Symbolic factorization computed {} nonzeros in L factor",
            l_pattern.len()
        );

        Ok(l_pattern)
    }

    /// Build elimination tree from matrix structure (improved algorithm)
    fn build_elimination_tree_from_matrix(
        matrix: &CsrMatrix<f64>,
        col_perm: &[usize],
        row_perm: &[usize],
    ) -> Result<Vec<usize>, KError> {
        let n = matrix.nrows();
        let mut parent = vec![n; n]; // n means no parent
        let mut ancestor = vec![0; n];

        let row_ptrs = matrix.row_ptr();
        let col_indices = matrix.col_idx();

        // Process columns in order
        for k in 0..n {
            parent[k] = n; // Initialize as root
            ancestor[k] = k;

            // Find permuted row index
            let perm_row = row_perm[k];
            let start = row_ptrs[perm_row];
            let end = row_ptrs[perm_row + 1];

            for idx in start..end {
                let orig_col = col_indices[idx];
                // Find permuted column position
                if let Some(j) = col_perm.iter().position(|&c| c == orig_col)
                    && j < k
                {
                    // Follow path compression for efficiency
                    let mut root = j;
                    while ancestor[root] != root && ancestor[root] < k {
                        root = ancestor[root];
                    }

                    // Set parent relationship
                    if parent[root] == n {
                        parent[root] = k;
                    }
                    ancestor[j] = k;
                }
            }
        }

        Ok(parent)
    }

    /// Compute reachable set using depth-first search on elimination tree
    fn compute_reach_set(
        col: usize,
        etree: &[usize],
        row_ptrs: &[usize],
        col_indices: &[usize],
        row_perm: &[usize],
        col_perm: &[usize],
        visited: &mut [bool],
        reach_set: &mut Vec<usize>,
    ) {
        let n = etree.len();

        // Start DFS from column col
        if col < n && !visited[col] {
            Self::dfs_reach(col, etree, visited, reach_set);
        }

        // Also follow original matrix structure for this column
        if col < row_perm.len() {
            let perm_row = row_perm[col];
            if perm_row < row_ptrs.len() - 1 {
                let start = row_ptrs[perm_row];
                let end = row_ptrs[perm_row + 1];

                for idx in start..end {
                    let orig_col = col_indices[idx];
                    if let Some(j) = col_perm.iter().position(|&c| c == orig_col)
                        && j < col
                        && !visited[j]
                    {
                        Self::dfs_reach(j, etree, visited, reach_set);
                    }
                }
            }
        }
    }

    /// Walk from `node` towards the root of the elimination tree.
    ///
    /// Each node on the way is flagged in `visited` and appended to
    /// `reach_set`. The walk stops at the first node that is already flagged,
    /// or once the parent entry is not a valid node (`>= etree.len()`, which
    /// is how roots are marked).
    ///
    /// Implemented as a loop rather than recursion, so a long parent chain
    /// cannot overflow the call stack.
    ///
    /// # Panics
    /// Panics if `visited` is shorter than `etree` and a node beyond its end
    /// is reached.
    fn dfs_reach(node: usize, etree: &[usize], visited: &mut [bool], reach_set: &mut Vec<usize>) {
        let n = etree.len();
        let mut current = node;

        while current < n && !visited[current] {
            visited[current] = true;
            reach_set.push(current);

            // Continue with the parent in the elimination tree
            current = etree[current];
        }
    }

    /// Build elimination tree from symbolic pattern.
    ///
    /// Public entry point that forwards unchanged to
    /// `EliminationTree::from_symbolic_factor`.
    ///
    /// * `n` - number of nodes (columns) of the tree
    /// * `l_pattern` - symbolic pattern of L, keyed by `(row, column)`
    pub fn build_elimination_tree(
        n: usize,
        l_pattern: &HashMap<(usize, usize), bool>,
    ) -> EliminationTree {
        EliminationTree::from_symbolic_factor(n, l_pattern)
    }
}

/// SuperLU_DIST data structure for managing factorization state
///
/// This structure encapsulates all the SuperLU_DIST internal data structures
/// needed for setup, factorization, and solve phases. It includes the process grid
/// and block-cyclic matrix distribution for distributed computation.
///
/// The symbolic factor, numeric factor and solve workspace are private
/// optional slots; they are read and replaced through the accessor and setter
/// methods of this type.
pub struct SuperLuDistData {
    /// Process grid for 2D distribution
    pub process_grid: ProcessGrid,
    /// Block-cyclic matrix distribution
    pub distribution: BlockCyclicDistribution,
    /// Communicator used for setup/factorization
    pub comm: UniverseComm,
    /// Factorization options
    pub options: SuperLuDistOptions,
    /// Whether factorization has been computed
    pub factored: bool,
    /// Local matrix data in CSR format (`None` when no matrix is stored)
    pub local_matrix: Option<CsrMatrix<f64>>,
    /// Symbolic factorization data (`None` until set)
    symbolic_factor: Option<SymbolicFactorization>,
    /// Numerical factorization data (`None` until set)
    numeric_factor: Option<NumericFactorization>,
    /// Solve workspace data (`None` until set)
    solve_workspace: Option<SolveWorkspace>,
}

/// Symbolic factorization data for SuperLU_DIST structures
///
/// Bundles the permutations, the elimination tree and the symbolic nonzero
/// patterns of the factors. Both patterns are maps keyed by `(row, column)`.
#[derive(Debug, Clone)]
pub struct SymbolicFactorization {
    /// Column permutation vector
    pub col_perm: Vec<usize>,
    /// Row permutation vector
    pub row_perm: Vec<usize>,
    /// Elimination tree
    pub etree: EliminationTree,
    /// Symbolic pattern of L factor, keyed by `(row, column)`
    pub l_pattern: HashMap<(usize, usize), bool>,
    /// Symbolic pattern of U factor (computed from L^T), keyed by `(row, column)`
    pub u_pattern: HashMap<(usize, usize), bool>,
}

/// Solve workspace for SuperLU_DIST solve structures
///
/// Groups the scratch storage used by the solve phase.
#[derive(Debug)]
pub struct SolveWorkspace {
    /// Advanced workspace management
    pub workspace: SuperLuDistWorkspace,
    /// Process-specific temporary vectors
    // NOTE(review): the `usize` key is presumably a process rank — confirm.
    pub process_vectors: HashMap<usize, Vec<f64>>,
    /// Global temporary vectors for collective operations, looked up by name
    pub global_vectors: HashMap<String, Vec<f64>>,
}

// Accessors and setters for the private factorization/workspace slots.
impl SuperLuDistData {
    /// Get symbolic factorization data.
    ///
    /// Returns `None` if no symbolic factorization has been stored.
    pub fn symbolic_factor(&self) -> Option<&SymbolicFactorization> {
        self.symbolic_factor.as_ref()
    }

    /// Get numeric factorization data.
    ///
    /// Returns `None` if no numeric factorization has been stored.
    pub fn numeric_factor(&self) -> Option<&NumericFactorization> {
        self.numeric_factor.as_ref()
    }

    /// Get solve workspace data.
    ///
    /// Returns `None` if no solve workspace has been stored.
    pub fn solve_workspace(&self) -> Option<&SolveWorkspace> {
        self.solve_workspace.as_ref()
    }

    /// Set symbolic factorization data, replacing any previously stored value.
    pub fn set_symbolic_factor(&mut self, factor: SymbolicFactorization) {
        self.symbolic_factor = Some(factor);
    }

    /// Set numeric factorization data, replacing any previously stored value.
    pub fn set_numeric_factor(&mut self, factor: NumericFactorization) {
        self.numeric_factor = Some(factor);
    }

    /// Set solve workspace data, replacing any previously stored value.
    pub fn set_solve_workspace(&mut self, workspace: SolveWorkspace) {
        self.solve_workspace = Some(workspace);
    }
}

/// Configuration knobs for iterative refinement.
///
/// `RefinementConfig::default()` yields 5 iterations, an absolute tolerance of
/// `1e-12`, a relative tolerance of `1e-6` and a minimum improvement factor of
/// `0.9`.
#[derive(Debug, Clone)]
pub struct RefinementConfig {
    /// Maximum number of refinement iterations (default: 5)
    pub max_iterations: usize,
    /// Convergence tolerance for residual norm (default: 1e-12)
    pub tolerance: f64,
    /// Relative tolerance, relative to initial residual (default: 1e-6)
    pub relative_tolerance: f64,
    /// Minimum improvement factor to continue refinement (default: 0.9)
    pub min_improvement_factor: f64,
}

impl Default for RefinementConfig {
    fn default() -> Self {
        const MAX_ITERATIONS: usize = 5;
        const ABSOLUTE_TOLERANCE: f64 = 1e-12;
        const RELATIVE_TOLERANCE: f64 = 1e-6;
        const MIN_IMPROVEMENT_FACTOR: f64 = 0.9;

        RefinementConfig {
            max_iterations: MAX_ITERATIONS,
            tolerance: ABSOLUTE_TOLERANCE,
            relative_tolerance: RELATIVE_TOLERANCE,
            min_improvement_factor: MIN_IMPROVEMENT_FACTOR,
        }
    }
}

/// Residual computation method for distributed matrices
///
/// Selects which of the residual formulas given on the variants is used.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ResidualMethod {
    /// Standard residual: `r = b - A*x`
    Standard,
    /// Scaled residual: `r = (b - A*x) / ||b||`
    Scaled,
    /// Component-wise scaled: `r_i = (b_i - (A*x)_i) / max(|b_i|, |(A*x)_i|)`
    ComponentWise,
}

/// Iterative refinement engine for SuperLU_DIST
///
/// Holds the refinement settings, three scratch vectors that
/// `setup_workspace` resizes to the problem size, and an optional slot for
/// the statistics of a previous run.
#[derive(Debug)]
pub struct RefinementEngine {
    /// Refinement configuration
    config: RefinementConfig,
    /// Residual computation method
    residual_method: ResidualMethod,
    /// Workspace for residual computation
    residual_workspace: Vec<f64>,
    /// Workspace for correction vector
    correction_workspace: Vec<f64>,
    /// Workspace for matrix-vector product
    matvec_workspace: Vec<f64>,
    /// Statistics from last refinement (`None` on a freshly created engine)
    last_stats: Option<RefinementStats>,
}

/// Statistics from iterative refinement
///
/// Plain record of one refinement run: iteration count, residual norms,
/// convergence outcome and elapsed time.
#[derive(Debug, Clone)]
pub struct RefinementStats {
    /// Number of refinement iterations performed
    pub iterations: usize,
    /// Initial residual norm
    pub initial_residual_norm: f64,
    /// Final residual norm
    pub final_residual_norm: f64,
    /// Residual norms at each iteration
    pub residual_history: Vec<f64>,
    /// Whether refinement converged
    pub converged: bool,
    /// Convergence reason
    pub convergence_reason: RefinementConvergence,
    /// Total time spent in refinement
    // NOTE(review): unit is presumably seconds — confirm where it is filled in.
    pub refinement_time: f64,
}

/// Convergence reasons for iterative refinement
///
/// Describes why a refinement run stopped; stored in
/// `RefinementStats::convergence_reason`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RefinementConvergence {
    /// Converged to absolute tolerance
    AbsoluteTolerance,
    /// Converged to relative tolerance
    RelativeTolerance,
    /// Reached maximum iterations
    MaxIterations,
    /// Stagnation detected (insufficient improvement)
    Stagnation,
    /// Divergence detected
    Divergence,
}

impl RefinementEngine {
    /// Create a new iterative refinement engine.
    ///
    /// The scratch buffers start empty and `last_stats` starts as `None`; the
    /// buffers are sized later by `setup_workspace`.
    pub fn new(config: RefinementConfig, residual_method: ResidualMethod) -> Self {
        Self {
            config,
            residual_method,
            residual_workspace: Vec::new(),
            correction_workspace: Vec::new(),
            matvec_workspace: Vec::new(),
            last_stats: None,
        }
    }

    /// Create an engine with `RefinementConfig::default()` and the
    /// `ResidualMethod::Standard` (unscaled) residual.
    pub fn with_defaults() -> Self {
        Self::new(RefinementConfig::default(), ResidualMethod::Standard)
    }

    /// Size all three scratch buffers to `n` entries.
    ///
    /// Newly added entries are zero; entries that already existed keep their
    /// previous values.
    pub fn setup_workspace(&mut self, n: usize) {
        let buffers = [
            &mut self.residual_workspace,
            &mut self.correction_workspace,
            &mut self.matvec_workspace,
        ];
        for buffer in buffers {
            buffer.resize(n, 0.0);
        }
    }

    /// Perform iterative refinement on the solution.
    ///
    /// Repeatedly computes the residual `r = b - A*x` (scaled according to the
    /// engine's `ResidualMethod`), solves `A * dx = r` with the existing
    /// factorization and applies `x += dx`, until one of the stopping criteria
    /// fires.
    ///
    /// * `matrix` - system matrix `A`
    /// * `rhs` - right-hand side `b`
    /// * `solution` - current solution `x`, updated in place
    /// * `superlu_data` - factorization data used for the correction solves
    /// * `comm` - communicator used for the norm reductions and solves
    ///
    /// Returns the statistics of this run; they are also cached and available
    /// through `last_stats`. If the initial residual already satisfies the
    /// absolute tolerance no correction is applied. When the loop stops because
    /// of stagnation or divergence, the last correction has already been added
    /// to `solution` and is not rolled back.
    ///
    /// # Errors
    ///
    /// Returns `KError::InvalidInput` when `solution` and `rhs` differ in length
    /// and propagates any error from the residual computation or the
    /// triangular solves.
    pub fn refine_solution(
        &mut self,
        matrix: &CsrMatrix<f64>,
        rhs: &[f64],
        solution: &mut [f64],
        superlu_data: &SuperLuDistData,
        comm: &UniverseComm,
    ) -> Result<RefinementStats, KError> {
        let n = solution.len();
        if n != rhs.len() {
            return Err(KError::InvalidInput(
                "Solution and RHS dimension mismatch".to_string(),
            ));
        }

        self.setup_workspace(n);

        // Move the scratch buffers out of `self` for the duration of the run so
        // they can be mutated while `self` is borrowed immutably; this reuses
        // the allocations instead of cloning them on every call.
        let mut residual = std::mem::take(&mut self.residual_workspace);
        let mut correction = std::mem::take(&mut self.correction_workspace);
        let mut matvec = std::mem::take(&mut self.matvec_workspace);

        let outcome = self.refine_with_buffers(
            matrix,
            rhs,
            solution,
            superlu_data,
            comm,
            &mut residual,
            &mut correction,
            &mut matvec,
        );

        // Hand the buffers back on success and on failure alike.
        self.residual_workspace = residual;
        self.correction_workspace = correction;
        self.matvec_workspace = matvec;

        let stats = outcome?;
        self.last_stats = Some(stats.clone());
        Ok(stats)
    }

    /// Refinement loop operating on caller-provided scratch buffers.
    ///
    /// All three buffers must have the same length as `solution`.
    // The scratch slices are passed separately so they can be borrowed
    // independently of `self`, which pushes the argument count past clippy's limit.
    #[allow(clippy::too_many_arguments)]
    fn refine_with_buffers(
        &self,
        matrix: &CsrMatrix<f64>,
        rhs: &[f64],
        solution: &mut [f64],
        superlu_data: &SuperLuDistData,
        comm: &UniverseComm,
        residual: &mut [f64],
        correction: &mut [f64],
        matvec: &mut [f64],
    ) -> Result<RefinementStats, KError> {
        let start_time = std::time::Instant::now();
        let mut stats = RefinementStats {
            iterations: 0,
            initial_residual_norm: 0.0,
            final_residual_norm: 0.0,
            residual_history: Vec::new(),
            converged: false,
            convergence_reason: RefinementConvergence::MaxIterations,
            refinement_time: 0.0,
        };

        // Initial residual: r = b - A*x
        Self::compute_residual_static(
            matrix,
            rhs,
            solution,
            residual,
            matvec,
            self.residual_method,
            comm,
        )?;
        let initial_norm = Self::compute_residual_norm_static(&*residual, comm)?;
        stats.initial_residual_norm = initial_norm;
        stats.final_residual_norm = initial_norm;
        stats.residual_history.push(initial_norm);

        // `check_convergence` deliberately rejects iteration 0, so the
        // "already accurate enough" case has to be tested directly here.
        if initial_norm <= self.config.tolerance {
            stats.converged = true;
            stats.convergence_reason = RefinementConvergence::AbsoluteTolerance;
            stats.refinement_time = start_time.elapsed().as_secs_f64();
            return Ok(stats);
        }

        // A NaN/infinite residual cannot be improved by refinement; every
        // comparison below would be false and the loop would only burn solves.
        if !initial_norm.is_finite() {
            stats.convergence_reason = RefinementConvergence::Divergence;
            stats.refinement_time = start_time.elapsed().as_secs_f64();
            return Ok(stats);
        }

        let mut previous_norm = initial_norm;

        for iter in 0..self.config.max_iterations {
            stats.iterations = iter + 1;

            // Solve correction equation: A * dx = r
            Self::solve_correction_static(&*residual, correction, superlu_data, comm)?;

            // Update solution: x += dx
            for (x, dx) in solution.iter_mut().zip(correction.iter()) {
                *x += *dx;
            }

            // New residual: r = b - A*x
            Self::compute_residual_static(
                matrix,
                rhs,
                solution,
                residual,
                matvec,
                self.residual_method,
                comm,
            )?;
            let norm = Self::compute_residual_norm_static(&*residual, comm)?;
            stats.final_residual_norm = norm;
            stats.residual_history.push(norm);

            if self.check_convergence(norm, initial_norm, iter + 1) {
                stats.converged = true;
                stats.convergence_reason = if norm <= self.config.tolerance {
                    RefinementConvergence::AbsoluteTolerance
                } else {
                    RefinementConvergence::RelativeTolerance
                };
                break;
            }

            // Divergence must be tested before stagnation: a residual that
            // blew up also has an improvement factor above the stagnation
            // threshold and would otherwise be misreported as stagnation.
            if !norm.is_finite() || norm > initial_norm * 10.0 {
                stats.convergence_reason = RefinementConvergence::Divergence;
                break;
            }

            // Stagnation: the residual shrank by less than the required factor.
            let improvement_factor = norm / previous_norm;
            if improvement_factor > self.config.min_improvement_factor {
                stats.convergence_reason = RefinementConvergence::Stagnation;
                break;
            }

            previous_norm = norm;
        }

        stats.refinement_time = start_time.elapsed().as_secs_f64();
        Ok(stats)
    }

    /// Compute the (optionally scaled) residual of `A*x = b`.
    ///
    /// On return `residual` holds `b - A*x`, post-processed according to
    /// `residual_method`, and `matvec_workspace` holds the product `A*x`.
    /// `residual` must have the same length as `rhs`, and `matvec_workspace`
    /// must be at least that long.
    fn compute_residual_static(
        matrix: &CsrMatrix<f64>,
        rhs: &[f64],
        solution: &[f64],
        residual: &mut [f64],
        matvec_workspace: &mut [f64],
        residual_method: ResidualMethod,
        comm: &UniverseComm,
    ) -> Result<(), KError> {
        // Start from r = b, then subtract the product A*x.
        residual.copy_from_slice(rhs);
        Self::distributed_sparse_matvec_static(matrix, solution, matvec_workspace, comm)?;

        let product = &matvec_workspace[..residual.len()];
        for (r, ax) in residual.iter_mut().zip(product) {
            *r -= *ax;
        }

        match residual_method {
            // Raw residual is used as is.
            ResidualMethod::Standard => {}
            // Divide by ||b||, unless the right-hand side has no positive norm.
            ResidualMethod::Scaled => {
                let rhs_norm = Self::compute_vector_norm_static(rhs, comm)?;
                if rhs_norm > 0.0 {
                    residual.iter_mut().for_each(|r| *r /= rhs_norm);
                }
            }
            // Divide each entry by the larger magnitude of b_i and (A*x)_i.
            ResidualMethod::ComponentWise => {
                for ((r, b), ax) in residual.iter_mut().zip(rhs).zip(product) {
                    let scale = f64::max(b.abs(), ax.abs());
                    if scale > 0.0 {
                        *r /= scale;
                    }
                }
            }
        }

        Ok(())
    }

    /// Perform distributed sparse matrix-vector product (static version)
    fn distributed_sparse_matvec_static(
        matrix: &CsrMatrix<f64>,
        x: &[f64],
        y: &mut [f64],
        _comm: &UniverseComm,
    ) -> Result<(), KError> {
        // For now, perform local matrix-vector product
        // In a full MPI implementation, this would handle communication
        // for distributed vector components

        let row_ptrs = matrix.row_ptr();
        let col_indices = matrix.col_idx();
        let values = matrix.values();

        y.fill(0.0);

        for i in 0..matrix.nrows() {
            for idx in row_ptrs[i]..row_ptrs[i + 1] {
                let j = col_indices[idx];
                let val = values[idx];
                y[i] += val * x[j];
            }
        }

        Ok(())
    }

    /// Solve the correction equation `A * dx = r` with the stored factors.
    ///
    /// Runs a forward solve (`L * y = r`) into a temporary vector followed by
    /// a backward solve (`U * dx = y`) into `correction`. `correction` is
    /// seeded with a copy of `residual` before the solves and must have the
    /// same length.
    ///
    /// # Errors
    ///
    /// Returns `KError::SolveError` when no numeric factorization is stored,
    /// and propagates errors from either triangular solve.
    fn solve_correction_static(
        residual: &[f64],
        correction: &mut [f64],
        superlu_data: &SuperLuDistData,
        comm: &UniverseComm,
    ) -> Result<(), KError> {
        let Some(numeric_factor) = superlu_data.numeric_factor.as_ref() else {
            return Err(KError::SolveError(
                "Numeric factorization not available".to_string(),
            ));
        };
        let distribution = &superlu_data.distribution;

        // Seed the output with the residual as an initial guess; the
        // intermediate vector starts from the same values.
        correction.copy_from_slice(residual);
        let mut intermediate = residual.to_vec();

        // Forward solve: L * y = r
        DistributedTriangularSolver::forward_solve(
            residual,
            &mut intermediate,
            numeric_factor,
            distribution,
            comm,
            CommPattern::PointToPoint,
            false,
            #[cfg(feature = "superlu3d")]
            None,
        )?;

        // Backward solve: U * dx = y
        DistributedTriangularSolver::backward_solve(
            &intermediate,
            correction,
            numeric_factor,
            distribution,
            comm,
            CommPattern::PointToPoint,
            false,
            #[cfg(feature = "superlu3d")]
            None,
        )?;

        Ok(())
    }

    /// Compute the 2-norm of a distributed residual vector (static version).
    ///
    /// Squares and sums the local entries, combines the partial result through
    /// `comm.all_reduce_f64` and takes the square root.
    // NOTE(review): assumes `all_reduce_f64` is a sum reduction across ranks - confirm.
    fn compute_residual_norm_static(residual: &[f64], comm: &UniverseComm) -> Result<f64, KError> {
        let local_sum_of_squares = residual.iter().map(|r| r * r).sum::<f64>();
        Ok(comm.all_reduce_f64(local_sum_of_squares).sqrt())
    }

    /// Compute the 2-norm of a distributed vector.
    ///
    /// Squares and sums the local entries, combines the partial result through
    /// `comm.all_reduce_f64` and takes the square root.
    // NOTE(review): assumes `all_reduce_f64` is a sum reduction across ranks - confirm.
    fn compute_vector_norm_static(vector: &[f64], comm: &UniverseComm) -> Result<f64, KError> {
        let local_sum_of_squares = vector.iter().map(|v| v * v).sum::<f64>();
        Ok(comm.all_reduce_f64(local_sum_of_squares).sqrt())
    }

    /// Decide whether the residual satisfies a tolerance criterion.
    ///
    /// Always returns `false` for `iteration == 0`. Otherwise returns `true`
    /// when `current_norm` is within the absolute tolerance, or when
    /// `initial_norm` is positive and the ratio `current_norm / initial_norm`
    /// is within the relative tolerance.
    fn check_convergence(&self, current_norm: f64, initial_norm: f64, iteration: usize) -> bool {
        // The pre-refinement residual never counts as converged here.
        if iteration == 0 {
            return false;
        }

        let cfg = &self.config;
        let meets_absolute = current_norm <= cfg.tolerance;
        let meets_relative =
            initial_norm > 0.0 && current_norm / initial_norm <= cfg.relative_tolerance;

        meets_absolute || meets_relative
    }

    /// Get statistics from the last refinement, or `None` if no refinement
    /// has stored statistics yet.
    pub fn last_stats(&self) -> Option<&RefinementStats> {
        self.last_stats.as_ref()
    }

    /// Get the current refinement configuration.
    pub fn config(&self) -> &RefinementConfig {
        &self.config
    }

    /// Replace the refinement configuration; takes effect on the next
    /// `refine_solution` call.
    pub fn set_config(&mut self, config: RefinementConfig) {
        self.config = config;
    }

    /// Replace the residual scaling method used by subsequent refinements.
    pub fn set_residual_method(&mut self, method: ResidualMethod) {
        self.residual_method = method;
    }
}

/// Memory pool for efficient allocation and reuse of vectors.
///
/// Returned vectors are cached per capacity, up to `max_vectors_per_size`
/// vectors per capacity and `memory_limit` bytes in total. Only vectors that
/// currently sit in the pool count towards the memory usage; a vector handed
/// out by one of the `get_*` methods is no longer accounted for.
#[derive(Debug)]
pub struct MemoryPool {
    /// Pool of available f64 vectors indexed by capacity
    f64_pools: std::collections::HashMap<usize, Vec<Vec<f64>>>,
    /// Pool of available usize vectors indexed by capacity
    usize_pools: std::collections::HashMap<usize, Vec<Vec<usize>>>,
    /// Maximum number of vectors to keep per size
    max_vectors_per_size: usize,
    /// Total memory limit in bytes
    memory_limit: usize,
    /// Bytes held by vectors currently stored in the pool
    current_memory_usage: usize,
}

impl MemoryPool {
    /// Create a new memory pool with specified limits.
    ///
    /// `memory_limit_mb` is converted to bytes; the conversion saturates
    /// instead of overflowing for very large limits.
    pub fn new(max_vectors_per_size: usize, memory_limit_mb: usize) -> Self {
        Self {
            f64_pools: std::collections::HashMap::new(),
            usize_pools: std::collections::HashMap::new(),
            max_vectors_per_size,
            memory_limit: memory_limit_mb.saturating_mul(1024 * 1024),
            current_memory_usage: 0,
        }
    }

    /// Get a zero-filled f64 vector of length `size`, reusing a pooled vector
    /// of matching capacity when one is available.
    pub fn get_f64_vector(&mut self, size: usize) -> Vec<f64> {
        match Self::pop_pooled(&mut self.f64_pools, size) {
            Some(mut vec) => {
                // The vector leaves the pool, so its bytes are released too.
                self.release(Self::footprint(&vec));
                vec.clear();
                vec.resize(size, 0.0);
                vec
            }
            None => vec![0.0; size],
        }
    }

    /// Return an f64 vector to the pool for reuse.
    ///
    /// The vector is dropped instead when storing it would exceed the memory
    /// limit or the per-size vector limit.
    pub fn return_f64_vector(&mut self, mut vec: Vec<f64>) {
        let bytes = Self::footprint(&vec);
        if !self.has_room_for(bytes) {
            return;
        }

        let pool = self.f64_pools.entry(vec.capacity()).or_default();
        if pool.len() < self.max_vectors_per_size {
            vec.clear();
            pool.push(vec);
            self.current_memory_usage += bytes;
        }
    }

    /// Get a zero-filled usize vector of length `size`, reusing a pooled
    /// vector of matching capacity when one is available.
    pub fn get_usize_vector(&mut self, size: usize) -> Vec<usize> {
        match Self::pop_pooled(&mut self.usize_pools, size) {
            Some(mut vec) => {
                self.release(Self::footprint(&vec));
                vec.clear();
                vec.resize(size, 0);
                vec
            }
            None => vec![0; size],
        }
    }

    /// Return a usize vector to the pool for reuse.
    ///
    /// The vector is dropped instead when storing it would exceed the memory
    /// limit or the per-size vector limit.
    pub fn return_usize_vector(&mut self, mut vec: Vec<usize>) {
        let bytes = Self::footprint(&vec);
        if !self.has_room_for(bytes) {
            return;
        }

        let pool = self.usize_pools.entry(vec.capacity()).or_default();
        if pool.len() < self.max_vectors_per_size {
            vec.clear();
            pool.push(vec);
            self.current_memory_usage += bytes;
        }
    }

    /// Clear all pools and reset memory usage
    pub fn clear(&mut self) {
        self.f64_pools.clear();
        self.usize_pools.clear();
        self.current_memory_usage = 0;
    }

    /// Get the number of bytes held by vectors currently stored in the pool
    pub fn memory_usage(&self) -> usize {
        self.current_memory_usage
    }

    /// Get memory usage statistics
    pub fn memory_stats(&self) -> MemoryStats {
        let f64_vectors: usize = self.f64_pools.values().map(|pool| pool.len()).sum();
        let usize_vectors: usize = self.usize_pools.values().map(|pool| pool.len()).sum();

        MemoryStats {
            total_memory_bytes: self.current_memory_usage,
            f64_vectors_pooled: f64_vectors,
            usize_vectors_pooled: usize_vectors,
            f64_pool_sizes: self.f64_pools.len(),
            usize_pool_sizes: self.usize_pools.len(),
        }
    }

    /// Take one vector of the given capacity class out of `pools`, dropping
    /// the class entry once it becomes empty so the map cannot accumulate
    /// empty buckets.
    fn pop_pooled<T>(
        pools: &mut std::collections::HashMap<usize, Vec<Vec<T>>>,
        size: usize,
    ) -> Option<Vec<T>> {
        let pool = pools.get_mut(&size)?;
        let vec = pool.pop();
        if pool.is_empty() {
            pools.remove(&size);
        }
        vec
    }

    /// Bytes occupied by the vector's allocation.
    fn footprint<T>(vec: &Vec<T>) -> usize {
        vec.capacity().saturating_mul(std::mem::size_of::<T>())
    }

    /// Whether a vector of `bytes` bytes may be stored without exceeding the
    /// memory limit (and pooling is enabled at all).
    fn has_room_for(&self, bytes: usize) -> bool {
        self.max_vectors_per_size > 0
            && self.current_memory_usage.saturating_add(bytes) <= self.memory_limit
    }

    /// Stop accounting for `bytes` bytes that left the pool.
    fn release(&mut self, bytes: usize) {
        self.current_memory_usage = self.current_memory_usage.saturating_sub(bytes);
    }
}

/// Memory usage statistics
#[derive(Debug, Clone)]
pub struct MemoryStats {
    /// Bytes held by vectors currently stored in the pool
    pub total_memory_bytes: usize,
    /// Number of f64 vectors in pools
    pub f64_vectors_pooled: usize,
    /// Number of usize vectors in pools
    pub usize_vectors_pooled: usize,
    /// Number of different f64 vector sizes
    pub f64_pool_sizes: usize,
    /// Number of different usize vector sizes
    pub usize_pool_sizes: usize,
}

/// Communication buffer management for distributed operations.
///
/// Keeps one send and one receive buffer per process plus a single local
/// buffer. Buffers are obtained from, and handed back to, an internal
/// `MemoryPool`.
#[derive(Debug)]
pub struct CommBufferManager {
    /// Send buffers for each process
    send_buffers: HashMap<usize, Vec<f64>>,
    /// Receive buffers for each process
    recv_buffers: HashMap<usize, Vec<f64>>,
    /// Buffer for local operations
    local_buffer: Vec<f64>,
    /// Maximum buffer size per process (in elements)
    max_buffer_size: usize,
    /// Memory pool for buffer allocation
    memory_pool: MemoryPool,
}

impl CommBufferManager {
    /// Create new communication buffer manager.
    ///
    /// The internal pool keeps at most 4 vectors per size and is limited to
    /// half of `memory_limit_mb`.
    pub fn new(max_buffer_size: usize, memory_limit_mb: usize) -> Self {
        Self {
            send_buffers: HashMap::new(),
            recv_buffers: HashMap::new(),
            local_buffer: Vec::new(),
            max_buffer_size,
            memory_pool: MemoryPool::new(4, memory_limit_mb / 2), // Half memory for buffers
        }
    }

    /// Get or create the send buffer for a process.
    ///
    /// The returned buffer has `min(size, max_buffer_size)` entries, so it may
    /// be shorter than requested. An existing buffer of that length is
    /// returned with its contents untouched; otherwise a zero-filled buffer
    /// replaces it and the old one goes back to the pool.
    pub fn get_send_buffer(&mut self, process: usize, size: usize) -> &mut Vec<f64> {
        let len = size.min(self.max_buffer_size);
        Self::acquire(&mut self.send_buffers, &mut self.memory_pool, process, len)
    }

    /// Get or create the receive buffer for a process.
    ///
    /// Same sizing and reuse rules as `get_send_buffer`.
    pub fn get_recv_buffer(&mut self, process: usize, size: usize) -> &mut Vec<f64> {
        let len = size.min(self.max_buffer_size);
        Self::acquire(&mut self.recv_buffers, &mut self.memory_pool, process, len)
    }

    /// Get the local buffer for temporary operations.
    ///
    /// Same sizing and reuse rules as `get_send_buffer`.
    pub fn get_local_buffer(&mut self, size: usize) -> &mut Vec<f64> {
        let len = size.min(self.max_buffer_size);
        if self.local_buffer.len() != len {
            Self::swap_via_pool(&mut self.local_buffer, &mut self.memory_pool, len);
        }
        &mut self.local_buffer
    }

    /// Clear all buffers, handing their allocations back to the pool
    pub fn clear_buffers(&mut self) {
        for (_, buffer) in self.send_buffers.drain() {
            self.memory_pool.return_f64_vector(buffer);
        }
        for (_, buffer) in self.recv_buffers.drain() {
            self.memory_pool.return_f64_vector(buffer);
        }
        if !self.local_buffer.is_empty() {
            let buffer = std::mem::take(&mut self.local_buffer);
            self.memory_pool.return_f64_vector(buffer);
        }
    }

    /// Get statistics of the internal memory pool.
    ///
    /// Buffers currently handed out for a process are not part of the pool
    /// and therefore not included.
    pub fn memory_stats(&self) -> MemoryStats {
        self.memory_pool.memory_stats()
    }

    /// Look up (or create) the buffer for `process` and make sure it has
    /// exactly `len` entries.
    fn acquire<'a>(
        buffers: &'a mut HashMap<usize, Vec<f64>>,
        pool: &mut MemoryPool,
        process: usize,
        len: usize,
    ) -> &'a mut Vec<f64> {
        let buffer = buffers
            .entry(process)
            .or_insert_with(|| pool.get_f64_vector(len));

        if buffer.len() != len {
            Self::swap_via_pool(buffer, pool, len);
        }

        buffer
    }

    /// Replace `buffer` with a zero-filled vector of `len` entries from the
    /// pool. The old allocation is returned to the pool first so it can be
    /// recycled rather than freed.
    fn swap_via_pool(buffer: &mut Vec<f64>, pool: &mut MemoryPool, len: usize) {
        let stale = std::mem::take(buffer);
        // An unallocated vector has nothing worth caching.
        if stale.capacity() > 0 {
            pool.return_f64_vector(stale);
        }
        *buffer = pool.get_f64_vector(len);
    }
}

/// Workspace for SuperLU_DIST solve operations.
///
/// Owns named temporary vectors, the communication buffers and a memory pool
/// that recycles vectors released by the workspace.
#[derive(Debug)]
pub struct SuperLuDistWorkspace {
    /// Temporary vectors for solve operations, keyed by name
    temp_vectors: HashMap<String, Vec<f64>>,
    /// Communication buffer manager
    comm_buffers: CommBufferManager,
    /// Memory pool for general allocations
    memory_pool: MemoryPool,
    /// Workspace configuration
    config: WorkspaceConfig,
    /// Registered vector sizes by name; used as a minimum length by
    /// `get_temp_vector` and as the keep-list of `cleanup_unused_vectors`
    vector_sizes: HashMap<String, usize>,
}

/// Configuration for workspace management
#[derive(Debug, Clone)]
pub struct WorkspaceConfig {
    /// Maximum memory limit in MB; split between the general pool and the
    /// communication buffers when the workspace is created
    pub memory_limit_mb: usize,
    /// Maximum vectors per size in memory pool
    pub max_vectors_per_size: usize,
    /// Maximum communication buffer size (in elements)
    pub max_comm_buffer_size: usize,
    /// Enable aggressive memory reuse: released temporary vectors are handed
    /// back to the pool instead of staying allocated under their name
    pub aggressive_reuse: bool,
    /// Preallocation strategy applied by `setup_for_problem`
    pub preallocation_strategy: PreallocationStrategy,
}

/// Strategy for preallocating workspace memory in `setup_for_problem`
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PreallocationStrategy {
    /// No preallocation
    None,
    /// Preallocate the matrix-sized vectors ("solution" and "residual")
    MatrixSize,
    /// Preallocate the local work vector and per-process send/receive buffers
    ProcessGrid,
    /// Preallocate the panel and block work vectors
    BlockSize,
    /// Full preallocation for maximum performance (everything listed above)
    Full,
}

/// Defaults: 512 MB limit, 8 pooled vectors per size, communication buffers of
/// up to 1M elements, aggressive reuse on, matrix-size preallocation.
impl Default for WorkspaceConfig {
    fn default() -> Self {
        Self {
            memory_limit_mb: 512, // 512 MB default
            max_vectors_per_size: 8,
            max_comm_buffer_size: 1024 * 1024, // 1M elements
            aggressive_reuse: true,
            preallocation_strategy: PreallocationStrategy::MatrixSize,
        }
    }
}

impl SuperLuDistWorkspace {
    /// Create a new workspace using `WorkspaceConfig::default()`.
    pub fn new() -> Self {
        Self::with_config(WorkspaceConfig::default())
    }

    /// Create a workspace with a custom configuration.
    ///
    /// Half of `memory_limit_mb` is given to the general memory pool and the
    /// other half is passed to the communication buffer manager. No vectors
    /// are allocated until `setup_for_problem` or `get_temp_vector` is called.
    pub fn with_config(config: WorkspaceConfig) -> Self {
        let half_limit_mb = config.memory_limit_mb / 2;

        Self {
            temp_vectors: HashMap::new(),
            comm_buffers: CommBufferManager::new(config.max_comm_buffer_size, half_limit_mb),
            memory_pool: MemoryPool::new(config.max_vectors_per_size, half_limit_mb),
            vector_sizes: HashMap::new(),
            config,
        }
    }

    /// Setup workspace for a specific matrix and process grid
    pub fn setup_for_problem(
        &mut self,
        matrix_size: usize,
        process_grid: &ProcessGrid,
        block_size: usize,
    ) -> Result<(), KError> {
        // Calculate typical vector sizes needed
        let (local_start, local_end) =
            contiguous_partition(matrix_size, process_grid.my_rank, process_grid.total_procs);
        let local_size = local_end.saturating_sub(local_start);
        let max_local_size = matrix_size.div_ceil(process_grid.total_procs);
        let panel_size = self.config.max_comm_buffer_size.min(block_size * 10);

        // Store sizes for later use
        self.vector_sizes
            .insert("solution".to_string(), matrix_size);
        self.vector_sizes
            .insert("residual".to_string(), matrix_size);
        self.vector_sizes
            .insert("local_work".to_string(), local_size);
        self.vector_sizes
            .insert("local_work_max".to_string(), max_local_size);
        self.vector_sizes
            .insert("panel_work".to_string(), panel_size);
        self.vector_sizes
            .insert("comm_buffer".to_string(), panel_size);

        // Preallocate based on strategy
        match self.config.preallocation_strategy {
            PreallocationStrategy::None => {
                // No preallocation
            }
            PreallocationStrategy::MatrixSize => {
                self.preallocate_vector("solution", matrix_size)?;
                self.preallocate_vector("residual", matrix_size)?;
            }
            PreallocationStrategy::ProcessGrid => {
                self.preallocate_vector("local_work", local_size)?;
                for p in 0..process_grid.total_procs {
                    self.comm_buffers.get_send_buffer(p, panel_size);
                    self.comm_buffers.get_recv_buffer(p, panel_size);
                }
            }
            PreallocationStrategy::BlockSize => {
                self.preallocate_vector("panel_work", panel_size)?;
                self.preallocate_vector("block_work", block_size)?;
            }
            PreallocationStrategy::Full => {
                // Preallocate everything
                self.preallocate_vector("solution", matrix_size)?;
                self.preallocate_vector("residual", matrix_size)?;
                self.preallocate_vector("local_work", local_size)?;
                self.preallocate_vector("panel_work", panel_size)?;
                self.preallocate_vector("block_work", block_size)?;
                for p in 0..process_grid.total_procs {
                    self.comm_buffers.get_send_buffer(p, panel_size);
                    self.comm_buffers.get_recv_buffer(p, panel_size);
                }
            }
        }

        Ok(())
    }

    /// Preallocate a zero-filled temporary vector of `size` entries under `name`.
    ///
    /// A vector already stored under that name is handed back to the pool
    /// first, so repeated setup calls recycle the allocation instead of
    /// dropping it. Currently always returns `Ok(())`.
    fn preallocate_vector(&mut self, name: &str, size: usize) -> Result<(), KError> {
        if let Some(previous) = self.temp_vectors.remove(name) {
            self.memory_pool.return_f64_vector(previous);
        }
        let vector = self.memory_pool.get_f64_vector(size);
        self.temp_vectors.insert(name.to_string(), vector);
        Ok(())
    }

    /// Get the named temporary vector, zero-filled and ready for use.
    ///
    /// The vector has at least `max(size, registered size for name)` entries.
    /// It can be longer: an existing vector keeps its length unless it is more
    /// than twice the needed length, in which case it is shrunk. Every entry
    /// of the returned vector is zero, regardless of previous contents.
    pub fn get_temp_vector(&mut self, name: &str, size: usize) -> &mut Vec<f64> {
        // Check if we have a cached size
        let registered_size = self.vector_sizes.get(name).copied().unwrap_or(size);
        let target_len = size.max(registered_size);

        let vector = self
            .temp_vectors
            .entry(name.to_string())
            .or_insert_with(|| self.memory_pool.get_f64_vector(target_len));

        let current_len = vector.len();
        if current_len < target_len {
            // Grow: clearing first zeroes the old prefix and avoids copying
            // stale data should the resize reallocate.
            vector.clear();
            vector.resize(target_len, 0.0);
        } else {
            if current_len > target_len.saturating_mul(2) {
                // Shrink if vector is more than 2x larger than needed
                vector.truncate(target_len);
                vector.shrink_to_fit();
            }
            // Wipe whatever the previous user left behind.
            vector.fill(0.0);
        }

        vector
    }

    /// Release temporary vectors whose name has no registered size.
    ///
    /// A vector counts as unused when `vector_sizes` holds no entry for its
    /// name; usage is not tracked over time. Released vectors are handed to
    /// the memory pool.
    pub fn cleanup_unused_vectors(&mut self) {
        let mut unregistered = Vec::new();
        for name in self.temp_vectors.keys() {
            if !self.vector_sizes.contains_key(name) {
                unregistered.push(name.clone());
            }
        }

        for name in &unregistered {
            if let Some(vector) = self.temp_vectors.remove(name) {
                self.memory_pool.return_f64_vector(vector);
            }
        }
    }

    /// Return a temporary vector (for aggressive reuse)
    pub fn return_temp_vector(&mut self, name: &str) {
        if self.config.aggressive_reuse
            && let Some(vector) = self.temp_vectors.remove(name)
        {
            self.memory_pool.return_f64_vector(vector);
        }
        // Otherwise keep the vector allocated for next use
    }

    /// Get mutable access to the communication buffer manager.
    pub fn get_comm_buffers(&mut self) -> &mut CommBufferManager {
        &mut self.comm_buffers
    }

    /// Release every temporary vector to the pool and clear all
    /// communication buffers. Registered vector sizes are kept.
    pub fn clear_temp_data(&mut self) {
        let released = self.temp_vectors.drain().map(|(_, vector)| vector);
        for vector in released {
            self.memory_pool.return_f64_vector(vector);
        }
        self.comm_buffers.clear_buffers();
    }

    /// Collect memory usage statistics for the workspace.
    ///
    /// Temporary-vector memory is computed from the vectors' capacities; pool
    /// and communication figures are the byte counts reported by the general
    /// pool and by the communication manager's pool.
    pub fn memory_stats(&self) -> WorkspaceMemoryStats {
        let element_bytes = std::mem::size_of::<f64>();
        let mut temp_vectors_memory: usize = 0;
        for vector in self.temp_vectors.values() {
            temp_vectors_memory += vector.capacity() * element_bytes;
        }

        let pool_stats = self.memory_pool.memory_stats();
        let comm_stats = self.comm_buffers.memory_stats();
        let pool_memory = pool_stats.total_memory_bytes;
        let comm_memory = comm_stats.total_memory_bytes;

        WorkspaceMemoryStats {
            temp_vectors_memory,
            pool_memory,
            comm_memory,
            total_memory: temp_vectors_memory + pool_memory + comm_memory,
            temp_vectors_count: self.temp_vectors.len(),
            pool_stats,
            comm_stats,
        }
    }

    /// Report whether total workspace memory exceeds the configured limit.
    pub fn needs_cleanup(&self) -> bool {
        let limit_bytes = self.config.memory_limit_mb * 1024 * 1024;
        self.memory_stats().total_memory > limit_bytes
    }

    /// Perform workspace cleanup.
    ///
    /// With `aggressive_reuse` enabled, all temporary vectors are moved into
    /// the memory pool (communication buffers stay untouched). Otherwise the
    /// temporary vectors are kept and only the memory pool is emptied.
    pub fn cleanup(&mut self) {
        if !self.config.aggressive_reuse {
            // Just clear the memory pools
            self.memory_pool.clear();
            return;
        }

        // Clear temporary vectors but keep communication buffers
        let released = self.temp_vectors.drain().map(|(_, vector)| vector);
        for vector in released {
            self.memory_pool.return_f64_vector(vector);
        }
    }

    /// Optimize workspace for current usage patterns.
    ///
    /// Drops the registered size of every name that has no live temporary
    /// vector, then trims excess capacity from the live vectors.
    pub fn optimize(&mut self) {
        // Registrations are matched by name. Matching registered sizes against
        // live capacities would drop the entry of a live vector whose capacity
        // drifted from its registered size, and keep entries of dead vectors
        // that happen to share a size with a live one.
        let live_vectors = &self.temp_vectors;
        self.vector_sizes
            .retain(|name, _| live_vectors.contains_key(name));

        // Trim excess capacity in temporary vectors
        for vector in self.temp_vectors.values_mut() {
            vector.shrink_to_fit();
        }
    }
}

/// Memory statistics for workspace, as produced by `SuperLuDistWorkspace::memory_stats`
#[derive(Debug, Clone)]
pub struct WorkspaceMemoryStats {
    /// Bytes allocated by temporary vectors (based on their capacity)
    pub temp_vectors_memory: usize,
    /// Bytes reported by the general memory pool
    pub pool_memory: usize,
    /// Bytes reported by the communication manager's memory pool
    pub comm_memory: usize,
    /// Sum of the three figures above
    pub total_memory: usize,
    /// Number of temporary vectors
    pub temp_vectors_count: usize,
    /// Memory pool statistics
    pub pool_stats: MemoryStats,
    /// Communication buffer statistics
    pub comm_stats: MemoryStats,
}

/// The default workspace is the one built by `SuperLuDistWorkspace::new`.
impl Default for SuperLuDistWorkspace {
    fn default() -> Self {
        Self::new()
    }
}

/// SuperLU_DIST distributed direct solver
pub struct SuperLuDistSolver {
    /// Solver options
    options: SuperLuDistOptions,
    /// Internal SuperLU_DIST data (None until first setup)
    data: Option<SuperLuDistData>,
    /// Iterative refinement engine; `None` unless a refinement configuration
    /// was supplied through the builder
    refinement_engine: Option<RefinementEngine>,
    /// Workspace configuration
    workspace_config: WorkspaceConfig,
}

/// Builder pattern for configuring SuperLU_DIST solver options.
///
/// Each setter consumes and returns the builder; `build` produces the solver.
pub struct SuperLuDistBuilder {
    /// Solver options being assembled
    options: SuperLuDistOptions,
    /// Workspace configuration being assembled
    workspace_config: WorkspaceConfig,
    /// Refinement configuration; a refinement engine is only created when set
    refinement_config: Option<RefinementConfig>,
    /// Residual method for the refinement engine (`Standard` when unset)
    residual_method: Option<ResidualMethod>,
}

impl SuperLuDistBuilder {
    /// Start from default solver options and workspace configuration, with no
    /// refinement engine configured.
    pub fn new() -> Self {
        Self {
            options: SuperLuDistOptions::default(),
            workspace_config: WorkspaceConfig::default(),
            refinement_config: None,
            residual_method: None,
        }
    }

    /// Diagonal pivot threshold; the value is clamped to the range [0, 1].
    pub fn diagonal_pivot_threshold(mut self, threshold: f64) -> Self {
        self.options.diagonal_pivot_threshold = threshold.clamp(0.0, 1.0);
        self
    }

    /// Column permutation strategy.
    pub fn column_permutation(mut self, perm: ColumnPermutation) -> Self {
        self.options.column_permutation = perm;
        self
    }

    /// Row permutation strategy.
    pub fn row_permutation(mut self, perm: RowPermutation) -> Self {
        self.options.row_permutation = perm;
        self
    }

    /// Iterative refinement strategy stored in the solver options.
    pub fn iterative_refinement(mut self, refine: IterativeRefinement) -> Self {
        self.options.iterative_refinement = refine;
        self
    }

    /// Print level for diagnostics.
    pub fn print_level(mut self, level: u8) -> Self {
        self.options.print_level = level;
        self
    }

    /// Whether tiny pivots are replaced.
    pub fn replace_tiny_pivots(mut self, enable: bool) -> Self {
        self.options.replace_tiny_pivots = enable;
        self
    }

    /// Whether static pivoting mode is used.
    pub fn static_pivoting(mut self, enable: bool) -> Self {
        self.options.static_pivoting = enable;
        self
    }

    /// Explicit process grid dimensions.
    pub fn process_grid(mut self, rows: usize, cols: usize) -> Self {
        self.options.process_grid = Some((rows, cols));
        self
    }

    /// Clear any explicit process grid so it is determined automatically.
    pub fn process_grid_auto(mut self) -> Self {
        self.options.process_grid = None;
        self
    }

    /// Panel size for local dense factorization.
    pub fn panel_size(mut self, size: usize) -> Self {
        self.options.panel_size = Some(size);
        self
    }

    /// Enable or disable 3D communication-avoiding factorization.
    ///
    /// Enabling requires a depth of at least 2; with a smaller or missing
    /// depth the builder falls back to 2D factorization and clears the depth.
    /// When disabling, the given depth is stored unchanged.
    pub fn enable_3d_factorization(mut self, enable: bool, depth: Option<usize>) -> Self {
        let d = depth.unwrap_or(0);
        let use_3d = enable && d >= 2;

        if enable && !use_3d {
            #[cfg(feature = "logging")]
            log::warn!(
                "process_grid_3d_depth={d} is too small, falling back to 2D factorization"
            );
        }

        self.options.enable_3d_factorization = use_3d;
        self.options.process_grid_3d_depth = match (enable, use_3d) {
            (false, _) => depth,
            (true, true) => Some(d),
            (true, false) => None,
        };
        self
    }

    /// Memory trade-off factor for the 3D algorithm; raised to 0.1 if smaller.
    pub fn memory_tradeoff_factor(mut self, factor: f64) -> Self {
        self.options.memory_tradeoff_factor = factor.max(0.1);
        self
    }

    /// Maximum number of concurrent panels; raised to 1 if zero.
    pub fn max_concurrent_panels(mut self, max_panels: usize) -> Self {
        self.options.max_concurrent_panels = max_panels.max(1);
        self
    }

    /// Whether panel updates run asynchronously.
    pub fn async_panel_updates(mut self, enable: bool) -> Self {
        self.options.async_panel_updates = enable;
        self
    }

    /// Workspace memory limit in MB.
    pub fn workspace_memory_limit(mut self, limit_mb: usize) -> Self {
        self.workspace_config.memory_limit_mb = limit_mb;
        self
    }

    /// Whether the workspace reuses memory aggressively.
    pub fn aggressive_memory_reuse(mut self, enable: bool) -> Self {
        self.workspace_config.aggressive_reuse = enable;
        self
    }

    /// Workspace preallocation strategy.
    pub fn preallocation_strategy(mut self, strategy: PreallocationStrategy) -> Self {
        self.workspace_config.preallocation_strategy = strategy;
        self
    }

    /// Refinement configuration; supplying one makes `build` create a
    /// refinement engine.
    pub fn refinement_config(mut self, config: RefinementConfig) -> Self {
        self.refinement_config = Some(config);
        self
    }

    /// Residual computation method for the refinement engine. Has no effect
    /// unless a refinement configuration is supplied as well.
    pub fn residual_method(mut self, method: ResidualMethod) -> Self {
        self.residual_method = Some(method);
        self
    }

    /// Build the SuperLU_DIST solver with the configured options.
    ///
    /// The options are validated, but the validation result is discarded.
    /// A refinement engine is attached only when a refinement configuration
    /// was supplied; its residual method defaults to `ResidualMethod::Standard`.
    pub fn build(self) -> SuperLuDistSolver {
        let Self {
            options,
            workspace_config,
            refinement_config,
            residual_method,
        } = self;

        let mut solver = SuperLuDistSolver {
            options,
            data: None,
            refinement_engine: None,
            workspace_config,
        };

        let _ = solver.options.validate(None);

        solver.refinement_engine = refinement_config.map(|config| {
            RefinementEngine::new(config, residual_method.unwrap_or(ResidualMethod::Standard))
        });

        solver
    }
}

impl Default for SuperLuDistBuilder {
    fn default() -> Self {
        Self::new()
    }
}

impl SuperLuDistSolver {
    /// Create a new SuperLU_DIST solver with default options
    pub fn new() -> Self {
        Self {
            options: SuperLuDistOptions::default(),
            data: None,
            refinement_engine: None,
            workspace_config: WorkspaceConfig::default(),
        }
    }

    /// Create a solver that uses the supplied `options`.
    ///
    /// Everything else starts empty: no factorization data, no refinement engine,
    /// and the default workspace configuration.
    pub fn with_options(options: SuperLuDistOptions) -> Self {
        let workspace_config = WorkspaceConfig::default();
        Self {
            workspace_config,
            refinement_engine: None,
            data: None,
            options,
        }
    }

    /// Store the diagonal pivot threshold, clamped into `[0.0, 1.0]`.
    pub fn set_diagonal_pivot_threshold(&mut self, threshold: f64) -> &mut Self {
        let clamped = threshold.clamp(0.0, 1.0);
        self.options.diagonal_pivot_threshold = clamped;
        self
    }

    /// Select the column permutation strategy used by symbolic factorization.
    pub fn set_column_permutation(&mut self, perm: ColumnPermutation) -> &mut Self {
        let options = &mut self.options;
        options.column_permutation = perm;
        self
    }

    /// Store the iterative refinement mode in the options.
    ///
    /// `solve_factored` skips refinement when this is `IterativeRefinement::NoRefine`.
    pub fn set_iterative_refinement(&mut self, refine: IterativeRefinement) -> &mut Self {
        let options = &mut self.options;
        options.iterative_refinement = refine;
        self
    }

    /// Store the diagnostic print level in the options.
    pub fn set_print_level(&mut self, level: u8) -> &mut Self {
        let options = &mut self.options;
        options.print_level = level;
        self
    }

    /// Request an explicit panel size.
    ///
    /// The value is stored as given; the factorization and workspace setup raise a
    /// zero to one when they read it.
    pub fn set_panel_size(&mut self, size: usize) -> &mut Self {
        let requested = Some(size);
        self.options.panel_size = requested;
        self
    }

    /// Enable or disable the 3D factorization option.
    ///
    /// * `enable == false`: the flag is cleared and `depth` is stored unchanged.
    /// * `enable == true` with a depth of at least 2: the flag is set and the depth stored.
    /// * `enable == true` with a missing depth or one below 2: the request falls back to
    ///   2D (flag cleared, depth reset to `None`); a warning is logged when the `logging`
    ///   feature is active.
    pub fn set_3d_factorization(&mut self, enable: bool, depth: Option<usize>) -> &mut Self {
        if !enable {
            self.options.enable_3d_factorization = false;
            self.options.process_grid_3d_depth = depth;
            return self;
        }

        // A missing depth is treated as 0, which is always rejected below.
        let d = depth.unwrap_or(0);
        if d >= 2 {
            self.options.enable_3d_factorization = true;
            self.options.process_grid_3d_depth = Some(d);
        } else {
            #[cfg(feature = "logging")]
            log::warn!(
                "process_grid_3d_depth={d} is too small, falling back to 2D factorization"
            );
            self.options.enable_3d_factorization = false;
            self.options.process_grid_3d_depth = None;
        }
        self
    }

    /// Store the memory trade-off factor; values below `0.1` are raised to `0.1`.
    pub fn set_memory_tradeoff(&mut self, factor: f64) -> &mut Self {
        let bounded = f64::max(factor, 0.1);
        self.options.memory_tradeoff_factor = bounded;
        self
    }

    /// Set the `max_concurrent_panels` option; a request of `0` is raised to `1`.
    pub fn set_max_concurrent_panels(&mut self, max_panels: usize) -> &mut Self {
        let limit = std::cmp::max(max_panels, 1);
        self.options.max_concurrent_panels = limit;
        self
    }

    /// Turn the `async_panel_updates` option on or off.
    pub fn set_async_panel_updates(&mut self, enable: bool) -> &mut Self {
        let options = &mut self.options;
        options.async_panel_updates = enable;
        self
    }

    /// Turn static pivoting on or off.
    ///
    /// When on, numerical factorization uses `PivotingStrategy::Static`.
    pub fn set_static_pivoting(&mut self, enable: bool) -> &mut Self {
        let options = &mut self.options;
        options.static_pivoting = enable;
        self
    }

    /// Select the row permutation strategy used by symbolic factorization.
    pub fn set_row_permutation(&mut self, perm: RowPermutation) -> &mut Self {
        let options = &mut self.options;
        options.row_permutation = perm;
        self
    }

    /// Turn tiny-pivot replacement on or off.
    ///
    /// When on (and static pivoting is off), numerical factorization uses
    /// `PivotingStrategy::ThresholdWithFallback`.
    pub fn set_replace_tiny_pivots(&mut self, enable: bool) -> &mut Self {
        let options = &mut self.options;
        options.replace_tiny_pivots = enable;
        self
    }

    /// Request an explicit `rows` x `cols` process grid.
    ///
    /// The pair is stored as given; the grid itself is created during setup.
    pub fn set_process_grid(&mut self, rows: usize, cols: usize) -> &mut Self {
        let dims = (rows, cols);
        self.options.process_grid = Some(dims);
        self
    }

    /// Drop any explicit process grid so that setup chooses one via `ProcessGrid::new_auto`.
    pub fn set_process_grid_auto(&mut self) -> &mut Self {
        let options = &mut self.options;
        options.process_grid = None;
        self
    }

    /// Replace the solver's entire option set with `options`, consuming and returning
    /// the solver.
    pub fn with_complete_options(mut self, options: SuperLuDistOptions) -> Self {
        let replacement = options;
        self.options = replacement;
        self
    }

    /// Start a fluent configuration; returns a fresh [`SuperLuDistBuilder`].
    pub fn builder() -> SuperLuDistBuilder {
        let fresh = SuperLuDistBuilder::new();
        fresh
    }

    /// Get a reference to the current options
    pub fn options(&self) -> &SuperLuDistOptions {
        &self.options
    }

    /// Install a refinement engine built with its defaults, replacing any existing engine.
    pub fn enable_iterative_refinement(&mut self) -> &mut Self {
        let engine = RefinementEngine::with_defaults();
        self.refinement_engine = Some(engine);
        self
    }

    /// Configure iterative refinement with custom settings
    pub fn set_refinement_config(&mut self, config: RefinementConfig) -> &mut Self {
        if let Some(ref mut engine) = self.refinement_engine {
            engine.set_config(config);
        } else {
            self.refinement_engine = Some(RefinementEngine::new(config, ResidualMethod::Standard));
        }
        self
    }

    /// Set the residual computation method for iterative refinement
    pub fn set_residual_method(&mut self, method: ResidualMethod) -> &mut Self {
        if let Some(ref mut engine) = self.refinement_engine {
            engine.set_residual_method(method);
        } else {
            self.refinement_engine =
                Some(RefinementEngine::new(RefinementConfig::default(), method));
        }
        self
    }

    /// Remove the refinement engine, if any, so later solves run without refinement.
    pub fn disable_iterative_refinement(&mut self) -> &mut Self {
        // Dropping the taken value is equivalent to overwriting the field with `None`.
        drop(self.refinement_engine.take());
        self
    }

    /// Statistics reported by the refinement engine's `last_stats()`.
    ///
    /// Returns `None` when no engine is installed or the engine has no statistics.
    pub fn refinement_stats(&self) -> Option<&RefinementStats> {
        let engine = self.refinement_engine.as_ref()?;
        engine.last_stats()
    }

    /// Store `limit_mb` as the workspace configuration's `memory_limit_mb`.
    pub fn set_workspace_memory_limit(&mut self, limit_mb: usize) -> &mut Self {
        let workspace = &mut self.workspace_config;
        workspace.memory_limit_mb = limit_mb;
        self
    }

    /// Turn the workspace configuration's `aggressive_reuse` flag on or off.
    pub fn set_aggressive_memory_reuse(&mut self, enable: bool) -> &mut Self {
        let workspace = &mut self.workspace_config;
        workspace.aggressive_reuse = enable;
        self
    }

    /// Choose the [`PreallocationStrategy`] recorded in the workspace configuration.
    pub fn set_preallocation_strategy(&mut self, strategy: PreallocationStrategy) -> &mut Self {
        let workspace = &mut self.workspace_config;
        workspace.preallocation_strategy = strategy;
        self
    }

    /// Memory statistics of the solve workspace.
    ///
    /// Returns `None` until setup has created both the factorization data and its
    /// solve workspace.
    pub fn workspace_memory_stats(&self) -> Option<WorkspaceMemoryStats> {
        let data = self.data.as_ref()?;
        let solve_workspace = data.solve_workspace.as_ref()?;
        Some(solve_workspace.workspace.memory_stats())
    }

    /// Optimize workspace memory usage
    pub fn optimize_workspace(&mut self) -> Result<(), KError> {
        if let Some(ref mut data) = self.data
            && let Some(ref mut solve_workspace) = data.solve_workspace
        {
            solve_workspace.workspace.optimize();
        }
        Ok(())
    }

    /// Clear workspace temporary data to free memory
    pub fn clear_workspace_temp_data(&mut self) -> Result<(), KError> {
        if let Some(ref mut data) = self.data
            && let Some(ref mut solve_workspace) = data.solve_workspace
        {
            solve_workspace.workspace.clear_temp_data();
        }
        Ok(())
    }

    /// Report whether the solve workspace says it needs cleanup.
    ///
    /// Returns `false` when there is no factorization data or no solve workspace.
    pub fn workspace_needs_cleanup(&self) -> bool {
        self.data
            .as_ref()
            .and_then(|data| data.solve_workspace.as_ref())
            .is_some_and(|solve_workspace| solve_workspace.workspace.needs_cleanup())
    }

    /// Run the full setup pipeline for `matrix` and store the result in `self.data`.
    ///
    /// Steps, in order: validate the options against `comm`, create the process grid
    /// (explicit dimensions if configured, automatic otherwise), build the block-cyclic
    /// distribution, then perform symbolic factorization, numerical factorization and
    /// solve-workspace setup.
    ///
    /// # Errors
    /// Propagates any error from option validation, grid construction, either
    /// factorization phase, or workspace setup. On error `self.data` is left untouched.
    fn setup_factorization(
        &mut self,
        matrix: &CsrMatrix<f64>,
        comm: &UniverseComm,
    ) -> Result<(), KError> {
        #[cfg(feature = "logging")]
        let _guard = StageGuard::new("SuperLuDistSetup");

        self.options.validate(Some(comm))?;

        let process_grid = match self.options.process_grid {
            Some((prows, pcols)) => ProcessGrid::new_with_dims(comm, prows, pcols)?,
            None => ProcessGrid::new_auto(comm)?,
        };

        // One block per grid row/column, never smaller than a single entry.
        let row_block_size = matrix.nrows().div_ceil(process_grid.prows).max(1);
        let col_block_size = matrix.ncols().div_ceil(process_grid.pcols).max(1);
        let distribution = BlockCyclicDistribution::new(
            process_grid.clone(),
            matrix.nrows(),
            matrix.ncols(),
            row_block_size,
            col_block_size,
        );

        // The full matrix is kept as this rank's "local" matrix.
        let mut slu_data = SuperLuDistData {
            process_grid,
            distribution,
            comm: comm.clone(),
            options: self.options.clone(),
            factored: false,
            local_matrix: Some(matrix.clone()),
            symbolic_factor: None,
            numeric_factor: None,
            solve_workspace: None,
        };

        // Symbolic phase
        #[cfg(feature = "logging")]
        let _symbolic_guard = StageGuard::new("SuperLuDistSymbolic");

        slu_data.symbolic_factor = Some(self.symbolic_factorization(&slu_data, comm)?);

        // Numeric phase
        #[cfg(feature = "logging")]
        let _numeric_guard = StageGuard::new("SuperLuDistNumeric");

        slu_data.numeric_factor = Some(self.numerical_factorization(&slu_data, comm)?);

        // Solve workspace
        slu_data.solve_workspace = Some(self.setup_solve_workspace(&slu_data)?);

        slu_data.factored = true;
        self.data = Some(slu_data);

        Ok(())
    }

    /// Order rows by decreasing diagonal magnitude.
    ///
    /// Each rank scans only the rows in its `contiguous_partition` slice and records
    /// the largest `|a_ii|` found per row (0 when the row has no stored diagonal).
    /// The per-rank vectors are then merged with `allreduce_sum_slice`. The returned
    /// vector lists row indices sorted by descending diagonal magnitude; ties and
    /// incomparable values fall back to ascending row index.
    fn large_diag_row_permutation(
        &self,
        matrix: &CsrMatrix<f64>,
        comm: &UniverseComm,
    ) -> Vec<usize> {
        let n = matrix.nrows();
        let (first_row, end_row) = contiguous_partition(n, comm.rank(), comm.size());
        let row_ptr = matrix.row_ptr();
        let col_idx = matrix.col_idx();
        let values = matrix.values();

        let mut diag = vec![0.0f64; n];
        for row in first_row..end_row {
            diag[row] = (row_ptr[row]..row_ptr[row + 1])
                .filter(|&k| col_idx[k] == row)
                .map(|k| values[k].abs())
                .fold(0.0, |best, v| if v > best { v } else { best });
        }

        comm.allreduce_sum_slice(&mut diag);

        let mut perm = Vec::from_iter(0..n);
        perm.sort_by(|&a, &b| match diag[b].partial_cmp(&diag[a]) {
            Some(Ordering::Equal) | None => a.cmp(&b),
            Some(order) => order,
        });
        perm
    }

    /// Compute row and column scaling factors as reciprocals of the largest
    /// absolute entry in each row and column.
    ///
    /// Each rank scans only the rows in its `contiguous_partition` slice. Row maxima
    /// are merged with `allreduce_sum_slice` (every row is filled by exactly one rank,
    /// the others contribute 0); column maxima are merged with a MAX reduction.
    /// A row or column whose maximum is not positive gets a factor of `1.0`.
    ///
    /// The column buffer is sized by `matrix.ncols()` so that rectangular matrices
    /// with more columns than rows do not index out of bounds.
    ///
    /// # Returns
    /// `(row_scale, col_scale)` with lengths `nrows` and `ncols` respectively.
    ///
    /// # Errors
    /// Propagates a failure of the MAX reduction.
    fn compute_scaling_factors(
        &self,
        matrix: &CsrMatrix<f64>,
        comm: &UniverseComm,
    ) -> Result<(Vec<f64>, Vec<f64>), KError> {
        let n = matrix.nrows();
        let mut row_max = vec![0.0; n];
        // Indexed by column, so it must be as long as the column count.
        let mut col_max = vec![0.0; matrix.ncols()];
        let (row_start, row_end) = contiguous_partition(n, comm.rank(), comm.size());
        let rp = matrix.row_ptr();
        let cj = matrix.col_idx();
        let vv = matrix.values();

        for i in row_start..row_end {
            let mut local_row_max = 0.0;
            for idx in rp[i]..rp[i + 1] {
                let j = cj[idx];
                let val = vv[idx].abs();
                if val > local_row_max {
                    local_row_max = val;
                }
                if val > col_max[j] {
                    col_max[j] = val;
                }
            }
            row_max[i] = local_row_max;
        }

        comm.allreduce_sum_slice(&mut row_max);
        self.allreduce_max_slice(comm, &mut col_max)?;

        // Zero (or otherwise non-positive) maxima keep a neutral scale of 1.
        let reciprocal = |v: f64| if v > 0.0 { 1.0 / v } else { 1.0 };
        let row_scale = row_max.into_iter().map(reciprocal).collect();
        let col_scale = col_max.into_iter().map(reciprocal).collect();

        Ok((row_scale, col_scale))
    }

    /// Element-wise MAX all-reduce of `data` across the communicator, in place.
    ///
    /// With a single rank this is a no-op. With the `mpi` feature and an MPI-backed
    /// communicator the reduction is done with `MPI_Allreduce`. In every other case
    /// `data` is returned unchanged.
    ///
    /// # Errors
    /// Returns `KError::SolveError` if the slice is too long to be expressed as an
    /// MPI element count, or if `MPI_Allreduce` reports a non-zero return code.
    fn allreduce_max_slice(
        &self,
        comm: &UniverseComm,
        data: &mut [f64],
    ) -> Result<(), KError> {
        if comm.size() <= 1 {
            return Ok(());
        }

        #[cfg(feature = "mpi")]
        if let Some(world) = comm.as_mpi() {
            // MPI counts are C ints; refuse to truncate instead of reducing a prefix.
            let count = i32::try_from(data.len()).map_err(|_| {
                KError::SolveError(format!(
                    "MPI_Allreduce (MAX): slice length {} exceeds the MPI count range",
                    data.len()
                ))
            })?;
            let mut recv = vec![0.0; data.len()];
            // SAFETY: `data` and `recv` are distinct, live buffers that each hold
            // exactly `count` f64 values for the duration of the call. The
            // communicator handle comes from `world.as_raw()` and is assumed valid.
            let rc = unsafe {
                mpi::ffi::MPI_Allreduce(
                    data.as_ptr() as *const c_void,
                    recv.as_mut_ptr() as *mut c_void,
                    count,
                    mpi::ffi::RSMPI_DOUBLE,
                    mpi::ffi::RSMPI_MAX,
                    world.as_raw(),
                )
            };
            if rc != 0 {
                return Err(KError::SolveError(format!(
                    "MPI_Allreduce (MAX) failed: {rc}"
                )));
            }
            data.copy_from_slice(&recv);
            return Ok(());
        }

        Ok(())
    }

    /// Extract this rank's portion of `global_matrix` under a block-cyclic distribution.
    ///
    /// Every stored entry `(i, j)` whose owner equals this rank is translated to local
    /// row/column indices and appended to its local row, preserving the order in which
    /// entries are encountered. The result is assembled into a
    /// `local_rows x local_cols` CSR matrix and checked with `validate_local_csr`.
    ///
    /// # Panics
    /// Panics if an owned entry has no local row or column index.
    ///
    /// # Errors
    /// Propagates a failure of `validate_local_csr`.
    #[allow(dead_code)]
    fn distribute_matrix(
        &self,
        global_matrix: &CsrMatrix<f64>,
        distribution: &BlockCyclicDistribution,
    ) -> Result<CsrMatrix<f64>, KError> {
        let row_ptr = global_matrix.row_ptr();
        let col_idx = global_matrix.col_idx();
        let values = global_matrix.values();

        let local_rows = distribution.local_rows;
        let local_cols = distribution.local_cols;

        // One bucket of (local column, value) pairs per local row.
        let mut buckets = vec![Vec::new(); local_rows];
        for global_row in 0..global_matrix.nrows() {
            for p in row_ptr[global_row]..row_ptr[global_row + 1] {
                let global_col = col_idx[p];
                if distribution.owner_of(global_row, global_col) != distribution.grid.my_rank {
                    continue;
                }
                let li = distribution.local_row_from_global(global_row).unwrap();
                let lj = distribution.local_col_from_global(global_col).unwrap();
                buckets[li].push((lj, values[p]));
            }
        }

        // Flatten the buckets into CSR arrays; each row pointer is the running entry count.
        let mut local_row_ptrs = Vec::with_capacity(local_rows + 1);
        local_row_ptrs.push(0);
        let mut local_col_indices = Vec::new();
        let mut local_values = Vec::new();
        for bucket in &buckets {
            for &(col, val) in bucket {
                local_col_indices.push(col);
                local_values.push(val);
            }
            local_row_ptrs.push(local_col_indices.len());
        }

        let local_matrix = CsrMatrix::from_csr(
            local_rows,
            local_cols,
            local_row_ptrs,
            local_col_indices,
            local_values,
        );
        validate_local_csr(&local_matrix)?;
        Ok(local_matrix)
    }

    /// Symbolic factorization: choose the permutations and compute the fill pattern.
    ///
    /// The column permutation follows `options.column_permutation` (the MMD variant
    /// switches to its distributed form when more than one rank is present), and the
    /// row permutation follows `options.row_permutation`. The `User` variants of both
    /// currently fall back to the natural ordering. The L pattern comes from
    /// `SymbolicFactorizer::compute_symbolic_pattern`, the U pattern is its transpose,
    /// and the elimination tree is built from the L pattern.
    ///
    /// # Panics
    /// Panics if the stored matrix dimensions disagree with the distribution's global
    /// dimensions. In debug builds it also panics if the L pattern lacks a diagonal entry.
    ///
    /// # Errors
    /// Returns `KError::SolveError` when `data` holds no matrix, and propagates errors
    /// from the ordering algorithms and the symbolic pattern computation.
    fn symbolic_factorization(
        &self,
        data: &SuperLuDistData,
        comm: &UniverseComm,
    ) -> Result<SymbolicFactorization, KError> {
        #[cfg(feature = "logging")]
        let _guard = StageGuard::new("SymbolicFactorization");

        let n = data.distribution.global_rows;
        let Some(matrix) = data.local_matrix.as_ref() else {
            return Err(KError::SolveError("No local/global matrix".to_string()));
        };
        assert_eq!(matrix.nrows(), data.distribution.global_rows);
        assert_eq!(matrix.ncols(), data.distribution.global_cols);

        #[cfg(feature = "logging")]
        log::debug!(
            "Symbolic factorization: n={}, nnz={}, col_perm={:?}",
            n,
            matrix.nnz(),
            self.options.column_permutation
        );

        // Column ordering. A user-provided permutation is not wired in yet, so `User`
        // shares the natural ordering.
        let col_perm = match self.options.column_permutation {
            ColumnPermutation::Natural | ColumnPermutation::User => {
                OrderingAlgorithms::natural_ordering(n)
            }
            ColumnPermutation::MmdAta if comm.size() > 1 => {
                OrderingAlgorithms::mmd_ata_ordering_distributed(matrix, comm, &data.distribution)?
            }
            ColumnPermutation::MmdAta => OrderingAlgorithms::mmd_ata_ordering(matrix),
            ColumnPermutation::Metis => OrderingAlgorithms::metis_ordering(matrix)?,
            ColumnPermutation::ParMetis => {
                OrderingAlgorithms::parmetis_ordering(matrix, comm, &data.distribution)?
            }
        };

        // Row ordering; `User` likewise falls back to the natural ordering.
        let row_perm = match self.options.row_permutation {
            RowPermutation::LargeDiag => self.large_diag_row_permutation(matrix, comm),
            RowPermutation::NoRowPerm | RowPermutation::User => {
                OrderingAlgorithms::natural_ordering(n)
            }
        };

        #[cfg(feature = "logging")]
        log::debug!("Computing symbolic pattern with {n} x {n} matrix");

        let l_pattern = SymbolicFactorizer::compute_symbolic_pattern(matrix, &col_perm, &row_perm)?;
        for k in 0..n {
            debug_assert!(l_pattern.contains_key(&(k, k)));
        }

        // U pattern is the transpose of the L pattern (square matrices).
        let u_pattern: HashMap<_, _> = l_pattern
            .keys()
            .map(|&(i, j)| ((j, i), true))
            .collect();

        let etree = SymbolicFactorizer::build_elimination_tree(n, &l_pattern);

        #[cfg(feature = "logging")]
        log::debug!(
            "Symbolic factorization completed: {} L entries, {} U entries",
            l_pattern.len(),
            u_pattern.len()
        );

        Ok(SymbolicFactorization {
            col_perm,
            row_perm,
            etree,
            l_pattern,
            u_pattern,
        })
    }

    /// Panel-based numerical factorization of the matrix stored in `data`.
    ///
    /// The columns are processed in panels of `options.panel_size` (default 64, at
    /// least 1, at most `n`). For every panel, the rows with a stored entry in the
    /// panel's column range are gathered into a `Panel`, which is then LU-factorized
    /// with the configured pivot threshold and strategy. Panels without any entries
    /// are skipped. Afterwards the scaling factors, a memory estimate, summary
    /// statistics and the block dependency graphs of the symbolic L/U patterns are
    /// assembled.
    ///
    /// The block graphs are accumulated in ordered sets, so every adjacency list comes
    /// out sorted and duplicate-free without a linear membership scan per pattern entry.
    ///
    /// # Errors
    /// Returns `KError::SolveError` when `data` lacks the matrix or the symbolic
    /// factorization, and propagates errors from panel factorization and from the
    /// scaling-factor reduction.
    fn numerical_factorization(
        &self,
        data: &SuperLuDistData,
        comm: &UniverseComm,
    ) -> Result<NumericFactorization, KError> {
        #[cfg(feature = "logging")]
        let _guard = StageGuard::new("NumericalFactorization");

        let n = data.distribution.global_rows;
        let matrix = data
            .local_matrix
            .as_ref()
            .ok_or_else(|| KError::SolveError("No local matrix available".to_string()))?;

        let symbolic = data
            .symbolic_factor
            .as_ref()
            .ok_or_else(|| KError::SolveError("No symbolic factorization available".to_string()))?;

        // Static pivoting takes precedence over tiny-pivot replacement.
        let pivot_strategy = if self.options.static_pivoting {
            PivotingStrategy::Static
        } else if self.options.replace_tiny_pivots {
            PivotingStrategy::ThresholdWithFallback
        } else {
            PivotingStrategy::Dynamic
        };

        // Milestone 1: use a fixed panel size (bounded by matrix dimensions)
        let configured_panel_size = self.options.panel_size.unwrap_or(64).max(1);
        let panel_size = std::cmp::min(configured_panel_size, n.max(1));

        #[cfg(feature = "logging")]
        log::debug!(
            "Starting numerical factorization with panel size {panel_size}, pivot strategy {pivot_strategy:?}"
        );

        let mut panels = Vec::new();
        let mut panel_factors = Vec::new();
        let mut total_row_swaps = 0;
        let mut tiny_pivots_replaced_total = 0usize;
        let mut panels_with_replacements = 0usize;
        let mut max_pivot_growth = 1.0;

        // Process matrix in panels
        for panel_start in (0..n).step_by(panel_size) {
            let panel_end = std::cmp::min(panel_start + panel_size, n);

            // Extract rows that have nonzeros in this panel's columns
            let mut panel_rows = Vec::new();
            let rp = matrix.row_ptr();
            let cj = matrix.col_idx();
            for i in 0..matrix.nrows() {
                let row_start = rp[i];
                let row_end = rp[i + 1];

                for idx in row_start..row_end {
                    let col = cj[idx];
                    if col >= panel_start && col < panel_end {
                        panel_rows.push(i);
                        break;
                    }
                }
            }

            if panel_rows.is_empty() {
                continue; // No nonzeros in this panel
            }

            // Create panel from sparse matrix
            let mut panel = Panel::from_sparse_columns(matrix, panel_start, panel_end, panel_rows);

            // Factorize the panel
            match panel.factorize_lu(self.options.diagonal_pivot_threshold, pivot_strategy) {
                Ok(factor) => {
                    total_row_swaps += factor.num_row_swaps;
                    tiny_pivots_replaced_total += factor.tiny_pivots_replaced;
                    if factor.tiny_pivots_replaced > 0 {
                        panels_with_replacements += 1;
                    }

                    // Estimate pivot growth (simplified): track the largest diagonal
                    // magnitude seen in any factorized panel.
                    for i in 0..panel.width.min(panel.height) {
                        let diag_val = panel.data[i * panel.height + i].abs();
                        if diag_val > max_pivot_growth {
                            max_pivot_growth = diag_val;
                        }
                    }

                    panel_factors.push(factor);
                }
                Err(e) => {
                    #[cfg(feature = "logging")]
                    log::error!("Panel factorization failed: {e}");
                    return Err(e);
                }
            }

            panels.push(panel);
        }

        // Create global permutations (simplified - in real SuperLU_DIST this would be distributed)
        let global_row_perm = symbolic.row_perm.clone();
        let global_col_perm = symbolic.col_perm.clone();

        // Compute scaling factors using MPI-aware reductions
        let (row_scale, col_scale) = self.compute_scaling_factors(matrix, comm)?;

        // Estimate memory usage: panel storage plus the two permutation vectors.
        let memory_usage = panels
            .iter()
            .map(|p| p.data.len() * std::mem::size_of::<f64>())
            .sum::<usize>()
            + (global_row_perm.len() + global_col_perm.len()) * std::mem::size_of::<usize>();

        let factor_stats = FactorizationStats {
            num_panels: panels.len(),
            total_row_swaps,
            tiny_pivots_replaced: tiny_pivots_replaced_total,
            max_pivot_growth,
            condition_estimate: None, // Would require more sophisticated analysis
            memory_usage,
        };

        #[cfg(feature = "logging")]
        if self.options.enabled(1, 1) {
            log::info!(
                "Numerical factorization completed: {} panels, {} row swaps, {} panels replaced {} tiny pivots, max pivot growth {:.2e}",
                factor_stats.num_panels,
                factor_stats.total_row_swaps,
                panels_with_replacements,
                factor_stats.tiny_pivots_replaced,
                factor_stats.max_pivot_growth
            );
        }

        // Block dependency graphs: block size is min(64, n / 4), at least 1.
        let bs = std::cmp::min(64, n / 4).max(1);
        let nb = n.div_ceil(bs);
        let mut l_edges = vec![std::collections::BTreeSet::<usize>::new(); nb];
        let mut u_edges = vec![std::collections::BTreeSet::<usize>::new(); nb];
        // L: block row `bi` depends on strictly earlier block columns.
        for &(i, j) in symbolic.l_pattern.keys() {
            let (bi, bj) = (i / bs, j / bs);
            if bj < bi {
                l_edges[bi].insert(bj);
            }
        }
        // U: block row `bi` depends on strictly later block columns.
        for &(i, j) in symbolic.u_pattern.keys() {
            let (bi, bj) = (i / bs, j / bs);
            if bj > bi {
                u_edges[bi].insert(bj);
            }
        }
        // Ordered sets iterate in ascending order, yielding sorted adjacency lists.
        let lbg: Vec<Vec<usize>> = l_edges
            .into_iter()
            .map(|targets| targets.into_iter().collect())
            .collect();
        let ubg: Vec<Vec<usize>> = u_edges
            .into_iter()
            .map(|targets| targets.into_iter().collect())
            .collect();

        Ok(NumericFactorization {
            n,
            nnz: panels.iter().map(|p| p.data.len()).sum(),
            panels,
            panel_factors,
            global_row_perm,
            global_col_perm,
            row_scale,
            col_scale,
            pivot_strategy,
            pivot_threshold: self.options.diagonal_pivot_threshold,
            replaced_tiny_pivots: tiny_pivots_replaced_total > 0,
            factor_stats,
            l_block_graph: lbg,
            u_block_graph: ubg,
        })
    }

    /// Build the workspace used by the triangular solves.
    ///
    /// Starts from the solver's workspace configuration, sizes the communication
    /// buffer as `panel size x total processes` (at least 1024), and prepares the
    /// workspace for the problem size. It then allocates one zeroed vector per process
    /// rank (sized by that rank's local row count) and a set of named scratch vectors.
    /// The `"permutation_temp"` scratch vector has length `n` on rank 0 and is empty
    /// elsewhere.
    ///
    /// # Errors
    /// Propagates a failure of `setup_for_problem`.
    fn setup_solve_workspace(&self, data: &SuperLuDistData) -> Result<SolveWorkspace, KError> {
        let n = data.distribution.global_rows;
        let block_size = self.options.panel_size.unwrap_or(64).max(1);

        // Use configured workspace settings, with the comm buffer sized for this grid.
        let mut workspace_config = self.workspace_config.clone();
        workspace_config.max_comm_buffer_size =
            (block_size * data.process_grid.total_procs).max(1024);

        let mut workspace = SuperLuDistWorkspace::with_config(workspace_config);
        workspace.setup_for_problem(n, &data.process_grid, block_size)?;

        // One zeroed vector per process rank.
        let process_vectors: HashMap<_, _> = (0..data.process_grid.total_procs)
            .map(|p| (p, vec![0.0; data.distribution.local_rows_for_rank(p)]))
            .collect();

        // Named scratch vectors for collective operations.
        let local_rows = data.distribution.local_rows;
        let local_cols = data.distribution.local_cols;
        let permutation_len = if data.process_grid.my_rank == 0 { n } else { 0 };
        let global_vectors: HashMap<_, _> = [
            ("solution_temp", local_rows),
            ("rhs_temp", local_rows),
            ("column_accum_temp", local_cols),
            ("reduction_temp", local_rows),
            ("permutation_temp", permutation_len),
        ]
        .into_iter()
        .map(|(name, len)| (name.to_string(), vec![0.0; len]))
        .collect();

        Ok(SolveWorkspace {
            workspace,
            process_vectors,
            global_vectors,
        })
    }

    /// Distributed solve using the computed factorization
    ///
    /// This corresponds to the HYPRE `hypre_SLUDistSolve` function.
    ///
    /// Steps: permute the right-hand side with the row permutation, run the forward
    /// solve (`Ly = Pb`) and the backward solve (`Ux = y`), scatter the result through
    /// the column permutation, and finally run iterative refinement when it is enabled
    /// in the options and a refinement engine is installed.
    ///
    /// # Errors
    /// * `KError::SolveError` if the solver has no factorization data, the data is not
    ///   marked as factored, or the numeric factors / stored matrix are missing.
    /// * `KError::InvalidInput` if `b` and `x` differ in length (previously a panic).
    /// * Any error from the triangular solves or from refinement.
    fn solve_factored(
        &mut self,
        b: &Vec<f64>,
        x: &mut Vec<f64>,
        comm: &UniverseComm,
    ) -> Result<(), KError> {
        let data = self
            .data
            .as_ref()
            .ok_or_else(|| KError::SolveError("SuperLU_DIST not factored".to_string()))?;

        if !data.factored {
            return Err(KError::SolveError("Matrix not factored".to_string()));
        }

        #[cfg(feature = "logging")]
        let _guard = StageGuard::new("SuperLuDistSolve");

        // The slice copy below requires equal lengths; report a mismatch as an error
        // instead of panicking.
        if b.len() != x.len() {
            return Err(KError::InvalidInput(format!(
                "solve_factored(): rhs length {} does not match solution length {}",
                b.len(),
                x.len()
            )));
        }

        // Copy b to x as starting point
        x.copy_from_slice(b);

        let numeric_factor = data
            .numeric_factor
            .as_ref()
            .ok_or_else(|| KError::SolveError("No numeric factorization available".to_string()))?;

        // Determine communication pattern based on options
        let comm_pattern = CommPattern::PointToPoint;
        let overlap_comm = comm.size() > 1;

        #[cfg(feature = "logging")]
        if self.options.enabled(1, 1) {
            log::info!(
                "Starting distributed triangular solve with pattern {comm_pattern:?}, overlap_comm={overlap_comm}"
            );
        }

        // Phase 1: Forward substitution (solve Ly = Pb)
        // Apply row permutation to RHS; out-of-range permutation entries leave a zero.
        let row_perm = &numeric_factor.global_row_perm;
        let mut permuted_b = vec![0.0; b.len()];
        for (i, &perm_i) in row_perm.iter().enumerate() {
            if perm_i < b.len() {
                permuted_b[i] = b[perm_i];
            }
        }

        #[cfg(feature = "superlu3d")]
        let grid3d = if self.options.enable_3d_factorization {
            self.options
                .process_grid_3d_depth
                .and_then(|d| ProcessGrid3D::from_2d_with_depth(&data.process_grid, d).ok())
        } else {
            None
        };

        let mut y = vec![0.0; x.len()];
        DistributedTriangularSolver::forward_solve(
            &permuted_b,
            &mut y,
            numeric_factor,
            &data.distribution,
            comm,
            comm_pattern,
            overlap_comm,
            #[cfg(feature = "superlu3d")]
            grid3d.as_ref(),
        )?;

        // Phase 2: Backward substitution (solve Ux = y)
        DistributedTriangularSolver::backward_solve(
            &y,
            x,
            numeric_factor,
            &data.distribution,
            comm,
            comm_pattern,
            overlap_comm,
            #[cfg(feature = "superlu3d")]
            grid3d.as_ref(),
        )?;

        // Apply column permutation to solution: entry i goes to position col_perm[i].
        let col_perm = &numeric_factor.global_col_perm;
        let mut permuted_x = vec![0.0; x.len()];
        for (i, &perm_i) in col_perm.iter().enumerate() {
            if i < x.len() && perm_i < permuted_x.len() {
                permuted_x[perm_i] = x[i];
            }
        }
        x.copy_from_slice(&permuted_x);

        // Apply iterative refinement if requested and engine is available
        if !matches!(
            self.options.iterative_refinement,
            IterativeRefinement::NoRefine
        ) && let Some(ref mut engine) = self.refinement_engine
        {
            // Get the original matrix for residual computation
            let local_matrix = data.local_matrix.as_ref().ok_or_else(|| {
                KError::SolveError("Local matrix not available for refinement".to_string())
            })?;

            // Perform iterative refinement
            let _refinement_stats = engine.refine_solution(local_matrix, b, x, data, comm)?;

            #[cfg(feature = "logging")]
            if self.options.enabled(1, 1)
                && let Some(stats) = engine.last_stats()
            {
                log::info!(
                    "Iterative refinement completed: {} iterations, final residual: {:.2e}",
                    stats.iterations,
                    stats.final_residual_norm
                );
            }
        }

        #[cfg(feature = "logging")]
        if self.options.enabled(1, 1) {
            log::info!("Distributed triangular solve completed successfully");
        }

        Ok(())
    }

    /// Drop all factorization data and the refinement engine.
    ///
    /// Options and the workspace configuration are kept.
    pub fn destroy(&mut self) {
        drop(self.data.take());
        drop(self.refinement_engine.take());
    }

    /// Discard the numeric factors and mark the data as not factored.
    ///
    /// The rest of the setup data is retained. Does nothing when the solver has not
    /// been set up.
    pub fn clear_factors(&mut self) {
        let Some(data) = self.data.as_mut() else {
            return;
        };
        data.numeric_factor = None;
        data.factored = false;
    }

    /// `true` when setup data exists and is marked as factored.
    pub fn has_factors(&self) -> bool {
        self.data.as_ref().is_some_and(|data| data.factored)
    }
}

impl Default for SuperLuDistSolver {
    fn default() -> Self {
        Self::new()
    }
}

/// `Solver` interface for the distributed direct solver: `setup` prepares the
/// solver data, `factor` computes the numeric factors, `solve` applies them.
impl Solver<CsrMatrix<f64>> for SuperLuDistSolver {
    type Error = KError;

    /// Validates the options against `comm` and, if no solver data exists
    /// yet, runs `setup_factorization` for `a`.
    ///
    /// Existing solver data is left untouched, so repeated calls do not redo
    /// the setup.
    fn setup(&mut self, a: &CsrMatrix<f64>, comm: &UniverseComm) -> Result<(), Self::Error> {
        self.options.validate(Some(comm))?;
        if self.data.is_some() {
            return Ok(());
        }
        self.setup_factorization(a, comm)?;
        Ok(())
    }

    /// Runs the numerical factorization and stores the result.
    ///
    /// # Errors
    /// * `KError::SolveError` when `setup` has not been called.
    /// * `KError::InvalidInput` when the dimensions of `a` differ from the
    ///   global dimensions recorded in the distribution.
    /// * Any error returned by `numerical_factorization`.
    fn factor(&mut self, a: &CsrMatrix<f64>) -> Result<(), Self::Error> {
        let Some(state) = self.data.as_ref() else {
            return Err(KError::SolveError(
                "call setup(&A, &comm) before factor(&A)".into(),
            ));
        };

        // `a` is only used for this dimension check; the factorization itself
        // is driven by the stored solver data and its communicator.
        let dist = &state.distribution;
        if a.nrows() != dist.global_rows || a.ncols() != dist.global_cols {
            return Err(KError::InvalidInput(
                "factor(): matrix dims changed since setup".into(),
            ));
        }

        let numeric = self.numerical_factorization(state, &state.comm)?;

        if let Some(state) = self.data.as_mut() {
            state.numeric_factor = Some(numeric);
            state.factored = true;
        }
        Ok(())
    }

    /// Solves with the stored factors, writing the result into `x`.
    ///
    /// `b` and `x` must have the global row and column counts of the
    /// distribution, respectively. The work is done on private copies of both
    /// slices; `x` is overwritten only after `solve_factored` succeeds.
    ///
    /// The returned statistics are constant: they are built as
    /// `SolveStats::new(1, 0.0, ConvergedReason::ConvergedAtol)`.
    ///
    /// # Errors
    /// * `KError::SolveError` when no solver data exists.
    /// * `KError::InvalidInput` on a length mismatch of `b` or `x`.
    /// * Any error returned by `solve_factored`.
    fn solve(
        &mut self,
        b: &[f64],
        x: &mut [f64],
        comm: &UniverseComm,
    ) -> Result<SolveStats<f64>, Self::Error> {
        let Some(state) = self.data.as_ref() else {
            return Err(KError::SolveError(
                "solve() called before setup()/factor()".into(),
            ));
        };

        let dist = &state.distribution;
        if b.len() != dist.global_rows {
            return Err(KError::InvalidInput("RHS size mismatch".into()));
        }
        if x.len() != dist.global_cols {
            return Err(KError::InvalidInput("solution size mismatch".into()));
        }

        let mut solution = x.to_vec();
        let rhs = b.to_vec();
        self.solve_factored(&rhs, &mut solution, comm)?;
        x.copy_from_slice(&solution);

        Ok(SolveStats::new(1, 0.0, ConvergedReason::ConvergedAtol))
    }

    /// Reports whether a factorization is currently available for reuse.
    fn reuse_factorization(&self) -> bool {
        self.has_factors()
    }
}

/// Legacy `LinearSolver` adapter: performs setup, factorization and the
/// solve in a single call.
impl LinearSolver<CsrMatrix<f64>, Vec<f64>> for SuperLuDistSolver {
    type Error = KError;
    type Scalar = f64;

    /// Runs `setup`, `factor` and then the `Solver::solve` implementation.
    ///
    /// `pc`, `pc_side`, `monitors` and `_work` are accepted for interface
    /// compatibility and are not used. `factor` is invoked on every call.
    fn solve(
        &mut self,
        a: &CsrMatrix<f64>,
        pc: Option<
            &(dyn crate::preconditioner::legacy::Preconditioner<CsrMatrix<f64>, Vec<f64>> + '_),
        >,
        b: &Vec<f64>,
        x: &mut Vec<f64>,
        pc_side: crate::preconditioner::PcSide,
        comm: &crate::parallel::UniverseComm,
        monitors: Option<&[Box<MonitorCallback<Self::Scalar>>]>,
        _work: Option<&mut crate::context::ksp_context::Workspace>,
    ) -> Result<SolveStats<f64>, KError> {
        // Explicitly discard the arguments this solver does not use.
        let _ = (pc, pc_side, monitors);

        self.setup(a, comm)?;
        self.factor(a)?;

        let rhs = b.as_slice();
        let solution = x.as_mut_slice();
        <Self as Solver<CsrMatrix<f64>>>::solve(self, rhs, solution, comm)
    }
}

/// Convenience wrapper to perform a direct SuperLU_DIST solve.
///
/// Builds a fresh [`SuperLuDistSolver`], copies `b` and `x` into owned
/// vectors, runs the legacy `LinearSolver::solve` entry point without a
/// preconditioner, monitors or workspace, and copies the result back into
/// `x` on success. The solve statistics are discarded.
#[cfg(feature = "superlu_dist")]
pub fn solve(
    a: &CsrMatrix<f64>,
    b: &[f64],
    x: &mut [f64],
    comm: &UniverseComm,
) -> Result<(), KError> {
    let mut direct = SuperLuDistSolver::new();
    let mut solution = x.to_vec();
    let rhs = b.to_vec();

    let _ = crate::solver::legacy::LinearSolver::solve(
        &mut direct,
        a,
        None,
        &rhs,
        &mut solution,
        crate::preconditioner::PcSide::Left,
        comm,
        None,
        None,
    )?;

    x.copy_from_slice(&solution);
    Ok(())
}

/// Stand-in for the direct-solve wrapper when the `superlu_dist` feature is
/// disabled: ignores all arguments and always returns a
/// `KError::SolveError`.
#[cfg(not(feature = "superlu_dist"))]
pub fn solve(
    _a: &crate::matrix::sparse::CsrMatrix<f64>,
    _b: &[f64],
    _x: &mut [f64],
    _comm: &UniverseComm,
) -> Result<(), KError> {
    let reason = "superlu_dist feature not enabled";
    Err(KError::SolveError(reason.into()))
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::parallel::NoComm;
    use faer::MatMut;
    use faer::linalg::solvers::{FullPivLu, SolveCore};

    // A freshly built solver has no data and the expected default options.
    #[test]
    fn test_superlu_dist_creation() {
        let fresh = SuperLuDistSolver::new();
        let opts = &fresh.options;

        assert!(fresh.data.is_none());
        assert_eq!(opts.print_level, 0);
        assert_eq!(opts.diagonal_pivot_threshold, 1.0);
    }

    // Chained setters must be reflected in the stored options.
    #[test]
    fn test_superlu_dist_options() {
        let mut configured = SuperLuDistSolver::new();
        configured
            .set_diagonal_pivot_threshold(0.5)
            .set_column_permutation(ColumnPermutation::Metis)
            .set_iterative_refinement(IterativeRefinement::Single)
            .set_print_level(1);

        let opts = &configured.options;
        assert_eq!(opts.diagonal_pivot_threshold, 0.5);
        assert_eq!(opts.column_permutation, ColumnPermutation::Metis);
        assert_eq!(opts.iterative_refinement, IterativeRefinement::Single);
        assert_eq!(opts.print_level, 1);
    }

    // Expected grid shape for a handful of process counts.
    #[test]
    fn test_process_grid_determination() {
        let cases = [
            (1, (1, 1)),
            (4, (2, 2)),
            (6, (2, 3)),
            (8, (2, 4)),
            (16, (4, 4)),
        ];
        for (nprocs, shape) in cases {
            assert_eq!(ProcessGrid::determine_optimal_grid(nprocs), shape);
        }
    }

    // Without MPI the automatic grid is 1x1 with this process at the origin.
    #[test]
    fn test_process_grid_creation() {
        let serial = UniverseComm::NoComm(NoComm);
        let auto_grid = ProcessGrid::new_auto(&serial).unwrap();

        assert_eq!(auto_grid.prows, 1);
        assert_eq!(auto_grid.pcols, 1);
        assert_eq!(auto_grid.my_prow, 0);
        assert_eq!(auto_grid.my_pcol, 0);
        assert_eq!(auto_grid.my_rank, 0);
        assert_eq!(auto_grid.total_procs, 1);
    }

    // On a single process the local extent equals the global extent.
    #[test]
    fn test_block_cyclic_distribution() {
        let serial = UniverseComm::NoComm(NoComm);
        let single = ProcessGrid::new_auto(&serial).unwrap();

        // Arguments after the grid: global rows, global cols, row block size,
        // col block size.
        let layout = BlockCyclicDistribution::new(single, 10, 10, 4, 4);

        assert_eq!(layout.global_rows, 10);
        assert_eq!(layout.global_cols, 10);
        assert_eq!(layout.local_rows, 10); // every row lives on the only process
        assert_eq!(layout.local_cols, 10); // every column lives on the only process
    }

    // With one process, global indices map to identical local indices.
    #[test]
    fn test_global_to_local_conversion() {
        let serial = UniverseComm::NoComm(NoComm);
        let single = ProcessGrid::new_auto(&serial).unwrap();
        let layout = BlockCyclicDistribution::new(single, 8, 8, 4, 4);

        for (global, local) in [(0, 0), (3, 3), (7, 7)] {
            assert_eq!(layout.global_to_local_row(global), Some(local));
        }
        for (global, local) in [(0, 0), (3, 3), (7, 7)] {
            assert_eq!(layout.global_to_local_col(global), Some(local));
        }
    }

    // Distributing a 0x0 matrix yields an empty local matrix with a valid
    // one-element row pointer.
    #[test]
    fn distribute_handles_empty() {
        let empty = CsrMatrix::from_csr(0, 0, vec![0], vec![], vec![]);
        let single = ProcessGrid {
            prows: 1,
            pcols: 1,
            my_prow: 0,
            my_pcol: 0,
            my_rank: 0,
            total_procs: 1,
        };
        let layout = BlockCyclicDistribution::new(single, 0, 0, 4, 4);

        let solver = SuperLuDistSolver::new();
        let piece = solver.distribute_matrix(&empty, &layout).unwrap();

        assert_eq!(piece.nrows(), 0);
        assert_eq!(piece.ncols(), 0);
        assert_eq!(piece.row_ptr(), &[0]);
    }

    // A 5x3 matrix on a 2x2 grid with 2x2 blocks must produce a valid local
    // CSR whose column indices stay inside the local column range.
    #[test]
    fn distribute_non_square_and_small_blocks() {
        let tall = CsrMatrix::from_csr(
            5,
            3,
            vec![0, 1, 2, 2, 3, 3],
            vec![0, 1, 2],
            vec![1.0, 2.0, 3.0],
        );
        // View of the process at grid position (0, 0).
        let corner = ProcessGrid {
            prows: 2,
            pcols: 2,
            my_prow: 0,
            my_pcol: 0,
            my_rank: 0,
            total_procs: 4,
        };
        let layout = BlockCyclicDistribution::new(corner, 5, 3, 2, 2);

        let solver = SuperLuDistSolver::new();
        let piece = solver.distribute_matrix(&tall, &layout).unwrap();

        assert!(validate_local_csr(&piece).is_ok());
        for &col in piece.col_idx() {
            assert!(col < piece.ncols());
        }
    }

    // The adjacency lists built from a 3x3 pattern must match a path graph
    // 0 - 1 - 2.
    #[test]
    fn test_graph_creation() {
        // 3x3 pattern inside the tridiagonal band (row 1 stores columns 0 and 2).
        let banded = CsrMatrix::from_csr(
            3,
            3,
            vec![0, 2, 4, 6],
            vec![0, 1, 0, 2, 1, 2],
            vec![2.0, -1.0, -1.0, 2.0, -1.0, 2.0],
        );

        let pattern_graph = Graph::from_matrix_pattern(&banded);

        assert_eq!(pattern_graph.adj[0], vec![1]); // vertex 0: neighbour 1
        assert_eq!(pattern_graph.adj[1], vec![0, 2]); // vertex 1: neighbours 0 and 2
        assert_eq!(pattern_graph.adj[2], vec![1]); // vertex 2: neighbour 1
    }

    // Natural ordering of size 5 is the identity permutation.
    #[test]
    fn test_natural_ordering() {
        let identity = OrderingAlgorithms::natural_ordering(5);
        assert_eq!(identity, vec![0, 1, 2, 3, 4]);
    }

    // AMD must return a permutation of 0..4 (each index exactly once).
    #[test]
    fn test_amd_ordering() {
        // Small 4x4 pattern with two entries per row.
        let sample = CsrMatrix::from_csr(
            4,
            4,
            vec![0, 2, 4, 6, 8],
            vec![0, 1, 1, 2, 2, 3, 0, 3],
            vec![1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
        );

        let ordering = OrderingAlgorithms::amd_ordering(&sample);
        assert_eq!(ordering.len(), 4);

        // Sorting a valid permutation yields the identity.
        let mut as_sorted = ordering.clone();
        as_sorted.sort_unstable();
        assert_eq!(as_sorted, vec![0, 1, 2, 3]);
    }

    // The symbolic pattern for a 3x3 matrix with identity permutations must
    // at least contain every diagonal position.
    #[test]
    fn test_symbolic_factorization_pattern() {
        let sample = CsrMatrix::from_csr(
            3,
            3,
            vec![0, 2, 4, 6],
            vec![0, 1, 1, 2, 0, 2],
            vec![2.0, -1.0, 2.0, -1.0, -1.0, 2.0],
        );

        let identity_cols = vec![0, 1, 2];
        let identity_rows = vec![0, 1, 2];

        let fill = SymbolicFactorizer::compute_symbolic_pattern(
            &sample,
            &identity_cols,
            &identity_rows,
        )
        .unwrap();

        for diag in 0..3 {
            assert!(fill.contains_key(&(diag, diag)));
        }
    }

    // Building an elimination tree from a dense 3x3 lower-triangular pattern
    // must produce `parent` and `children` arrays of length n.
    #[test]
    fn test_elimination_tree_construction() {
        let n = 3;

        // Full lower triangle, including the diagonal.
        let mut lower = HashMap::new();
        for entry in [(0, 0), (1, 0), (1, 1), (2, 0), (2, 1), (2, 2)] {
            lower.insert(entry, true);
        }

        let tree = SymbolicFactorizer::build_elimination_tree(n, &lower);

        assert_eq!(tree.parent.len(), n);
        assert_eq!(tree.children.len(), n);
    }

    // Symbolic factorization with MMD(A^T A) column ordering on a 3x3 matrix
    // must return permutations of length 3 and non-empty L/U patterns.
    #[test]
    fn test_enhanced_symbolic_factorization() {
        let sample = CsrMatrix::from_csr(
            3,
            3,
            vec![0, 2, 4, 6],
            vec![0, 1, 1, 2, 0, 2],
            vec![2.0, -1.0, 2.0, -1.0, -1.0, 2.0],
        );

        let grid_comm = UniverseComm::NoComm(NoComm);
        let single = ProcessGrid::new_auto(&grid_comm).unwrap();
        let layout = BlockCyclicDistribution::new(single, 3, 3, 2, 2);

        let mut solver = SuperLuDistSolver::new();
        solver.set_column_permutation(ColumnPermutation::MmdAta);

        // Hand-assembled solver state holding only the local matrix.
        let state = SuperLuDistData {
            process_grid: layout.grid.clone(),
            distribution: layout,
            comm: UniverseComm::NoComm(NoComm),
            options: solver.options.clone(),
            factored: false,
            local_matrix: Some(sample),
            symbolic_factor: None,
            numeric_factor: None,
            solve_workspace: None,
        };

        let call_comm = UniverseComm::NoComm(NoComm);
        let analysis = solver.symbolic_factorization(&state, &call_comm).unwrap();

        assert_eq!(analysis.col_perm.len(), 3);
        assert_eq!(analysis.row_perm.len(), 3);
        assert!(!analysis.l_pattern.is_empty());
        assert!(!analysis.u_pattern.is_empty());
    }

    // Extracting columns 0..2 of a 4x4 sparse matrix into a panel must give a
    // 4x2 column-major block with the matrix values in place.
    #[test]
    fn test_panel_creation() {
        let sample = CsrMatrix::from_csr(
            4,
            4,
            vec![0, 2, 4, 6, 8],
            vec![0, 1, 1, 2, 2, 3, 0, 3],
            vec![2.0, -1.0, 2.0, -1.0, 2.0, -1.0, -1.0, 2.0],
        );

        let all_rows = vec![0, 1, 2, 3];
        let block = Panel::from_sparse_columns(&sample, 0, 2, all_rows);

        assert_eq!(block.width, 2);
        assert_eq!(block.height, 4);
        assert_eq!(block.col_start, 0);
        assert_eq!(block.data.len(), 8); // 2 columns * 4 rows

        // Spot-check entries (column-major, height 4).
        assert_eq!(block.data[0], 2.0); // (0,0)
        assert_eq!(block.data[1], 0.0); // (1,0) is not stored in the matrix
        assert_eq!(block.data[4], -1.0); // (0,1)
        assert_eq!(block.data[5], 2.0); // (1,1)
    }

    // Static pivoting on a well-conditioned 2x2 panel: no swaps, not
    // singular, and the panel data is overwritten by the factors.
    #[test]
    fn test_panel_factorization_static() {
        let original = vec![2.0, 1.0, 1.0, 3.0]; // column-major [[2,1],[1,3]]
        let mut block = Panel {
            width: 2,
            height: 2,
            data: original.clone(),
            row_indices: vec![0, 1],
            col_start: 0,
        };

        let outcome = block.factorize_lu(1e-12, PivotingStrategy::Static).unwrap();

        assert_eq!(outcome.pivot_strategy, PivotingStrategy::Static);
        assert_eq!(outcome.num_row_swaps, 0); // static pivoting never swaps rows
        assert!(!outcome.is_singular);

        // The factorization works in place, so the data must have changed.
        assert_ne!(block.data, original);
    }

    // Dynamic pivoting on a 2x2 panel whose largest first-column entry (3.0)
    // is below the diagonal.
    #[test]
    fn test_panel_factorization_dynamic() {
        let mut block = Panel {
            width: 2,
            height: 2,
            data: vec![1.0, 3.0, 2.0, 1.0], // column-major [[1,2],[3,1]]
            row_indices: vec![0, 1],
            col_start: 0,
        };

        let outcome = block
            .factorize_lu(1e-12, PivotingStrategy::Dynamic)
            .unwrap();

        assert_eq!(outcome.pivot_strategy, PivotingStrategy::Dynamic);
        // NOTE(review): this also passes when no swap happened as long as the
        // panel is not flagged singular, so it does not strictly verify that
        // the rows were swapped.
        assert!(outcome.num_row_swaps > 0 || !outcome.is_singular);
    }

    // A 1e-15 leading pivot under static pivoting with threshold 1e-12 must
    // be flagged as singular and replaced by a value of magnitude >= 1e-12.
    #[test]
    fn test_panel_factorization_tiny_pivot() {
        let mut block = Panel {
            width: 2,
            height: 2,
            data: vec![1e-15, 1.0, 1.0, 3.0], // leading entry is the tiny pivot
            row_indices: vec![0, 1],
            col_start: 0,
        };

        let outcome = block.factorize_lu(1e-12, PivotingStrategy::Static).unwrap();

        assert!(outcome.is_singular);
        assert_eq!(outcome.pivot_strategy, PivotingStrategy::Static);

        // The tiny pivot must have been bumped up to at least the threshold.
        assert!(block.data[0].abs() >= 1e-12);
    }

    // `ThresholdWithFallback` must report `Dynamic` as the strategy actually
    // used when the leading pivot is tiny.
    #[test]
    fn test_threshold_with_fallback() {
        let mut block = Panel {
            width: 2,
            height: 2,
            data: vec![1e-15, 1.0, 1.0, 3.0], // leading entry is the tiny pivot
            row_indices: vec![0, 1],
            col_start: 0,
        };

        let outcome = block
            .factorize_lu(1e-12, PivotingStrategy::ThresholdWithFallback)
            .unwrap();

        assert_eq!(outcome.pivot_strategy, PivotingStrategy::Dynamic);
    }

    // End-to-end numerical factorization of a 3x3 matrix with a hand-built
    // symbolic factorization and static pivoting.
    #[test]
    fn test_numerical_factorization_integration() {
        // Diagonally dominant 3x3 matrix (diagonal 4, one -1 per row).
        // Note: the stored pattern is not symmetric.
        let sample = CsrMatrix::from_csr(
            3,
            3,
            vec![0, 2, 4, 6],
            vec![0, 1, 1, 2, 0, 2],
            vec![4.0, -1.0, 4.0, -1.0, -1.0, 4.0],
        );

        let grid_comm = UniverseComm::NoComm(NoComm);
        let single = ProcessGrid::new_auto(&grid_comm).unwrap();
        let layout = BlockCyclicDistribution::new(single, 3, 3, 2, 2);

        let mut solver = SuperLuDistSolver::new();
        solver.set_static_pivoting(true).set_panel_size(2);

        // Identity permutations and diagonal-only L/U patterns.
        let analysis = SymbolicFactorization {
            col_perm: vec![0, 1, 2],
            row_perm: vec![0, 1, 2],
            etree: EliminationTree {
                parent: vec![3, 3, 3],
                children: vec![vec![], vec![], vec![], vec![0, 1, 2]],
                post_order: vec![0, 1, 2],
            },
            l_pattern: (0..3).map(|d| ((d, d), true)).collect(),
            u_pattern: (0..3).map(|d| ((d, d), true)).collect(),
        };

        let state = SuperLuDistData {
            process_grid: layout.grid.clone(),
            distribution: layout,
            comm: UniverseComm::NoComm(NoComm),
            options: solver.options.clone(),
            factored: false,
            local_matrix: Some(sample),
            symbolic_factor: Some(analysis),
            numeric_factor: None,
            solve_workspace: None,
        };

        let factors = solver
            .numerical_factorization(&state, &state.comm)
            .unwrap();

        // Shape and bookkeeping of the result.
        assert_eq!(factors.n, 3);
        assert_eq!(factors.pivot_strategy, PivotingStrategy::Static);
        assert!(!factors.panels.is_empty());
        assert_eq!(factors.panels.len(), factors.panel_factors.len());
        assert_eq!(factors.global_row_perm.len(), 3);
        assert_eq!(factors.global_col_perm.len(), 3);

        // Statistics must be populated.
        let stats = &factors.factor_stats;
        assert!(stats.num_panels > 0);
        assert!(stats.memory_usage > 0);
        assert!(stats.max_pivot_growth >= 1.0);
    }

    // 3D-factorization related setters must be visible through `options()`.
    #[test]
    fn test_3d_factorization_options() {
        let mut configured = SuperLuDistSolver::new();
        configured
            .set_3d_factorization(true, Some(2))
            .set_memory_tradeoff(2.5)
            .set_max_concurrent_panels(4)
            .set_async_panel_updates(true);

        let opts = configured.options();
        assert!(opts.enable_3d_factorization);
        assert_eq!(opts.process_grid_3d_depth, Some(2));
        assert_eq!(opts.memory_tradeoff_factor, 2.5);
        assert_eq!(opts.max_concurrent_panels, 4);
        assert!(opts.async_panel_updates);
    }

    // Each pivoting-related setter is checked right after it is applied.
    #[test]
    fn test_pivoting_strategies() {
        let mut configured = SuperLuDistSolver::new();

        // Static pivoting flag.
        configured.set_static_pivoting(true);
        assert!(configured.options().static_pivoting);

        // Diagonal pivot threshold.
        configured.set_diagonal_pivot_threshold(0.1);
        assert_eq!(configured.options().diagonal_pivot_threshold, 0.1);

        // Panel size is stored as an `Option`.
        configured.set_panel_size(32);
        assert_eq!(configured.options().panel_size, Some(32));
    }

    // Building triangular-solve data for n = 8 with block size 4 must yield
    // two blocks and a non-empty communication buffer.
    #[test]
    fn test_triangular_solve_data_creation() {
        let serial = UniverseComm::NoComm(NoComm);
        let single = ProcessGrid::new_auto(&serial).unwrap();
        let layout = BlockCyclicDistribution::new(single, 8, 8, 2, 2);

        // Minimal factorization: no panels, identity permutations, unit scaling.
        let factors = NumericFactorization {
            n: 8,
            nnz: 16,
            panels: vec![],
            panel_factors: vec![],
            global_row_perm: (0..8).collect(),
            global_col_perm: (0..8).collect(),
            row_scale: vec![1.0; 8],
            col_scale: vec![1.0; 8],
            pivot_strategy: PivotingStrategy::Static,
            pivot_threshold: 1e-12,
            replaced_tiny_pivots: false,
            factor_stats: FactorizationStats {
                num_panels: 0,
                total_row_swaps: 0,
                tiny_pivots_replaced: 0,
                max_pivot_growth: 1.0,
                condition_estimate: None,
                memory_usage: 0,
            },
            l_block_graph: vec![vec![], vec![]],
            u_block_graph: vec![vec![], vec![]],
        };

        let tri_data =
            TriangularSolveData::new(8, 4, &layout, &factors, vec![vec![], vec![]]);

        assert_eq!(tri_data.block_owners.len(), 2); // 8 / 4 = 2 blocks
        assert_eq!(tri_data.dependency_graph.len(), 2);
        assert!(!tri_data.comm_buffer.is_empty());
    }

    // Diagonal block k must be owned by the rank at grid coordinates
    // (k mod prows, k mod pcols).
    #[test]
    fn diag_owner_is_mod_coords() {
        let two_by_three = ProcessGrid {
            prows: 2,
            pcols: 3,
            my_prow: 0,
            my_pcol: 0,
            my_rank: 0,
            total_procs: 6,
        };
        let layout = BlockCyclicDistribution::new(two_by_three, 128, 128, 4, 4);

        for block in 0..12 {
            let grid_row = block % layout.grid.prows;
            let grid_col = block % layout.grid.pcols;
            let expected_rank = layout.grid.coords_to_rank(grid_row, grid_col);
            assert_eq!(layout.owner_rank_of_diag_block(block), expected_rank);
        }
    }

    // n = 130 with block size 64 must split into blocks of 64, 64 and 2.
    #[test]
    fn block_sizes_are_exact() {
        let two_by_two = ProcessGrid {
            prows: 2,
            pcols: 2,
            my_prow: 0,
            my_pcol: 0,
            my_rank: 0,
            total_procs: 4,
        };
        let layout = BlockCyclicDistribution::new(two_by_two, 130, 130, 64, 64);

        let n = 130usize;
        let block_size = 64usize;

        // Empty factorization; only `n` and the three-block graphs are filled in.
        let factors = NumericFactorization {
            n,
            nnz: 0,
            panels: vec![],
            panel_factors: vec![],
            global_row_perm: vec![],
            global_col_perm: vec![],
            row_scale: vec![],
            col_scale: vec![],
            pivot_strategy: PivotingStrategy::Static,
            pivot_threshold: 1.0,
            replaced_tiny_pivots: false,
            factor_stats: FactorizationStats {
                num_panels: 0,
                total_row_swaps: 0,
                tiny_pivots_replaced: 0,
                max_pivot_growth: 1.0,
                condition_estimate: None,
                memory_usage: 0,
            },
            l_block_graph: vec![vec![], vec![], vec![]],
            u_block_graph: vec![vec![], vec![], vec![]],
        };

        let tri_data = TriangularSolveData::new(
            n,
            block_size,
            &layout,
            &factors,
            vec![vec![], vec![], vec![]],
        );
        assert_eq!(tri_data.block_sizes, vec![64, 64, 2]);
    }

    // Equality on `CommPattern` and field storage in `CommRequest::new`.
    #[test]
    fn test_communication_patterns() {
        // Same variant compares equal, different variants do not.
        assert_eq!(CommPattern::BinaryTree, CommPattern::BinaryTree);
        assert_ne!(CommPattern::BinaryTree, CommPattern::PointToPoint);

        // Arguments: request_id, source_rank, dest_rank, tag, type, buffer_size.
        let send_req = CommRequest::new(1, 0, 1, 100, CommType::Send, 64);

        assert_eq!(send_req.request_id, 1);
        assert_eq!(send_req.comm_type, CommType::Send);
    }

    // Coarsening an element-wise L pattern into 2x2 blocks: block 1 depends
    // on block 0, block 2 depends only on block 1. The coarsening is done
    // inline in this test.
    #[test]
    fn l_block_graph_coarsens_symbolic() {
        use std::collections::HashMap;

        // Dense 6x6 lower triangle ...
        let mut lower = HashMap::new();
        for row in 0..6 {
            for col in 0..=row {
                lower.insert((row, col), true);
            }
        }
        // ... minus every entry coupling rows 4..=5 with columns 0..=1.
        for entry in &[(4, 1), (5, 1), (4, 0), (5, 0)] {
            lower.remove(entry);
        }

        let analysis = SymbolicFactorization {
            col_perm: (0..6).collect(),
            row_perm: (0..6).collect(),
            etree: EliminationTree {
                parent: vec![6; 6],
                children: vec![vec![]; 6],
                post_order: vec![],
            },
            l_pattern: lower,
            u_pattern: HashMap::new(),
        };

        let block_size = 2;
        let num_blocks = 3;
        let mut block_graph = vec![Vec::<usize>::new(); num_blocks];

        // Record each strictly-lower block coupling once.
        for &(row, col) in analysis.l_pattern.keys() {
            let (block_row, block_col) = (row / block_size, col / block_size);
            if block_col < block_row && !block_graph[block_row].contains(&block_col) {
                block_graph[block_row].push(block_col);
            }
        }
        // HashMap iteration order is unspecified, so normalise before comparing.
        for deps in block_graph.iter_mut() {
            deps.sort_unstable();
        }

        assert_eq!(block_graph[0], Vec::<usize>::new());
        assert_eq!(block_graph[1], vec![0usize]);
        assert_eq!(block_graph[2], vec![1usize]);
    }

    // Forward substitution with a unit-diagonal L stored in an LU panel.
    #[test]
    fn test_local_triangular_solve_l() {
        // In-place LU storage (column-major):
        //   L = [[1, 0], [2, 1]]  (unit diagonal, not stored)
        //   U = [[1, 4], [0, 5]]
        let lu_block = Panel {
            width: 2,
            height: 2,
            data: vec![1.0, 2.0, 4.0, 5.0],
            row_indices: vec![0, 1],
            col_start: 0,
        };
        let blocks = vec![lu_block];

        // L y = [3, 11]  =>  y = [3, 11 - 2*3] = [3, 5]
        let mut rhs = vec![3.0, 11.0];
        DistributedTriangularSolver::solve_local_l_block(&mut rhs, &blocks, 0).unwrap();

        assert!((rhs[0] - 3.0).abs() < 1e-10);
        assert!((rhs[1] - 5.0).abs() < 1e-10);
    }

    // Backward substitution with U = [[2, 1], [0, 3]].
    #[test]
    fn test_local_triangular_solve_u() {
        let u_block = Panel {
            width: 2,
            height: 2,
            // Column-major storage; the entry below the diagonal is zero.
            data: vec![2.0, 0.0, 1.0, 3.0],
            row_indices: vec![0, 1],
            col_start: 0,
        };
        let blocks = vec![u_block];

        // U x = [4, 6]  =>  x = [1, 2]
        let mut rhs = vec![4.0, 6.0];
        DistributedTriangularSolver::solve_local_u_block(&mut rhs, &blocks, 0).unwrap();

        assert!((rhs[0] - 1.0).abs() < 1e-10);
        assert!((rhs[1] - 2.0).abs() < 1e-10);
    }

    // Applying a block update must turn [5, 7] into [4, 4] for this panel.
    #[test]
    fn test_block_update_application() {
        let block = Panel {
            width: 2,
            height: 2,
            // Column-major storage of [[1, 2], [3, 4]].
            data: vec![1.0, 3.0, 2.0, 4.0],
            row_indices: vec![0, 1],
            col_start: 0,
        };
        let blocks = vec![block];

        let mut target = vec![5.0, 7.0];
        let contribution = vec![1.0, 1.0];

        // Expected effect: subtract column 0 of the panel ([1, 3]) scaled by 1.
        DistributedTriangularSolver::apply_block_update(&mut target, &contribution, 0, 0, &blocks)
            .unwrap();

        // [5 - 1, 7 - 3] = [4, 4]
        assert!((target[0] - 4.0).abs() < 1e-10);
        assert!((target[1] - 4.0).abs() < 1e-10);
    }

    // Full setup/factor/solve through the legacy `LinearSolver` interface on
    // a 4x4 system; only completion and finiteness of the result are checked.
    #[test]
    fn test_distributed_solve_integration() {
        // Diagonally dominant 4x4 matrix (diagonal 4, one -1 per row).
        // Note: the stored pattern is not symmetric.
        let system = CsrMatrix::from_csr(
            4,
            4,
            vec![0, 2, 4, 6, 8],
            vec![0, 1, 1, 2, 2, 3, 0, 3],
            vec![4.0, -1.0, 4.0, -1.0, 4.0, -1.0, -1.0, 4.0],
        );

        let rhs = vec![1.0, 2.0, 3.0, 4.0];
        let mut solution = vec![0.0; 4];

        let mut solver = SuperLuDistSolver::new();
        solver
            .set_async_panel_updates(true)
            .set_3d_factorization(false, None)
            .set_max_concurrent_panels(2);

        let serial = UniverseComm::NoComm(NoComm);
        let report = crate::solver::legacy::LinearSolver::solve(
            &mut solver,
            &system,
            None,
            &rhs,
            &mut solution,
            crate::preconditioner::PcSide::Left,
            &serial,
            None,
            None,
        )
        .unwrap();

        // The direct solve reports a single iteration with `ConvergedAtol`.
        assert_eq!(report.iterations, 1);
        assert!(matches!(report.reason, ConvergedReason::ConvergedAtol));

        // Every solution component must be a finite number.
        assert!(solution.iter().all(|val: &f64| val.is_finite()));
    }

    // Each communication/overlap setter is verified right after being applied.
    #[test]
    fn test_communication_overlap_options() {
        let mut configured = SuperLuDistSolver::new();

        // Asynchronous panel updates.
        configured.set_async_panel_updates(true);
        assert!(configured.options().async_panel_updates);

        // Limit on concurrently processed panels.
        configured.set_max_concurrent_panels(8);
        assert_eq!(configured.options().max_concurrent_panels, 8);

        // 3D factorization with an explicit depth.
        configured.set_3d_factorization(true, Some(4));
        assert!(configured.options().enable_3d_factorization);
        assert_eq!(configured.options().process_grid_3d_depth, Some(4));

        // Memory trade-off factor.
        configured.set_memory_tradeoff(3.0);
        assert_eq!(configured.options().memory_tradeoff_factor, 3.0);
    }

    // Solving I x = b must return x == b exactly.
    #[test]
    fn test_superlu_dist_simple_solve() {
        let identity = CsrMatrix::identity(5);

        let rhs = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let mut solution = vec![0.0; 5];
        let mut solver = SuperLuDistSolver::new();

        let serial = UniverseComm::NoComm(NoComm);
        let report = crate::solver::legacy::LinearSolver::solve(
            &mut solver,
            &identity,
            None,
            &rhs,
            &mut solution,
            crate::preconditioner::PcSide::Left,
            &serial,
            None,
            None,
        )
        .unwrap();

        assert_eq!(solution, rhs);
        assert_eq!(report.iterations, 1);
        assert!(matches!(report.reason, ConvergedReason::ConvergedAtol));
    }

    // Compares the solver against faer's full-pivoting LU on a 3x3 SPD
    // system, with natural column ordering and no row permutation.
    // Ignored by default.
    #[test]
    #[ignore]
    fn test_superlu_dist_spd_solve() {
        // SPD matrix [[4,1,0],[1,3,1],[0,1,2]], stored with explicit zeros.
        let system = CsrMatrix::from_csr(
            3,
            3,
            vec![0, 3, 6, 9],
            vec![0, 1, 2, 0, 1, 2, 0, 1, 2],
            vec![4.0, 1.0, 0.0, 1.0, 3.0, 1.0, 0.0, 1.0, 2.0],
        );
        let rhs = vec![1.0, 2.0, 3.0];
        let mut solution = vec![0.0; 3];

        let mut solver = SuperLuDistSolver::new();
        solver
            .set_column_permutation(ColumnPermutation::Natural)
            .set_row_permutation(RowPermutation::NoRowPerm);

        let serial = UniverseComm::NoComm(NoComm);
        let _ = crate::solver::legacy::LinearSolver::solve(
            &mut solver,
            &system,
            None,
            &rhs,
            &mut solution,
            crate::preconditioner::PcSide::Left,
            &serial,
            None,
            None,
        )
        .unwrap();

        // Reference solution: dense LU solve in place on a copy of the RHS.
        let mut reference = rhs.clone();
        let dense = system.to_dense().unwrap();
        let dense_lu = FullPivLu::new(dense.as_ref());
        let reference_view = MatMut::from_column_major_slice_mut(&mut reference, 3, 1);
        dense_lu.solve_in_place_with_conj(faer::Conj::No, reference_view);

        for (got, want) in solution.iter().zip(reference.iter()) {
            assert!((got - want).abs() < 1e-8);
        }
    }

    // Compares the solver against faer's full-pivoting LU on a 2x2
    // indefinite system, with natural column ordering and no row permutation.
    // Ignored by default.
    #[test]
    #[ignore]
    fn test_superlu_dist_indefinite_solve() {
        // Symmetric indefinite matrix [[2,1],[1,-1]].
        let system = CsrMatrix::from_csr(
            2,
            2,
            vec![0, 2, 4],
            vec![0, 1, 0, 1],
            vec![2.0, 1.0, 1.0, -1.0],
        );
        let rhs = vec![1.0, 0.0];
        let mut solution = vec![0.0; 2];

        let mut solver = SuperLuDistSolver::new();
        solver
            .set_column_permutation(ColumnPermutation::Natural)
            .set_row_permutation(RowPermutation::NoRowPerm);

        let serial = UniverseComm::NoComm(NoComm);
        let _ = crate::solver::legacy::LinearSolver::solve(
            &mut solver,
            &system,
            None,
            &rhs,
            &mut solution,
            crate::preconditioner::PcSide::Left,
            &serial,
            None,
            None,
        )
        .unwrap();

        // Reference solution: dense LU solve in place on a copy of the RHS.
        let mut reference = rhs.clone();
        let dense = system.to_dense().unwrap();
        let dense_lu = FullPivLu::new(dense.as_ref());
        let reference_view = MatMut::from_column_major_slice_mut(&mut reference, 2, 1);
        dense_lu.solve_in_place_with_conj(faer::Conj::No, reference_view);

        for (got, want) in solution.iter().zip(reference.iter()) {
            assert!((got - want).abs() < 1e-8);
        }
    }

    /// Builds the 6x6 tridiagonal test matrix with 4.0 on the diagonal and
    /// -1.0 on the first sub- and super-diagonals, stored in CSR form.
    ///
    /// The matrix is symmetric and strictly diagonally dominant with a
    /// positive diagonal, which makes it positive definite.
    fn make_spd6() -> CsrMatrix<f64> {
        let n = 6;
        let mut offsets = vec![0];
        let mut columns = Vec::new();
        let mut values = Vec::new();
        for row in 0..n {
            // Column window of this row, clipped at the matrix borders.
            let first = if row > 0 { row - 1 } else { row };
            let last = if row + 1 < n { row + 1 } else { row };
            for column in first..=last {
                columns.push(column);
                values.push(if column == row { 4.0 } else { -1.0 });
            }
            offsets.push(columns.len());
        }
        CsrMatrix::from_csr(n, n, offsets, columns, values)
    }

    /// Solves the 6x6 system produced by `make_spd6` with default solver
    /// options and compares the answer with a dense full-pivoting LU
    /// reference (tolerance 1e-8 per component).
    ///
    /// Despite the name, the input is deterministic. Marked `#[ignore]`.
    #[test]
    #[ignore]
    fn test_superlu_dist_random_spd() {
        let dim = 6;
        let matrix = make_spd6();
        let rhs = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
        let mut solution = vec![0.0; dim];

        let comm = UniverseComm::NoComm(NoComm);
        let mut solver = SuperLuDistSolver::new();
        let _ = crate::solver::legacy::LinearSolver::solve(
            &mut solver,
            &matrix,
            None,
            &rhs,
            &mut solution,
            crate::preconditioner::PcSide::Left,
            &comm,
            None,
            None,
        )
        .unwrap();

        // Reference answer from a dense full-pivoting LU solve.
        let mut reference = rhs.clone();
        let dense = matrix.to_dense().unwrap();
        let dense_lu = FullPivLu::new(dense.as_ref());
        let reference_view = MatMut::from_column_major_slice_mut(&mut reference, dim, 1);
        dense_lu.solve_in_place_with_conj(faer::Conj::No, reference_view);

        for (got, want) in solution.iter().zip(reference.iter()) {
            assert!((got - want).abs() < 1e-8);
        }
    }

    /// Factorizes a matrix whose leading diagonal entry (1e-12) lies below the
    /// configured pivot threshold (1e-8), with tiny-pivot replacement and
    /// static pivoting enabled, and checks that the factorization statistics
    /// report at least one replaced pivot.
    #[test]
    fn test_superlu_dist_tiny_pivot_replacement() {
        // [[1e-12, 1], [1, 1]]
        let matrix = CsrMatrix::from_csr(
            2,
            2,
            vec![0, 2, 4],
            vec![0, 1, 0, 1],
            vec![1e-12, 1.0, 1.0, 1.0],
        );
        let rhs = vec![1.0, 2.0];
        let mut solution = vec![0.0; 2];

        let comm = UniverseComm::NoComm(NoComm);
        let mut solver = SuperLuDistSolver::new();
        solver
            .set_replace_tiny_pivots(true)
            .set_static_pivoting(true)
            .set_diagonal_pivot_threshold(1e-8);

        let _ = crate::solver::legacy::LinearSolver::solve(
            &mut solver,
            &matrix,
            None,
            &rhs,
            &mut solution,
            crate::preconditioner::PcSide::Left,
            &comm,
            None,
            None,
        )
        .unwrap();

        // Walk down to the numeric factor's statistics step by step.
        let data = solver.data.as_ref().unwrap();
        let numeric = data.numeric_factor.as_ref().unwrap();
        let replaced = numeric.factor_stats.tiny_pivots_replaced;
        assert!(replaced > 0);
    }

    /// Passes a right-hand side of length 2 to a 3x3 system and expects the
    /// solve to be rejected with `KError::InvalidInput`.
    #[test]
    fn test_invalid_input_dimensions() {
        // 3x3 identity in CSR form.
        let matrix =
            CsrMatrix::from_csr(3, 3, vec![0, 1, 2, 3], vec![0, 1, 2], vec![1.0, 1.0, 1.0]);

        // Deliberately one entry short of the matrix dimension.
        let short_rhs = vec![1.0, 2.0];
        let mut solution = vec![0.0; 3];

        let comm = UniverseComm::NoComm(NoComm);
        let mut solver = SuperLuDistSolver::new();
        let outcome = crate::solver::legacy::LinearSolver::solve(
            &mut solver,
            &matrix,
            None,
            &short_rhs,
            &mut solution,
            crate::preconditioner::PcSide::Left,
            &comm,
            None,
            None,
        );

        assert!(outcome.is_err());
        assert!(matches!(outcome, Err(KError::InvalidInput(_))));
    }

    /// Runs two consecutive solves with different right-hand sides on the same
    /// solver instance and checks that solver data is present after each one.
    ///
    /// Only the presence of `solver.data` is asserted; the solutions
    /// themselves are not inspected.
    #[test]
    fn test_solver_reuse() {
        // Diagonal matrix diag(2, 3).
        let matrix = CsrMatrix::from_csr(2, 2, vec![0, 1, 2], vec![0, 1], vec![2.0, 3.0]);

        let comm = UniverseComm::NoComm(NoComm);
        let mut solver = SuperLuDistSolver::new();

        // First pass populates the solver data, second pass solves again with
        // a different right-hand side on the same instance.
        for rhs in [vec![2.0, 3.0], vec![4.0, 6.0]] {
            let mut solution = vec![0.0; 2];
            let _stats = crate::solver::legacy::LinearSolver::solve(
                &mut solver,
                &matrix,
                None,
                &rhs,
                &mut solution,
                crate::preconditioner::PcSide::Left,
                &comm,
                None,
                None,
            )
            .unwrap();

            assert!(solver.data.is_some());
        }
    }

    /// Installs a custom `RefinementConfig` on a fresh solver and verifies
    /// that the resulting refinement engine carries every configured value.
    #[test]
    fn test_refinement_config() {
        let config = RefinementConfig {
            max_iterations: 10,
            tolerance: 1e-14,
            relative_tolerance: 1e-8,
            min_improvement_factor: 0.8,
        };

        let mut solver = SuperLuDistSolver::new();
        // The config is not needed afterwards, so it is moved rather than cloned.
        solver.set_refinement_config(config);

        let engine = solver
            .refinement_engine
            .as_ref()
            .expect("setting a refinement config must create a refinement engine");
        assert_eq!(engine.config.max_iterations, 10);
        assert_eq!(engine.config.tolerance, 1e-14);
        assert_eq!(engine.config.relative_tolerance, 1e-8);
        assert_eq!(engine.config.min_improvement_factor, 0.8);
    }

    /// Exercises the enable / configure / disable life cycle of the solver's
    /// iterative refinement engine.
    #[test]
    fn test_refinement_methods() {
        let mut solver = SuperLuDistSolver::new();

        // Enabling refinement must create an engine.
        solver.enable_iterative_refinement();
        assert!(solver.refinement_engine.is_some());

        // The residual method must be stored on that engine. `expect` is used
        // so the check cannot be skipped silently if the engine went missing.
        solver.set_residual_method(ResidualMethod::Scaled);
        let engine = solver
            .refinement_engine
            .as_ref()
            .expect("refinement engine must still exist after setting the residual method");
        assert_eq!(engine.residual_method, ResidualMethod::Scaled);

        // Disabling refinement must drop the engine again.
        solver.disable_iterative_refinement();
        assert!(solver.refinement_engine.is_none());
    }

    /// A freshly constructed engine keeps the requested residual method and
    /// has no statistics from a previous run.
    #[test]
    fn test_refinement_engine_creation() {
        let engine =
            RefinementEngine::new(RefinementConfig::default(), ResidualMethod::ComponentWise);

        assert!(engine.last_stats.is_none());
        assert_eq!(engine.residual_method, ResidualMethod::ComponentWise);
    }

    /// Table-driven check of `RefinementEngine::check_convergence` on an
    /// engine with default settings.
    #[test]
    fn test_refinement_convergence_criteria() {
        let engine = RefinementEngine::with_defaults();

        // Each row holds the three arguments passed to `check_convergence`
        // followed by the expected verdict. NOTE(review): the first two values
        // look like the current and a reference residual norm and the third
        // like the iteration index - confirm against the method signature.
        let cases = [
            // Converged through the absolute tolerance.
            (1e-13, 1e-6, 1, true),
            // Converged through the relative tolerance.
            (1e-7, 1e-1, 1, true),
            // Neither criterion is met.
            (1e-4, 1e-6, 1, false),
            // Iteration 0 never reports convergence.
            (1e-13, 1e-6, 0, false),
        ];

        for (first, second, iteration, expected) in cases {
            assert_eq!(
                engine.check_convergence(first, second, iteration),
                expected
            );
        }
    }

    /// Multiplies a 3x3 sparse matrix by a vector through the refinement
    /// engine's static matvec helper on a single-process communicator and
    /// checks the exact product.
    #[test]
    fn test_distributed_sparse_matvec() {
        let comm = UniverseComm::NoComm(NoComm);

        // Rows: [2 1 .], [. 3 1], [1 . 4]
        let matrix = CsrMatrix::from_csr(
            3,
            3,
            vec![0, 2, 4, 6],
            vec![0, 1, 1, 2, 0, 2],
            vec![2.0, 1.0, 3.0, 1.0, 1.0, 4.0],
        );
        let input = vec![1.0, 2.0, 3.0];
        let mut product = vec![0.0; 3];

        RefinementEngine::distributed_sparse_matvec_static(&matrix, &input, &mut product, &comm)
            .unwrap();

        // Row by row: 2*1 + 1*2 = 4, 3*2 + 1*3 = 9, 1*1 + 4*3 = 13.
        assert_eq!(product, [4.0, 9.0, 13.0]);
    }

    /// Constructs a `RefinementStats` value by hand and reads its fields back.
    #[test]
    fn test_refinement_stats() {
        let history = vec![1e-3, 1e-6, 1e-9, 1e-12];
        let stats = RefinementStats {
            iterations: 3,
            initial_residual_norm: 1e-3,
            final_residual_norm: 1e-12,
            residual_history: history,
            converged: true,
            convergence_reason: RefinementConvergence::AbsoluteTolerance,
            refinement_time: 0.001,
        };

        assert!(stats.converged);
        assert_eq!(stats.iterations, 3);
        // One entry for the initial residual plus one per iteration.
        assert_eq!(stats.residual_history.len(), 4);
        assert_eq!(
            stats.convergence_reason,
            RefinementConvergence::AbsoluteTolerance
        );
    }

    /// Spot-checks the equality semantics of `ResidualMethod`: a variant equals
    /// itself and the compared neighbouring variants differ.
    #[test]
    fn test_residual_methods() {
        let standard = ResidualMethod::Standard;
        let scaled = ResidualMethod::Scaled;
        let component_wise = ResidualMethod::ComponentWise;

        assert_eq!(standard, ResidualMethod::Standard);
        assert_ne!(standard, scaled);
        assert_ne!(scaled, component_wise);
    }

    /// After `setup_workspace(n)` all three refinement work vectors must have
    /// exactly `n` entries.
    #[test]
    fn test_refinement_workspace_setup() {
        let n = 100;
        let mut engine = RefinementEngine::with_defaults();
        engine.setup_workspace(n);

        let lengths = [
            engine.residual_workspace.len(),
            engine.correction_workspace.len(),
            engine.matvec_workspace.len(),
        ];
        for len in lengths {
            assert_eq!(len, n);
        }
    }

    /// The static norm helper must return 5 for the 3-4-0 vector
    /// (sqrt(9 + 16 + 0)) on a single-process communicator.
    #[test]
    fn test_vector_norm_computation() {
        let comm = UniverseComm::NoComm(NoComm);
        let vector = vec![3.0, 4.0, 0.0];

        let norm = RefinementEngine::compute_vector_norm_static(&vector, &comm).unwrap();

        let deviation = (norm - 5.0).abs();
        assert!(deviation < 1e-10);
    }

    /// Every listed `RefinementConvergence` variant must compare equal to
    /// itself and unequal to each of the other listed variants.
    #[test]
    fn test_refinement_convergence_variants() {
        let reasons = [
            RefinementConvergence::AbsoluteTolerance,
            RefinementConvergence::RelativeTolerance,
            RefinementConvergence::MaxIterations,
            RefinementConvergence::Stagnation,
            RefinementConvergence::Divergence,
        ];

        // Two entries are equal exactly when they sit at the same position.
        for (i, lhs) in reasons.iter().enumerate() {
            for (j, rhs) in reasons.iter().enumerate() {
                assert_eq!(i == j, lhs == rhs);
            }
        }
    }

    /// Configures iterative refinement on a solver (enable, custom config,
    /// residual method) and verifies the engine state before any solve.
    ///
    /// No system is solved here: the test only checks that the configuration
    /// reaches the refinement engine and that no refinement statistics exist
    /// before a solve has run.
    #[test]
    fn test_iterative_refinement_integration() {
        // Create solver with iterative refinement enabled
        let mut solver = SuperLuDistSolver::new();
        solver.enable_iterative_refinement();

        // Configure refinement settings
        let config = RefinementConfig {
            max_iterations: 3,
            tolerance: 1e-10,
            relative_tolerance: 1e-8,
            min_improvement_factor: 0.9,
        };
        solver.set_refinement_config(config);
        solver.set_residual_method(ResidualMethod::Standard);

        // The engine must exist and reflect the configuration applied above.
        let engine = solver
            .refinement_engine
            .as_ref()
            .expect("refinement engine must exist after enabling refinement");
        assert_eq!(engine.config.max_iterations, 3);
        assert_eq!(engine.config.tolerance, 1e-10);
        assert_eq!(engine.residual_method, ResidualMethod::Standard);

        // Test that refinement stats are initially None
        assert!(solver.refinement_stats().is_none());
    }

    /// Computes the residual of a 2x2 identity system with the `Standard` and
    /// the `Scaled` residual methods and checks both results.
    #[test]
    fn test_refinement_residual_scaling() {
        // 2x2 identity, so matrix * solution == solution.
        let matrix = CsrMatrix::from_csr(2, 2, vec![0, 1, 2], vec![0, 1], vec![1.0, 1.0]);
        let comm = UniverseComm::NoComm(NoComm);

        let rhs = vec![2.0, 3.0];
        let solution = vec![1.0, 1.0];
        let mut residual = vec![0.0; 2];
        let mut matvec_workspace = vec![0.0; 2];

        // Runs the residual computation with the given method on the shared
        // buffers and hands back a snapshot of the residual vector.
        let mut residual_with = |method: ResidualMethod| {
            RefinementEngine::compute_residual_static(
                &matrix,
                &rhs,
                &solution,
                &mut residual,
                &mut matvec_workspace,
                method,
                &comm,
            )
            .unwrap();
            residual.clone()
        };

        // Standard: rhs - matrix * solution = [2, 3] - [1, 1] = [1, 2].
        let plain = residual_with(ResidualMethod::Standard);
        assert_eq!(plain, vec![1.0, 2.0]);

        // Scaled: the same residual divided by ||rhs|| = sqrt(4 + 9).
        let scaled = residual_with(ResidualMethod::Scaled);
        let rhs_norm = (4.0 + 9.0_f64).sqrt();
        assert!((scaled[0] - 1.0 / rhs_norm).abs() < 1e-10);
        assert!((scaled[1] - 2.0 / rhs_norm).abs() < 1e-10);
    }

    /// Round-trips vectors through a `MemoryPool`: fetch, return, fetch again,
    /// then query the pool statistics.
    #[test]
    fn test_memory_pool_basic_operations() {
        let mut pool = MemoryPool::new(4, 1024 * 1024); // 1MB limit, max 4 vectors per size

        // Test getting vectors
        let vec1 = pool.get_f64_vector(100);
        assert_eq!(vec1.len(), 100);

        let vec2 = pool.get_usize_vector(50);
        assert_eq!(vec2.len(), 50);

        // Test returning vectors
        pool.return_f64_vector(vec1);
        pool.return_usize_vector(vec2);

        // Test reuse
        let vec3 = pool.get_f64_vector(100);
        assert_eq!(vec3.len(), 100);

        // Smoke check only: the statistics query must succeed and expose the
        // pooled-vector count. The count may legitimately be zero when the
        // returned vector was reused, so the former `> 0 || == 0` assertion
        // (always true for a count) has been dropped.
        let stats = pool.memory_stats();
        let _pooled = stats.f64_vectors_pooled;
    }

    /// Cycles two small vectors through a pool created with a very small limit
    /// and checks that the reported memory stays at or below 1024 bytes.
    #[test]
    fn test_memory_pool_size_limits() {
        let mut pool = MemoryPool::new(2, 1024); // Very small limit

        // Fetch and immediately hand back a 10-element vector, twice.
        for _ in 0..2 {
            let borrowed = pool.get_f64_vector(10);
            pool.return_f64_vector(borrowed);
        }

        let reported = pool.memory_stats().total_memory_bytes;
        assert!(reported <= 1024);
    }

    /// Requests send, receive and local buffers from a `CommBufferManager`,
    /// checks their lengths, then clears the manager and checks that the
    /// reported memory is below 1 MiB.
    #[test]
    fn test_comm_buffer_manager() {
        // NOTE(review): the original comment calls this a "10MB limit";
        // confirm which constructor argument carries the limit and its unit.
        let mut manager = CommBufferManager::new(1000, 10);

        // Each buffer must come back with exactly the requested length.
        assert_eq!(manager.get_send_buffer(0, 100).len(), 100);
        assert_eq!(manager.get_recv_buffer(1, 200).len(), 200);
        assert_eq!(manager.get_local_buffer(50).len(), 50);

        // Asking again for the same send buffer must still yield 100 entries.
        assert_eq!(manager.get_send_buffer(0, 100).len(), 100);

        manager.clear_buffers();
        let remaining = manager.memory_stats().total_memory_bytes;
        // After clearing, memory usage should be minimal (below 1 MiB).
        assert!(remaining < 1024 * 1024);
    }

    /// A workspace created with `new` reports no temporary vectors and zero
    /// total memory.
    #[test]
    fn test_superlu_dist_workspace_creation() {
        let fresh = SuperLuDistWorkspace::new();
        let report = fresh.memory_stats();

        assert_eq!(report.total_memory, 0);
        assert_eq!(report.temp_vectors_count, 0);
    }

    /// Smoke test: a workspace can be constructed, and its memory statistics
    /// queried, for each listed preallocation strategy.
    #[test]
    fn test_workspace_config_variants() {
        for strategy in [
            PreallocationStrategy::None,
            PreallocationStrategy::MatrixSize,
            PreallocationStrategy::ProcessGrid,
            PreallocationStrategy::BlockSize,
            PreallocationStrategy::Full,
        ] {
            let workspace = SuperLuDistWorkspace::with_config(WorkspaceConfig {
                preallocation_strategy: strategy,
                ..Default::default()
            });

            // Nothing is asserted on the numbers; construction and the stats
            // query simply have to succeed.
            let _stats = workspace.memory_stats();
        }
    }

    /// Requests the same named temporary vector twice and checks that the
    /// second request sees zeroed contents, then returns the vector.
    #[test]
    fn test_workspace_temp_vector_management() {
        let mut workspace = SuperLuDistWorkspace::new();

        {
            // First access: right length, then dirty the first slot.
            let first = workspace.get_temp_vector("test_vec", 100);
            assert_eq!(first.len(), 100);
            first[0] = 42.0;
        }

        {
            // Second access under the same name: the earlier write must be gone.
            let second = workspace.get_temp_vector("test_vec", 100);
            assert_eq!(second.len(), 100);
            assert_eq!(second[0], 0.0);
        }

        workspace.return_temp_vector("test_vec");

        // The vector may have been handed back to the pool, so at most one
        // temporary vector is still tracked.
        let tracked = workspace.memory_stats().temp_vectors_count;
        assert!(tracked <= 1);
    }

    /// Setting a default workspace up for a problem must leave it reporting a
    /// non-zero amount of memory.
    #[test]
    fn test_workspace_setup_for_problem() {
        // Process grid derived automatically from a single-process communicator.
        let comm = UniverseComm::NoComm(NoComm);
        let grid = ProcessGrid::new_auto(&comm).unwrap();

        let mut workspace = SuperLuDistWorkspace::new();
        workspace.setup_for_problem(1000, &grid, 64).unwrap();

        // The default strategy is expected to preallocate something.
        let report = workspace.memory_stats();
        assert!(report.total_memory > 0);
    }

    /// `optimize` must never increase the memory a workspace reports.
    #[test]
    fn test_workspace_optimization() {
        let mut workspace = SuperLuDistWorkspace::new();

        // Populate the workspace with two temporary vectors.
        for (name, len) in [("temp1", 100), ("temp2", 200)] {
            workspace.get_temp_vector(name, len);
        }

        let before = workspace.memory_stats();
        workspace.optimize();
        let after = workspace.memory_stats();

        assert!(after.total_memory <= before.total_memory);
    }

    /// Applies workspace settings through the solver's chained setters and
    /// verifies that each one is stored in `workspace_config`.
    #[test]
    fn test_solver_workspace_configuration() {
        let mut solver = SuperLuDistSolver::new();

        // Test configuring workspace settings
        solver
            .set_workspace_memory_limit(2048)
            .set_aggressive_memory_reuse(true)
            .set_preallocation_strategy(PreallocationStrategy::MatrixSize);

        // Configuration should be stored
        let stored = &solver.workspace_config;
        assert_eq!(stored.memory_limit_mb, 2048);
        // Boolean flags are asserted directly instead of compared to `true`.
        assert!(stored.aggressive_reuse);
        assert_eq!(
            stored.preallocation_strategy,
            PreallocationStrategy::MatrixSize
        );
    }

    /// Without a completed setup the solver reports no workspace memory
    /// statistics, and asking repeatedly does not change that.
    #[test]
    fn test_workspace_memory_stats() {
        let mut solver = SuperLuDistSolver::new();
        solver.set_workspace_memory_limit(512);

        // Query twice: statistics would only become available after a
        // complete solve, which this test does not perform.
        for _ in 0..2 {
            assert!(solver.workspace_memory_stats().is_none());
        }
    }

    /// On a fresh solver no workspace cleanup is pending, and the workspace
    /// maintenance calls succeed.
    #[test]
    fn test_workspace_cleanup_detection() {
        let mut solver = SuperLuDistSolver::new();

        // Nothing has been set up yet, so nothing needs cleaning.
        assert!(!solver.workspace_needs_cleanup());

        // Both maintenance entry points must report success.
        assert!(solver.optimize_workspace().is_ok());
        assert!(solver.clear_workspace_temp_data().is_ok());
    }

    /// With a 1 MB limit, aggressive reuse and no preallocation, two tiny
    /// temporary vectors must stay below the limit, and clearing temporary
    /// data must not increase the reported memory.
    #[test]
    fn test_workspace_memory_efficiency() {
        let config = WorkspaceConfig {
            memory_limit_mb: 1, // Very small limit
            aggressive_reuse: true,
            preallocation_strategy: PreallocationStrategy::None,
            ..Default::default()
        };
        let mut workspace = SuperLuDistWorkspace::with_config(config);

        for name in ["small1", "small2"] {
            workspace.get_temp_vector(name, 10);
        }

        // Should be well under the limit for small vectors.
        let limit_bytes = 1024 * 1024; // 1MB
        let before_clear = workspace.memory_stats();
        assert!(before_clear.total_memory < limit_bytes);

        // Cleanup must not make the footprint grow.
        workspace.clear_temp_data();
        let after_clear = workspace.memory_stats();
        assert!(after_clear.total_memory <= before_clear.total_memory);
    }

    /// Smoke test: a `SolveWorkspace` can be assembled from a fresh workspace
    /// and empty vector maps, and its inner workspace answers a memory
    /// statistics query.
    #[test]
    fn test_solve_workspace_integration() {
        // Test the enhanced SolveWorkspace structure
        let workspace = SuperLuDistWorkspace::new();
        let process_vectors = HashMap::new();
        let global_vectors = HashMap::new();

        let solve_workspace = SolveWorkspace {
            workspace,
            process_vectors,
            global_vectors,
        };

        // Reaching this point without a panic is the whole check; the former
        // `assert!(true)` added nothing and has been removed.
        let _stats = solve_workspace.workspace.memory_stats();
    }

    /// Configures a solver through the builder API and verifies that the
    /// asserted options and workspace settings reach the built solver.
    #[test]
    fn test_superlu_dist_builder_pattern() {
        let solver = SuperLuDistSolver::builder()
            .diagonal_pivot_threshold(0.2)
            .column_permutation(ColumnPermutation::Metis)
            .row_permutation(RowPermutation::LargeDiag)
            .iterative_refinement(IterativeRefinement::Double)
            .print_level(1)
            .replace_tiny_pivots(true)
            .static_pivoting(false)
            .process_grid(2, 2)
            .panel_size(32)
            .enable_3d_factorization(false, None)
            .memory_tradeoff_factor(1.5)
            .max_concurrent_panels(2)
            .async_panel_updates(true)
            .workspace_memory_limit(1024)
            .aggressive_memory_reuse(true)
            .preallocation_strategy(PreallocationStrategy::MatrixSize)
            .build();

        // Solver options. Boolean flags are asserted directly rather than
        // compared against `true` / `false` literals.
        let options = &solver.options;
        assert_eq!(options.diagonal_pivot_threshold, 0.2);
        assert_eq!(options.column_permutation, ColumnPermutation::Metis);
        assert_eq!(options.row_permutation, RowPermutation::LargeDiag);
        assert_eq!(options.iterative_refinement, IterativeRefinement::Double);
        assert_eq!(options.print_level, 1);
        assert!(options.replace_tiny_pivots);
        assert!(!options.static_pivoting);
        assert_eq!(options.process_grid, Some((2, 2)));
        assert_eq!(options.panel_size, Some(32));
        assert!(!options.enable_3d_factorization);
        assert_eq!(options.memory_tradeoff_factor, 1.5);
        assert_eq!(options.max_concurrent_panels, 2);
        assert!(options.async_panel_updates);

        // Workspace settings.
        let workspace_config = &solver.workspace_config;
        assert_eq!(workspace_config.memory_limit_mb, 1024);
        assert!(workspace_config.aggressive_reuse);
        assert_eq!(
            workspace_config.preallocation_strategy,
            PreallocationStrategy::MatrixSize
        );
    }

    /// Configures an existing solver through its chained `set_*` methods and
    /// verifies that every setting is stored.
    #[test]
    fn test_superlu_dist_fluent_configuration() {
        let mut solver = SuperLuDistSolver::new();

        solver
            .set_diagonal_pivot_threshold(0.3)
            .set_column_permutation(ColumnPermutation::ParMetis)
            .set_row_permutation(RowPermutation::NoRowPerm)
            .set_iterative_refinement(IterativeRefinement::Single)
            .set_print_level(2)
            .set_replace_tiny_pivots(false)
            .set_static_pivoting(true)
            .set_process_grid(4, 1)
            .set_panel_size(64)
            .set_3d_factorization(true, Some(2))
            .set_memory_tradeoff(2.0)
            .set_max_concurrent_panels(4)
            .set_async_panel_updates(false)
            .set_workspace_memory_limit(2048)
            .set_aggressive_memory_reuse(false)
            .set_preallocation_strategy(PreallocationStrategy::ProcessGrid);

        // Solver options. Boolean flags are asserted directly rather than
        // compared against `true` / `false` literals.
        let options = &solver.options;
        assert_eq!(options.diagonal_pivot_threshold, 0.3);
        assert_eq!(options.column_permutation, ColumnPermutation::ParMetis);
        assert_eq!(options.row_permutation, RowPermutation::NoRowPerm);
        assert_eq!(options.iterative_refinement, IterativeRefinement::Single);
        assert_eq!(options.print_level, 2);
        assert!(!options.replace_tiny_pivots);
        assert!(options.static_pivoting);
        assert_eq!(options.process_grid, Some((4, 1)));
        assert_eq!(options.panel_size, Some(64));
        assert!(options.enable_3d_factorization);
        assert_eq!(options.process_grid_3d_depth, Some(2));
        assert_eq!(options.memory_tradeoff_factor, 2.0);
        assert_eq!(options.max_concurrent_panels, 4);
        assert!(!options.async_panel_updates);

        // Workspace settings.
        let workspace_config = &solver.workspace_config;
        assert_eq!(workspace_config.memory_limit_mb, 2048);
        assert!(!workspace_config.aggressive_reuse);
        assert_eq!(
            workspace_config.preallocation_strategy,
            PreallocationStrategy::ProcessGrid
        );
    }

    /// Builds a solver with an explicit refinement configuration and residual
    /// method, then checks that the refinement engine exists and exposes the
    /// configured values through `config()`.
    #[test]
    fn test_superlu_dist_builder_with_refinement() {
        let solver = SuperLuDistSolver::builder()
            .diagonal_pivot_threshold(0.1)
            .iterative_refinement(IterativeRefinement::Double)
            .refinement_config(RefinementConfig {
                max_iterations: 5,
                tolerance: 1e-10,
                relative_tolerance: 1e-8,
                min_improvement_factor: 0.95,
            })
            .residual_method(ResidualMethod::Scaled)
            .build();

        assert!(solver.refinement_engine.is_some());
        assert_eq!(
            solver.options.iterative_refinement,
            IterativeRefinement::Double
        );

        // The engine is known to exist at this point, so every value of the
        // stored configuration can be inspected.
        if let Some(engine) = solver.refinement_engine.as_ref() {
            let stored = engine.config();
            assert_eq!(stored.max_iterations, 5);
            assert_eq!(stored.tolerance, 1e-10);
            assert_eq!(stored.relative_tolerance, 1e-8);
            assert_eq!(stored.min_improvement_factor, 0.95);
        }
    }

    /// An explicit process grid is stored as `Some((rows, cols))`, and
    /// switching to automatic selection resets it to `None`.
    #[test]
    fn test_superlu_dist_auto_process_grid() {
        let mut solver = SuperLuDistSolver::new();

        // Explicit 2x3 grid.
        solver.set_process_grid(2, 3);
        let explicit = solver.options.process_grid;
        assert_eq!(explicit, Some((2, 3)));

        // Back to automatic selection.
        solver.set_process_grid_auto();
        assert!(solver.options.process_grid.is_none());
    }

    /// Replaces the whole option set of a solver via `with_complete_options`
    /// and verifies field by field that the solver now holds the new values.
    #[test]
    fn test_superlu_dist_complete_options_replacement() {
        let expected = SuperLuDistOptions {
            process_grid: Some((1, 4)),
            column_permutation: ColumnPermutation::Natural,
            diagonal_pivot_threshold: 0.01,
            replace_tiny_pivots: false,
            iterative_refinement: IterativeRefinement::Extra,
            print_level: 3,
            static_pivoting: true,
            row_permutation: RowPermutation::User,
            panel_size: Some(128),
            enable_3d_factorization: true,
            process_grid_3d_depth: Some(4),
            memory_tradeoff_factor: 3.0,
            max_concurrent_panels: 8,
            async_panel_updates: true,
        };

        // A clone is handed over because `expected` is still needed below.
        let solver = SuperLuDistSolver::new().with_complete_options(expected.clone());
        let actual = &solver.options;

        assert_eq!(actual.process_grid, expected.process_grid);
        assert_eq!(actual.column_permutation, expected.column_permutation);
        assert_eq!(
            actual.diagonal_pivot_threshold,
            expected.diagonal_pivot_threshold
        );
        assert_eq!(actual.replace_tiny_pivots, expected.replace_tiny_pivots);
        assert_eq!(actual.iterative_refinement, expected.iterative_refinement);
        assert_eq!(actual.print_level, expected.print_level);
        assert_eq!(actual.static_pivoting, expected.static_pivoting);
        assert_eq!(actual.row_permutation, expected.row_permutation);
        assert_eq!(actual.panel_size, expected.panel_size);
        assert_eq!(
            actual.enable_3d_factorization,
            expected.enable_3d_factorization
        );
        assert_eq!(actual.process_grid_3d_depth, expected.process_grid_3d_depth);
        assert_eq!(
            actual.memory_tradeoff_factor,
            expected.memory_tradeoff_factor
        );
        assert_eq!(actual.max_concurrent_panels, expected.max_concurrent_panels);
        assert_eq!(actual.async_panel_updates, expected.async_panel_updates);
    }

    /// Checks that a solve on a fresh solver succeeds and leaves solver data
    /// behind: `solver.data` is `None` before the call and `Some` afterwards.
    ///
    /// Despite the name, only the success path is exercised here.
    #[test]
    fn test_superlu_dist_linear_solver_error_handling() {
        use crate::parallel::{NoComm, UniverseComm};

        let comm = UniverseComm::NoComm(NoComm);
        let mut solver = SuperLuDistSolver::new();

        // 2x2 identity with a right-hand side and solution of matching size.
        let matrix = CsrMatrix::from_csr(
            2,
            2, // Square matrix
            vec![0, 1, 2],
            vec![0, 1],
            vec![1.0, 1.0],
        );
        let rhs = vec![1.0, 2.0];
        let mut solution = vec![0.0, 0.0]; // Match RHS size

        // Nothing has been set up before the first solve.
        assert!(solver.data.is_none());

        // The solve is expected to succeed and to populate the solver data
        // on its own.
        let outcome = crate::solver::legacy::LinearSolver::solve(
            &mut solver,
            &matrix,
            None,
            &rhs,
            &mut solution,
            crate::preconditioner::PcSide::Left,
            &comm,
            None,
            None,
        );
        assert!(outcome.is_ok());

        assert!(solver.data.is_some());
    }

    /// A solver built from an untouched `SuperLuDistBuilder` must carry the
    /// default option values and no refinement engine.
    #[test]
    fn test_superlu_dist_builder_defaults() {
        let builder = SuperLuDistBuilder::new();
        let solver = builder.build();

        // Check that default values are properly set. Boolean flags are
        // asserted directly rather than compared against `false`.
        let options = &solver.options;
        assert_eq!(options.diagonal_pivot_threshold, 1.0);
        assert_eq!(options.column_permutation, ColumnPermutation::MmdAta);
        assert_eq!(options.row_permutation, RowPermutation::LargeDiag);
        assert_eq!(options.iterative_refinement, IterativeRefinement::Double);
        assert_eq!(options.print_level, 0);
        assert!(!options.replace_tiny_pivots);
        assert!(!options.static_pivoting);
        assert_eq!(options.process_grid, None);
        assert_eq!(options.panel_size, None);
        assert!(!options.enable_3d_factorization);
        assert_eq!(options.process_grid_3d_depth, None);
        assert_eq!(options.memory_tradeoff_factor, 1.0);
        assert_eq!(options.max_concurrent_panels, 1);
        assert!(!options.async_panel_updates);
        assert!(solver.refinement_engine.is_none());
    }

    /// Checks the message tags queued by
    /// `DistributedTriangularSolver::start_nonblocking_broadcast` when a 3D
    /// process grid is supplied. Only compiled with the `superlu3d` feature.
    ///
    /// The test builds a 1x1 2D grid, a 1x1x3 3D grid with this process at
    /// depth 0, starts a point-to-point broadcast for block 5 and expects
    /// exactly two pending requests tagged `(block_id << 8) + 1` and
    /// `(block_id << 8) + 2`.
    #[test]
    #[cfg(feature = "superlu3d")]
    fn test_request_schedule_3d() {
        use crate::parallel::NoComm;
        let comm = UniverseComm::NoComm(NoComm);
        // Single-process 2D grid; this process sits at position (0, 0).
        let grid2d = ProcessGrid {
            prows: 1,
            pcols: 1,
            my_prow: 0,
            my_pcol: 0,
            my_rank: 0,
            total_procs: 1,
        };
        let dist = BlockCyclicDistribution::new(grid2d.clone(), 0, 0, 1, 1);
        // 3D grid with three depth layers; this process is at depth 0.
        // NOTE(review): `total_procs` is 1 although 1 * 1 * 3 = 3 - looks
        // deliberate for a single-process test, confirm.
        let grid3d = ProcessGrid3D {
            prows: 1,
            pcols: 1,
            pdepth: 3,
            my_prow: 0,
            my_pcol: 0,
            my_pdepth: 0,
            my_rank: 0,
            total_procs: 1,
        };
        // Mostly empty solve state; only `block_owners` has an entry.
        let mut solve_data = TriangularSolveData {
            local_solution_blocks: vec![],
            comm_buffer: vec![],
            pending_requests: vec![],
            block_owners: vec![0],
            block_sizes: vec![],
            local_l_factors: vec![],
            local_u_factors: vec![],
            dependency_graph: vec![],
        };
        let block_id = 5usize;
        DistributedTriangularSolver::start_nonblocking_broadcast(
            &mut solve_data,
            &[0.0],
            block_id,
            &dist,
            CommPattern::PointToPoint,
            &comm,
            Some(&grid3d),
        )
        .unwrap();
        // Collect and sort the tags so the comparison does not depend on the
        // order in which the requests were queued.
        let mut tags: Vec<usize> = solve_data
            .pending_requests
            .iter()
            .map(|r| r.meta.tag)
            .collect();
        tags.sort_unstable();
        // Presumably one request per other depth layer (1 and 2), with the
        // layer index added to `block_id << 8` - TODO confirm against
        // `start_nonblocking_broadcast`.
        let expected = vec![(block_id << 8) + 1, (block_id << 8) + 2];
        assert_eq!(tags, expected);
    }
}

/// Compile-time checks that the main solver types can be moved across
/// threads. The assertions have no runtime effect: the test passes as soon as
/// the module compiles.
#[cfg(test)]
mod send_sync_checks {
    use super::*;

    /// Compiles only when `T` is `Send`.
    fn assert_send<T>()
    where
        T: Send,
    {
    }

    /// Compiles only when `T` is both `Send` and `Sync`. Currently unused;
    /// kept for the disabled test at the bottom of this module.
    #[allow(dead_code)]
    fn assert_send_sync<T>()
    where
        T: Send + Sync,
    {
    }

    #[test]
    fn superlu_types_send() {
        assert_send::<SuperLuDistSolver>();
        assert_send::<SuperLuDistData>();
        assert_send::<SuperLuDistWorkspace>();
        assert_send::<MemoryPool>();
        assert_send::<CommBufferManager>();
    }

    // Uncomment if Sync should be guaranteed in future.
    // #[test]
    // fn superlu_types_send_sync() {
    //     assert_send_sync::<SuperLuDistSolver>();
    //     assert_send_sync::<SuperLuDistWorkspace>();
    // }
}