use crate::utils::Float;
use std::fmt;
#[cfg(target_arch = "wasm32")]
use web_time::Instant;
#[cfg(not(target_arch = "wasm32"))]
use std::time::Instant;
/// Wrapper that formats a float roughly like C's `%g` with 6 significant
/// digits: fixed notation in the mid range, scientific notation otherwise,
/// with trailing zeros (and a trailing dot) trimmed.
struct G<T>(T);
impl fmt::Display for G<f64> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let value = self.0;
        if value == 0.0 {
            return f.write_str("0");
        }
        // Decimal exponent of |value| picks fixed vs. scientific notation.
        let magnitude = value.abs().log10().floor() as i32;
        if (-4..6).contains(&magnitude) {
            // Fixed notation with up to 6 significant digits, then trim.
            let digits = (5 - magnitude).max(0) as usize;
            let text = format!("{:.*}", digits, value);
            if text.contains('.') {
                f.write_str(text.trim_end_matches('0').trim_end_matches('.'))
            } else {
                f.write_str(&text)
            }
        } else {
            // Scientific notation; trim trailing zeros from the mantissa only.
            let text = format!("{:.5e}", value);
            match text.find('e') {
                Some(split) => {
                    let (mantissa, exponent) = text.split_at(split);
                    let mantissa = mantissa.trim_end_matches('0').trim_end_matches('.');
                    write!(f, "{}{}", mantissa, exponent)
                }
                None => f.write_str(&text),
            }
        }
    }
}
impl fmt::Display for G<f32> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Widen to f64 and reuse the f64 formatter.
        write!(f, "{}", G(f64::from(self.0)))
    }
}
pub const FIXED_SIZE_THRESHOLD: usize = 9;
/// Tuning knobs for `lm_solve`.
pub struct LmConfig<T> {
    /// Absolute cost improvement below which a step counts as "small".
    pub abs_precision: T,
    /// Relative cost improvement below which a step counts as "small".
    pub rel_precision: T,
    /// Hard cap on iterations (accepted and rejected steps both count).
    pub max_iters: usize,
    /// Minimum iterations before any convergence criterion may fire.
    pub min_iters: usize,
    /// Consecutive "small" improvements required before stopping.
    pub patience: usize,
    /// Starting value of the damping parameter lambda.
    pub initial_lambda: T,
    /// Stop early once the cost drops to this value or below.
    pub cost_threshold: T,
    /// Print per-iteration progress to stderr.
    pub verbose: bool,
}
impl<T: Float> Default for LmConfig<T> {
    /// General-purpose defaults: loose precisions, 100 iterations, small
    /// initial damping, no cost threshold, quiet output.
    fn default() -> Self {
        Self {
            max_iters: 100,
            min_iters: 5,
            patience: 3,
            verbose: false,
            abs_precision: T::from(1e-6).unwrap(),
            rel_precision: T::from(1e-4).unwrap(),
            initial_lambda: T::from(1e-4).unwrap(),
            cost_threshold: T::zero(),
        }
    }
}
/// Error raised during banded Hessian assembly when a matrix element falls
/// outside the declared bandwidth.
#[derive(Debug)]
pub struct BandError {
    pub row: usize,
    pub col: usize,
    pub kd: usize,
}
impl std::fmt::Display for BandError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "band overflow: element ({}, {}) exceeds bandwidth kd={}",
            self.row, self.col, self.kd
        )
    }
}
/// Problem interface for the LM driver: cost evaluation plus one
/// gradient/Hessian assembly method per matrix backend. A problem only needs
/// working implementations of the variants used by the chosen `LmSolver`.
pub trait LmProblem<T> {
    /// Scalar cost at `params`.
    fn calc_cost(&mut self, params: &[T]) -> T;
    /// Assemble gradient and full dense Hessian.
    fn calc_grad_hessian_dense(&mut self, params: &[T], grad: &mut [T], hessian: &mut [T]);
    /// Assemble gradient and banded Hessian with bandwidth `kd`; errors when
    /// an element falls outside the band.
    fn calc_grad_hessian_band(&mut self, params: &[T], grad: &mut [T], band: &mut [T], kd: usize) -> Result<(), BandError>;
    /// Assemble gradient and Hessian in COO (triplet) form.
    fn calc_grad_hessian_sparse(&mut self, params: &[T], grad: &mut [T], coo: &mut CooMatrix<T>);
    /// Assemble directly into an existing CSC matrix (fixed sparsity pattern).
    fn calc_grad_hessian_sparse_direct(&mut self, params: &[T], grad: &mut [T], csc: &mut CscMatrix<T>);
    /// Write values straight into `vals` through a precomputed position map.
    fn calc_grad_hessian_sparse_indexed(&mut self, params: &[T], grad: &mut [T], vals: &mut [T], positions: &[usize]);
    /// Optional hook invoked after a step is accepted (default: no-op).
    fn advance(&mut self, _params: &mut [T]) {}
}
/// Adapter turning a pair of closures (cost, dense gradient/Hessian) into an
/// `LmProblem`; supports only the dense assembly path.
pub struct FnProblem<F1, F2> {
    /// Cost closure: `params -> cost`.
    pub cost: F1,
    /// Dense assembly closure: `(params, grad, hessian)`.
    pub grad_hessian: F2,
}
impl<T, F1, F2> LmProblem<T> for FnProblem<F1, F2>
where
    F1: FnMut(&[T]) -> T,
    F2: FnMut(&[T], &mut [T], &mut [T]),
{
    fn calc_cost(&mut self, params: &[T]) -> T {
        (self.cost)(params)
    }
    fn calc_grad_hessian_dense(&mut self, params: &[T], grad: &mut [T], hessian: &mut [T]) {
        (self.grad_hessian)(params, grad, hessian)
    }
    // The closures only provide the dense path; pairing FnProblem with a
    // banded or sparse backend is a programming error, hence the panics below.
    fn calc_grad_hessian_band(&mut self, _params: &[T], _grad: &mut [T], _band: &mut [T], _kd: usize) -> Result<(), BandError> {
        unimplemented!("FnProblem does not support band assembly")
    }
    fn calc_grad_hessian_sparse(&mut self, _params: &[T], _grad: &mut [T], _coo: &mut CooMatrix<T>) {
        unimplemented!("FnProblem does not support sparse assembly")
    }
    fn calc_grad_hessian_sparse_direct(&mut self, _params: &[T], _grad: &mut [T], _csc: &mut CscMatrix<T>) {
        unimplemented!("FnProblem does not support sparse direct assembly")
    }
    fn calc_grad_hessian_sparse_indexed(&mut self, _params: &[T], _grad: &mut [T], _vals: &mut [T], _positions: &[usize]) {
        unimplemented!("FnProblem does not support sparse indexed assembly")
    }
}
/// Outcome of an LM run.
pub struct LmResult<T> {
    /// Final parameter vector.
    pub x: Vec<T>,
    /// Cost at the initial parameters.
    pub start_cost: T,
    /// Cost at the final parameters.
    pub end_cost: T,
    /// Number of iterations performed (accepted and rejected steps).
    pub iterations: usize,
}
/// Fixed-size SPD solve via nalgebra Cholesky; `a` is row-major `N x N`.
/// Writes the solution into `x` and returns `true`, or returns `false`
/// (leaving `x` untouched) when `a` is not positive definite.
fn cholesky_solve<const N: usize>(a: &[f64], b: &[f64], x: &mut [f64]) -> bool {
    let lhs = nalgebra::SMatrix::<f64, N, N>::from_row_slice(a);
    let rhs = nalgebra::SVector::<f64, N>::from_column_slice(b);
    if let Some(factor) = nalgebra::linalg::Cholesky::new(lhs) {
        x.copy_from_slice(factor.solve(&rhs).as_slice());
        true
    } else {
        false
    }
}
/// `f32` twin of `cholesky_solve`: fixed-size SPD solve via Cholesky.
fn cholesky_solve_f32<const N: usize>(a: &[f32], b: &[f32], x: &mut [f32]) -> bool {
    let lhs = nalgebra::SMatrix::<f32, N, N>::from_row_slice(a);
    let rhs = nalgebra::SVector::<f32, N>::from_column_slice(b);
    if let Some(factor) = nalgebra::linalg::Cholesky::new(lhs) {
        x.copy_from_slice(factor.solve(&rhs).as_slice());
        true
    } else {
        false
    }
}
/// Dispatch an SPD solve to the const-generic implementation matching the
/// runtime dimension `$n` (1..=9, i.e. up to `FIXED_SIZE_THRESHOLD`).
/// Callers must pre-check the range; any other value hits `unreachable!`.
macro_rules! dispatch_cholesky {
    ($solve_fn:ident, $n:expr, $a:expr, $b:expr, $x:expr) => {
        match $n {
            1 => $solve_fn::<1>($a, $b, $x),
            2 => $solve_fn::<2>($a, $b, $x),
            3 => $solve_fn::<3>($a, $b, $x),
            4 => $solve_fn::<4>($a, $b, $x),
            5 => $solve_fn::<5>($a, $b, $x),
            6 => $solve_fn::<6>($a, $b, $x),
            7 => $solve_fn::<7>($a, $b, $x),
            8 => $solve_fn::<8>($a, $b, $x),
            9 => $solve_fn::<9>($a, $b, $x),
            _ => unreachable!(),
        }
    };
}
/// Heap-allocated `n x n` SPD solve via nalgebra Cholesky (row-major `a`).
fn cholesky_solve_dynamic(n: usize, a: &[f64], b: &[f64], x: &mut [f64]) -> bool {
    let lhs = nalgebra::DMatrix::from_row_slice(n, n, a);
    let rhs = nalgebra::DVector::from_column_slice(b);
    if let Some(factor) = nalgebra::linalg::Cholesky::new(lhs) {
        x.copy_from_slice(factor.solve(&rhs).as_slice());
        true
    } else {
        false
    }
}
/// `f32` twin of `cholesky_solve_dynamic`.
fn cholesky_solve_dynamic_f32(n: usize, a: &[f32], b: &[f32], x: &mut [f32]) -> bool {
    let lhs = nalgebra::DMatrix::from_row_slice(n, n, a);
    let rhs = nalgebra::DVector::from_column_slice(b);
    if let Some(factor) = nalgebra::linalg::Cholesky::new(lhs) {
        x.copy_from_slice(factor.solve(&rhs).as_slice());
        true
    } else {
        false
    }
}
pub fn solve_spd(n: usize, a: &[f64], b: &[f64], x: &mut [f64]) -> bool {
if n <= 9 {
dispatch_cholesky!(cholesky_solve, n, a, b, x)
} else {
cholesky_solve_dynamic(n, a, b, x)
}
}
pub fn solve_spd_f32(n: usize, a: &[f32], b: &[f32], x: &mut [f32]) -> bool {
if n <= 9 {
dispatch_cholesky!(cholesky_solve_f32, n, a, b, x)
} else {
cholesky_solve_dynamic_f32(n, a, b, x)
}
}
/// Linear-algebra backend for `lm_solve`: owns the Hessian representation and
/// solves the damped normal equations.
pub trait LmSolver<T: Float> {
    /// Backend-specific Hessian storage.
    type Matrix;
    /// Allocate Hessian storage for an `n`-parameter problem.
    fn new_matrix(&self, n: usize) -> Self::Matrix;
    /// Assemble gradient and Hessian at `params`.
    fn compute(&self, problem: &mut dyn LmProblem<T>, params: &[T], grad: &mut [T], matrix: &mut Self::Matrix);
    /// Copy out the undamped Hessian diagonal. It is saved by the driver so
    /// damping can be re-applied with different lambdas on the same assembly.
    fn extract_diagonal(&self, matrix: &Self::Matrix, diagonal: &mut [T]);
    /// Replace the diagonal with `(1 + lambda) * diagonal` and solve for
    /// `delta`; returns `false` when the damped matrix is not positive
    /// definite (the driver then increases lambda and retries).
    fn solve_damped(&mut self, n: usize, matrix: &mut Self::Matrix, diagonal: &[T], lambda: T, grad: &[T], delta: &mut [T]) -> bool;
}
pub struct Dense;
impl LmSolver<f64> for Dense {
    type Matrix = Vec<f64>;
    /// Row-major `n x n` dense Hessian buffer.
    fn new_matrix(&self, n: usize) -> Vec<f64> {
        vec![0.0; n * n]
    }
    fn compute(&self, problem: &mut dyn LmProblem<f64>, params: &[f64], grad: &mut [f64], hessian: &mut Vec<f64>) {
        problem.calc_grad_hessian_dense(params, grad, hessian);
    }
    fn extract_diagonal(&self, matrix: &Vec<f64>, diagonal: &mut [f64]) {
        let n = diagonal.len();
        for (i, slot) in diagonal.iter_mut().enumerate() {
            *slot = matrix[i * n + i];
        }
    }
    /// Write the damped diagonal over the stored one, then solve in place.
    fn solve_damped(&mut self, n: usize, matrix: &mut Vec<f64>, diagonal: &[f64], lambda: f64, grad: &[f64], delta: &mut [f64]) -> bool {
        let scale = 1.0 + lambda;
        for i in 0..n {
            matrix[i * n + i] = scale * diagonal[i];
        }
        solve_spd(n, matrix, grad, delta)
    }
}
impl LmSolver<f32> for Dense {
    type Matrix = Vec<f32>;
    /// Row-major `n x n` dense Hessian buffer.
    fn new_matrix(&self, n: usize) -> Vec<f32> {
        vec![0.0; n * n]
    }
    fn compute(&self, problem: &mut dyn LmProblem<f32>, params: &[f32], grad: &mut [f32], hessian: &mut Vec<f32>) {
        problem.calc_grad_hessian_dense(params, grad, hessian);
    }
    fn extract_diagonal(&self, matrix: &Vec<f32>, diagonal: &mut [f32]) {
        let n = diagonal.len();
        for (i, slot) in diagonal.iter_mut().enumerate() {
            *slot = matrix[i * n + i];
        }
    }
    /// Write the damped diagonal over the stored one, then solve in place.
    fn solve_damped(&mut self, n: usize, matrix: &mut Vec<f32>, diagonal: &[f32], lambda: f32, grad: &[f32], delta: &mut [f32]) -> bool {
        let scale = 1.0 + lambda;
        for i in 0..n {
            matrix[i * n + i] = scale * diagonal[i];
        }
        solve_spd_f32(n, matrix, grad, delta)
    }
}
/// Banded-Hessian backend. Storage is column-major with `kd + 1` entries per
/// column and the diagonal at offset `kd` within each column (see
/// `extract_diagonal`).
pub struct Band {
    /// Bandwidth parameter: number of off-diagonals stored per column.
    pub kd: usize,
}
impl LmSolver<f64> for Band {
    type Matrix = Vec<f64>;
    /// Band storage: `kd + 1` slots per column.
    fn new_matrix(&self, n: usize) -> Vec<f64> {
        vec![0.0; (self.kd + 1) * n]
    }
    fn compute(&self, problem: &mut dyn LmProblem<f64>, params: &[f64], grad: &mut [f64], band: &mut Vec<f64>) {
        problem.calc_grad_hessian_band(params, grad, band, self.kd)
            .expect("band assembly failed: element outside bandwidth");
    }
    fn extract_diagonal(&self, matrix: &Vec<f64>, diagonal: &mut [f64]) {
        let ldab = self.kd + 1;
        for (i, slot) in diagonal.iter_mut().enumerate() {
            *slot = matrix[self.kd + i * ldab];
        }
    }
    fn solve_damped(&mut self, n: usize, matrix: &mut Vec<f64>, diagonal: &[f64], lambda: f64, grad: &[f64], delta: &mut [f64]) -> bool {
        let ldab = self.kd + 1;
        // The band solve mutates its input, so damp a scratch copy and keep
        // `matrix` intact for retries with a different lambda.
        let mut work = matrix.clone();
        let scale = 1.0 + lambda;
        for i in 0..n {
            work[self.kd + i * ldab] = scale * diagonal[i];
        }
        // delta doubles as RHS and in-place solution.
        delta.copy_from_slice(grad);
        solve_spd_band(n, self.kd, &mut work, delta)
    }
}
impl LmSolver<f32> for Band {
    type Matrix = Vec<f32>;
    /// Band storage: `kd + 1` slots per column.
    fn new_matrix(&self, n: usize) -> Vec<f32> {
        vec![0.0; (self.kd + 1) * n]
    }
    fn compute(&self, problem: &mut dyn LmProblem<f32>, params: &[f32], grad: &mut [f32], band: &mut Vec<f32>) {
        problem.calc_grad_hessian_band(params, grad, band, self.kd)
            .expect("band assembly failed: element outside bandwidth");
    }
    fn extract_diagonal(&self, matrix: &Vec<f32>, diagonal: &mut [f32]) {
        let ldab = self.kd + 1;
        for (i, slot) in diagonal.iter_mut().enumerate() {
            *slot = matrix[self.kd + i * ldab];
        }
    }
    fn solve_damped(&mut self, n: usize, matrix: &mut Vec<f32>, diagonal: &[f32], lambda: f32, grad: &[f32], delta: &mut [f32]) -> bool {
        let ldab = self.kd + 1;
        // Damp a scratch copy so `matrix` survives lambda retries.
        let mut work = matrix.clone();
        let scale = 1.0 + lambda;
        for i in 0..n {
            work[self.kd + i * ldab] = scale * diagonal[i];
        }
        // delta doubles as RHS and in-place solution.
        delta.copy_from_slice(grad);
        solve_spd_band_f32(n, self.kd, &mut work, delta)
    }
}
/// Banded backend delegating the damped solve to `solve_spd_band_lapack`;
/// same band layout as `Band`.
#[cfg(feature = "lapack")]
pub struct BandLapack {
    /// Bandwidth parameter: number of off-diagonals stored per column.
    pub kd: usize,
}
#[cfg(feature = "lapack")]
impl LmSolver<f64> for BandLapack {
    type Matrix = Vec<f64>;
    /// Band storage: `kd + 1` slots per column.
    fn new_matrix(&self, n: usize) -> Vec<f64> {
        vec![0.0; (self.kd + 1) * n]
    }
    fn compute(&self, problem: &mut dyn LmProblem<f64>, params: &[f64], grad: &mut [f64], band: &mut Vec<f64>) {
        problem.calc_grad_hessian_band(params, grad, band, self.kd)
            .expect("band assembly failed: element outside bandwidth");
    }
    fn extract_diagonal(&self, matrix: &Vec<f64>, diagonal: &mut [f64]) {
        let ldab = self.kd + 1;
        for (i, slot) in diagonal.iter_mut().enumerate() {
            *slot = matrix[self.kd + i * ldab];
        }
    }
    fn solve_damped(&mut self, n: usize, matrix: &mut Vec<f64>, diagonal: &[f64], lambda: f64, grad: &[f64], delta: &mut [f64]) -> bool {
        let ldab = self.kd + 1;
        // The solve mutates its input; damp a scratch copy so `matrix`
        // survives lambda retries.
        let mut work = matrix.clone();
        let scale = 1.0 + lambda;
        for i in 0..n {
            work[self.kd + i * ldab] = scale * diagonal[i];
        }
        // delta doubles as RHS and in-place solution.
        delta.copy_from_slice(grad);
        solve_spd_band_lapack(n, self.kd, &mut work, delta)
    }
}
#[cfg(feature = "lapack")]
impl LmSolver<f32> for BandLapack {
    type Matrix = Vec<f32>;
    /// Band storage: `kd + 1` slots per column.
    fn new_matrix(&self, n: usize) -> Vec<f32> {
        vec![0.0; (self.kd + 1) * n]
    }
    fn compute(&self, problem: &mut dyn LmProblem<f32>, params: &[f32], grad: &mut [f32], band: &mut Vec<f32>) {
        problem.calc_grad_hessian_band(params, grad, band, self.kd)
            .expect("band assembly failed: element outside bandwidth");
    }
    fn extract_diagonal(&self, matrix: &Vec<f32>, diagonal: &mut [f32]) {
        let ldab = self.kd + 1;
        for (i, slot) in diagonal.iter_mut().enumerate() {
            *slot = matrix[self.kd + i * ldab];
        }
    }
    fn solve_damped(&mut self, n: usize, matrix: &mut Vec<f32>, diagonal: &[f32], lambda: f32, grad: &[f32], delta: &mut [f32]) -> bool {
        let ldab = self.kd + 1;
        // Damp a scratch copy so `matrix` survives lambda retries.
        let mut work = matrix.clone();
        let scale = 1.0 + lambda;
        for i in 0..n {
            work[self.kd + i * ldab] = scale * diagonal[i];
        }
        // delta doubles as RHS and in-place solution.
        delta.copy_from_slice(grad);
        solve_spd_band_lapack_f32(n, self.kd, &mut work, delta)
    }
}
/// Core Levenberg-Marquardt driver, generic over scalar type and linear
/// solver backend.
///
/// Starting from `x0`, repeatedly assembles gradient and Hessian via
/// `solver`/`problem`, solves the damped system
/// `(H with diag replaced by (1+lambda)*diag) * delta = grad`, and accepts
/// steps that reduce the cost, adapting `lambda` multiplicatively.
///
/// Terminates when: `config.max_iters` is exhausted; the cost reaches
/// `config.cost_threshold`; `config.patience` consecutive improvements fall
/// below `abs_precision`/`rel_precision`; the improvement is at floating-point
/// precision; or `lambda` exceeds an internal ceiling without an accepted step.
pub fn lm_solve<T: Float, S: LmSolver<T>>(
    x0: &[T],
    solver: &mut S,
    problem: &mut impl LmProblem<T>,
    config: &LmConfig<T>,
) -> LmResult<T> {
    let n = x0.len();
    // Empty parameter vector: nothing to optimize.
    if n == 0 {
        return LmResult {
            x: x0.to_vec(), start_cost: T::zero(), end_cost: T::zero(), iterations: 0,
        };
    }
    let mut cur_x = x0.to_vec();
    let mut try_x = vec![T::zero(); n];
    let mut grad = vec![T::zero(); n];
    let mut matrix = solver.new_matrix(n);
    let mut diagonal = vec![T::zero(); n];
    let mut delta = vec![T::zero(); n];
    // Damping schedule: multiplicative decrease on accepted steps, increase
    // on rejected ones, clamped between floor and ceiling.
    let lambda_floor = T::from(1e-12).unwrap();
    let lambda_ceiling = T::from(1e10).unwrap();
    let lambda_down = T::from(0.2).unwrap();
    let lambda_up = T::from(10.0).unwrap();
    let eight = T::from(8.0).unwrap();
    let mut lambda = config.initial_lambda;
    let start_cost = problem.calc_cost(&cur_x);
    let mut end_cost = start_cost;
    // Already a perfect fit; nothing to do.
    if start_cost == T::zero() {
        return LmResult { x: cur_x, start_cost, end_cost, iterations: 0 };
    }
    let mut small_count = 0usize;
    let mut done = false;
    let mut iter = 0usize;
    let mut timer = Instant::now();
    while iter < config.max_iters && !done {
        // Outer step: (re)assemble gradient/Hessian at the current point and
        // save the undamped diagonal so the inner loop can re-damp it.
        solver.compute(problem, &cur_x, &mut grad, &mut matrix);
        solver.extract_diagonal(&matrix, &mut diagonal);
        let mut inner = 0usize;
        const INNER_LOOPS: usize = 20;
        // Inner loop: retry the same linearization with growing lambda until
        // a step is accepted or we give up on this linearization.
        while inner < INNER_LOOPS && iter < config.max_iters {
            iter += 1;
            // Factorization failure (not positive definite): damp harder.
            if !solver.solve_damped(n, &mut matrix, &diagonal, lambda, &grad, &mut delta) {
                lambda = lambda * lambda_up;
                inner += 1;
                continue;
            }
            for i in 0..n {
                try_x[i] = cur_x[i] - delta[i];
            }
            let new_cost = problem.calc_cost(&try_x);
            if config.verbose {
                let step_us = timer.elapsed().as_micros();
                timer = Instant::now();
                eprintln!("{}/{}: {}->{} / {}, lambda={} (step={})",
                    iter, inner,
                    G(end_cost.to_f64().unwrap()), G(new_cost.to_f64().unwrap()),
                    G((end_cost - new_cost).to_f64().unwrap()), G(lambda.to_f64().unwrap()),
                    step_us);
            }
            // Improvement indistinguishable from floating-point noise
            // (within ~8 ulps of the current cost).
            let at_precision = new_cost.is_finite()
                && (end_cost - new_cost).abs() < eight * T::epsilon() * end_cost.abs();
            if new_cost.is_finite() && new_cost < end_cost {
                // Accepted step: relax damping (never below epsilon).
                if lambda > lambda_floor {
                    lambda = lambda * lambda_down;
                }
                lambda = lambda.max(T::epsilon());
                cur_x.copy_from_slice(&try_x);
                // Let the problem post-process accepted parameters.
                problem.advance(&mut cur_x);
                if iter >= config.min_iters && new_cost <= config.cost_threshold {
                    end_cost = new_cost;
                    done = true;
                    break;
                }
                // Convergence bookkeeping: `patience` consecutive "small"
                // improvements terminate the optimization.
                let abs_improve = end_cost - new_cost;
                let rel_improve = if end_cost > T::epsilon() {
                    abs_improve / end_cost
                } else {
                    T::zero()
                };
                let is_small = abs_improve < config.abs_precision
                    || rel_improve < config.rel_precision;
                if iter >= config.min_iters && is_small {
                    small_count += 1;
                    if small_count >= config.patience {
                        done = true;
                    }
                } else {
                    small_count = 0;
                }
                end_cost = new_cost;
                break;
            }
            if at_precision {
                done = true;
                break;
            }
            // Rejected step: damp harder; abandon this linearization when
            // lambda blows past the ceiling.
            lambda = lambda * lambda_up;
            if lambda > lambda_ceiling {
                inner = INNER_LOOPS;
                break;
            }
            inner += 1;
        }
        // Inner loop exhausted without an accepted step: give up entirely.
        if inner >= INNER_LOOPS && !done {
            break;
        }
    }
    LmResult {
        x: cur_x,
        start_cost,
        end_cost,
        iterations: iter,
    }
}
/// Dense LM solve (f64).
pub fn solve(x0: &[f64], problem: &mut impl LmProblem<f64>, config: &LmConfig<f64>) -> LmResult<f64> {
    lm_solve(x0, &mut Dense, problem, config)
}
/// Dense LM solve (f32).
pub fn solve_f32(x0: &[f32], problem: &mut impl LmProblem<f32>, config: &LmConfig<f32>) -> LmResult<f32> {
    lm_solve(x0, &mut Dense, problem, config)
}
/// Banded LM solve (f64) with bandwidth `kd`.
pub fn solve_band(x0: &[f64], kd: usize, problem: &mut impl LmProblem<f64>, config: &LmConfig<f64>) -> LmResult<f64> {
    lm_solve(x0, &mut Band { kd }, problem, config)
}
/// Banded LM solve (f32) with bandwidth `kd`.
pub fn solve_band_f32(x0: &[f32], kd: usize, problem: &mut impl LmProblem<f32>, config: &LmConfig<f32>) -> LmResult<f32> {
    lm_solve(x0, &mut Band { kd }, problem, config)
}
/// Banded LM solve (f64) through the LAPACK-backed band solver.
#[cfg(feature = "lapack")]
pub fn solve_band_lapack(x0: &[f64], kd: usize, problem: &mut impl LmProblem<f64>, config: &LmConfig<f64>) -> LmResult<f64> {
    lm_solve(x0, &mut BandLapack { kd }, problem, config)
}
/// Banded LM solve (f32) through the LAPACK-backed band solver.
#[cfg(feature = "lapack")]
pub fn solve_band_lapack_f32(x0: &[f32], kd: usize, problem: &mut impl LmProblem<f32>, config: &LmConfig<f32>) -> LmResult<f32> {
    lm_solve(x0, &mut BandLapack { kd }, problem, config)
}
/// Simplest sparse backend: rebuilds the CSC matrix every iteration and
/// solves by densifying (see its `solve_damped`).
pub struct Sparse;
impl Sparse {
    pub fn new() -> Self {
        Sparse
    }
}
/// Wrapper holding the CSC-format Hessian used by the sparse backends.
pub struct SparseMatrix<T> {
    pub csc: CscMatrix<T>,
}
impl LmSolver<f64> for Sparse {
    type Matrix = SparseMatrix<f64>;
    fn new_matrix(&self, n: usize) -> SparseMatrix<f64> {
        SparseMatrix { csc: CscMatrix::empty(n) }
    }
    /// Re-assembles the Hessian from scratch (COO -> CSC) on every call.
    fn compute(&self, problem: &mut dyn LmProblem<f64>, params: &[f64], grad: &mut [f64], matrix: &mut SparseMatrix<f64>) {
        let n = matrix.csc.n;
        let mut coo = CooMatrix::new(n);
        problem.calc_grad_hessian_sparse(params, grad, &mut coo);
        matrix.csc = coo.to_csc();
    }
    fn extract_diagonal(&self, matrix: &SparseMatrix<f64>, diagonal: &mut [f64]) {
        // diag_pos caches the index of each diagonal entry within vals.
        for i in 0..diagonal.len() {
            diagonal[i] = matrix.csc.vals[matrix.csc.diag_pos[i]];
        }
    }
    /// NOTE(review): densifies the sparse matrix (mirroring stored entries
    /// across the diagonal) and runs the dense SPD solver — O(n^2) memory and
    /// time per solve. Prefer the faer backend for large n.
    fn solve_damped(&mut self, n: usize, matrix: &mut SparseMatrix<f64>, diagonal: &[f64], lambda: f64, grad: &[f64], delta: &mut [f64]) -> bool {
        for i in 0..n { matrix.csc.vals[matrix.csc.diag_pos[i]] = (1.0 + lambda) * diagonal[i]; }
        let mut dense = vec![0.0f64; n * n];
        for j in 0..n {
            for k in matrix.csc.col_ptr[j]..matrix.csc.col_ptr[j + 1] {
                let i = matrix.csc.row_idx[k] as usize;
                let v = matrix.csc.vals[k];
                dense[i * n + j] = v;
                if i != j { dense[j * n + i] = v; }
            }
        }
        solve_spd(n, &dense, grad, delta)
    }
}
/// Sparse LM solve using the simple densifying `Sparse` backend.
pub fn solve_sparse(x0: &[f64], problem: &mut impl LmProblem<f64>, config: &LmConfig<f64>) -> LmResult<f64> {
    lm_solve(x0, &mut Sparse::new(), problem, config)
}
/// Sparse backend that assembles the sparsity pattern once (via COO) and then
/// lets the problem write values directly into the existing CSC matrix.
pub struct SparseDirect {
    /// Set once the CSC pattern exists; `Cell` because `compute` only
    /// receives `&self` (matching the `RefCell` used by the faer backends).
    pattern_built: std::cell::Cell<bool>,
}
impl SparseDirect {
    pub fn new() -> Self {
        SparseDirect { pattern_built: std::cell::Cell::new(false) }
    }
}
impl LmSolver<f64> for SparseDirect {
    type Matrix = SparseMatrix<f64>;
    fn new_matrix(&self, n: usize) -> SparseMatrix<f64> {
        SparseMatrix { csc: CscMatrix::empty(n) }
    }
    fn compute(&self, problem: &mut dyn LmProblem<f64>, params: &[f64], grad: &mut [f64], matrix: &mut SparseMatrix<f64>) {
        if !self.pattern_built.get() {
            // First call: assemble via COO to discover the sparsity pattern.
            let n = matrix.csc.n;
            let mut coo = CooMatrix::new(n);
            problem.calc_grad_hessian_sparse(params, grad, &mut coo);
            matrix.csc = coo.to_csc();
            // BUGFIX: this flag was never set before, so the direct-assembly
            // branch below was unreachable and every iteration paid for a
            // full COO rebuild.
            self.pattern_built.set(true);
        } else {
            problem.calc_grad_hessian_sparse_direct(params, grad, &mut matrix.csc);
        }
    }
    fn extract_diagonal(&self, matrix: &SparseMatrix<f64>, diagonal: &mut [f64]) {
        // diag_pos caches the index of each diagonal entry within vals.
        for i in 0..diagonal.len() {
            diagonal[i] = matrix.csc.vals[matrix.csc.diag_pos[i]];
        }
    }
    /// NOTE(review): same densifying O(n^2) solve as the `Sparse` backend.
    fn solve_damped(&mut self, n: usize, matrix: &mut SparseMatrix<f64>, diagonal: &[f64], lambda: f64, grad: &[f64], delta: &mut [f64]) -> bool {
        for i in 0..n { matrix.csc.vals[matrix.csc.diag_pos[i]] = (1.0 + lambda) * diagonal[i]; }
        let mut dense = vec![0.0f64; n * n];
        for j in 0..n {
            for k in matrix.csc.col_ptr[j]..matrix.csc.col_ptr[j + 1] {
                let i = matrix.csc.row_idx[k] as usize;
                let v = matrix.csc.vals[k];
                dense[i * n + j] = v;
                if i != j { dense[j * n + i] = v; }
            }
        }
        solve_spd(n, &dense, grad, delta)
    }
}
/// Sparse LM solve with one-time pattern assembly, then direct CSC updates.
pub fn solve_sparse_direct(x0: &[f64], problem: &mut impl LmProblem<f64>, config: &LmConfig<f64>) -> LmResult<f64> {
    lm_solve(x0, &mut SparseDirect::new(), problem, config)
}
/// Sparse backend using faer's sparse LLT with a cached symbolic
/// (AMD-ordered) factorization.
pub struct SparseFaer {
    // Computed once on the first solve; reused while the pattern is fixed.
    symbolic: Option<faer::sparse::linalg::cholesky::SymbolicCholesky<usize>>,
    // Value-position map from the first COO assembly; RefCell because
    // `compute` takes `&self`.
    positions: std::cell::RefCell<Option<Vec<usize>>>,
}
impl SparseFaer {
    pub fn new() -> Self {
        SparseFaer { symbolic: None, positions: std::cell::RefCell::new(None) }
    }
}
impl LmSolver<f64> for SparseFaer {
    type Matrix = SparseMatrix<f64>;
    fn new_matrix(&self, n: usize) -> SparseMatrix<f64> {
        SparseMatrix { csc: CscMatrix::empty(n) }
    }
    /// First call assembles via COO to discover the sparsity pattern and
    /// caches a value-position map; later calls write values directly
    /// through that map without re-assembling the pattern.
    fn compute(&self, problem: &mut dyn LmProblem<f64>, params: &[f64], grad: &mut [f64], matrix: &mut SparseMatrix<f64>) {
        let has_positions = self.positions.borrow().is_some();
        if has_positions {
            let positions = self.positions.borrow();
            let positions = positions.as_ref().unwrap();
            problem.calc_grad_hessian_sparse_indexed(params, grad, &mut matrix.csc.vals, positions);
        } else {
            let n = matrix.csc.n;
            let mut coo = CooMatrix::new(n);
            problem.calc_grad_hessian_sparse(params, grad, &mut coo);
            let (csc, positions) = coo.to_csc_with_map();
            *self.positions.borrow_mut() = Some(positions);
            matrix.csc = csc;
        }
    }
    fn extract_diagonal(&self, matrix: &SparseMatrix<f64>, diagonal: &mut [f64]) {
        // diag_pos caches the index of each diagonal entry within vals.
        for i in 0..diagonal.len() {
            diagonal[i] = matrix.csc.vals[matrix.csc.diag_pos[i]];
        }
    }
    /// Damps the diagonal in place, then runs faer's sparse LLT. The symbolic
    /// factorization is computed once (AMD ordering) and reused; only the
    /// numeric factorization and the triangular solves run per call.
    fn solve_damped(&mut self, n: usize, matrix: &mut SparseMatrix<f64>, diagonal: &[f64], lambda: f64, grad: &[f64], delta: &mut [f64]) -> bool {
        for i in 0..n { matrix.csc.vals[matrix.csc.diag_pos[i]] = (1.0 + lambda) * diagonal[i]; }
        use faer::sparse::linalg::cholesky::*;
        // faer wants usize row indices; converted (and allocated) per call.
        let row_idx_usize: Vec<usize> = matrix.csc.row_idx.iter().map(|&r| r as usize).collect();
        let symbolic_ref = faer::sparse::SymbolicSparseColMatRef::new_checked(
            n, n,
            &matrix.csc.col_ptr,
            None,
            &row_idx_usize,
        );
        let mat_ref = faer::sparse::SparseColMatRef::new(symbolic_ref, &matrix.csc.vals);
        if self.symbolic.is_none() {
            // One-time symbolic analysis; assumes the sparsity pattern stays
            // fixed for the lifetime of this solver.
            let sym = factorize_symbolic_cholesky(
                symbolic_ref,
                faer::Side::Upper,
                SymmetricOrdering::Amd,
                CholeskySymbolicParams::default(),
            );
            match sym {
                Ok(s) => self.symbolic = Some(s),
                Err(_) => return false,
            }
        }
        let symbolic = self.symbolic.as_ref().unwrap();
        let mut l_vals = vec![0.0f64; symbolic.len_val()];
        // Scratch memory sized by faer for the numeric factorization.
        let scratch_size = symbolic.factorize_numeric_llt_scratch::<f64>(
            faer::Par::Seq,
            faer::Spec::default(),
        );
        let mut scratch_mem: Vec<std::mem::MaybeUninit<u8>> = vec![std::mem::MaybeUninit::uninit(); scratch_size.unaligned_bytes_required()];
        let mut stack = faer::dyn_stack::MemStack::new(&mut scratch_mem);
        let llt = symbolic.factorize_numeric_llt(
            &mut l_vals,
            mat_ref,
            faer::Side::Upper,
            faer::linalg::cholesky::llt::factor::LltRegularization::default(),
            faer::Par::Seq,
            &mut stack,
            faer::Spec::default(),
        );
        // Numeric failure means the damped matrix is not positive definite;
        // the driver responds by increasing lambda.
        let llt = match llt {
            Ok(l) => l,
            Err(_) => return false,
        };
        // delta doubles as RHS and in-place solution.
        delta.copy_from_slice(grad);
        let rhs = faer::col::ColMut::from_slice_mut(delta);
        let solve_scratch_size = symbolic.solve_in_place_scratch::<f64>(1, faer::Par::Seq);
        let mut solve_mem: Vec<std::mem::MaybeUninit<u8>> = vec![std::mem::MaybeUninit::uninit(); solve_scratch_size.unaligned_bytes_required()];
        let mut solve_stack = faer::dyn_stack::MemStack::new(&mut solve_mem);
        llt.solve_in_place_with_conj(
            faer::Conj::No,
            rhs.as_mat_mut(),
            faer::Par::Seq,
            &mut solve_stack,
        );
        true
    }
}
/// Sparse LM solve using faer's cached symbolic Cholesky (f64).
pub fn solve_sparse_faer(x0: &[f64], problem: &mut impl LmProblem<f64>, config: &LmConfig<f64>) -> LmResult<f64> {
    lm_solve(x0, &mut SparseFaer::new(), problem, config)
}
/// f32 variant of `SparseFaer`: same cached-symbolic-factorization strategy.
pub struct SparseFaerF32 {
    // Computed once on the first solve; reused while the pattern is fixed.
    symbolic: Option<faer::sparse::linalg::cholesky::SymbolicCholesky<usize>>,
    // Value-position map from the first COO assembly; RefCell because
    // `compute` takes `&self`.
    positions: std::cell::RefCell<Option<Vec<usize>>>,
}
impl SparseFaerF32 {
    pub fn new() -> Self {
        SparseFaerF32 { symbolic: None, positions: std::cell::RefCell::new(None) }
    }
}
impl LmSolver<f32> for SparseFaerF32 {
    type Matrix = SparseMatrix<f32>;
    fn new_matrix(&self, n: usize) -> SparseMatrix<f32> {
        SparseMatrix { csc: CscMatrix::empty(n) }
    }
    /// First call assembles via COO and caches a value-position map; later
    /// calls write values directly through that map.
    fn compute(&self, problem: &mut dyn LmProblem<f32>, params: &[f32], grad: &mut [f32], matrix: &mut SparseMatrix<f32>) {
        let has_positions = self.positions.borrow().is_some();
        if has_positions {
            let positions = self.positions.borrow();
            let positions = positions.as_ref().unwrap();
            problem.calc_grad_hessian_sparse_indexed(params, grad, &mut matrix.csc.vals, positions);
            return;
        }
        let n = matrix.csc.n;
        let mut coo = CooMatrix::new(n);
        problem.calc_grad_hessian_sparse(params, grad, &mut coo);
        let (csc, positions) = coo.to_csc_with_map();
        *self.positions.borrow_mut() = Some(positions);
        matrix.csc = csc;
    }
    fn extract_diagonal(&self, matrix: &SparseMatrix<f32>, diagonal: &mut [f32]) {
        // diag_pos caches the index of each diagonal entry within vals.
        for i in 0..diagonal.len() {
            diagonal[i] = matrix.csc.vals[matrix.csc.diag_pos[i]];
        }
    }
    /// Damps the diagonal in place, then runs faer's sparse LLT (see the f64
    /// implementation for the structure; identical flow in f32).
    fn solve_damped(&mut self, n: usize, matrix: &mut SparseMatrix<f32>, diagonal: &[f32], lambda: f32, grad: &[f32], delta: &mut [f32]) -> bool {
        for i in 0..n { matrix.csc.vals[matrix.csc.diag_pos[i]] = (1.0 + lambda) * diagonal[i]; }
        use faer::sparse::linalg::cholesky::*;
        // faer wants usize row indices; converted per call.
        let row_idx_usize: Vec<usize> = matrix.csc.row_idx.iter().map(|&r| r as usize).collect();
        let symbolic_ref = faer::sparse::SymbolicSparseColMatRef::new_checked(
            n, n,
            &matrix.csc.col_ptr,
            None,
            &row_idx_usize,
        );
        let mat_ref = faer::sparse::SparseColMatRef::new(symbolic_ref, &matrix.csc.vals);
        if self.symbolic.is_none() {
            // One-time symbolic analysis; assumes a fixed sparsity pattern.
            let sym = factorize_symbolic_cholesky(
                symbolic_ref,
                faer::Side::Upper,
                SymmetricOrdering::Amd,
                CholeskySymbolicParams::default(),
            );
            match sym {
                Ok(s) => self.symbolic = Some(s),
                Err(_) => return false,
            }
        }
        let symbolic = self.symbolic.as_ref().unwrap();
        let mut l_vals = vec![0.0f32; symbolic.len_val()];
        // Scratch memory sized by faer for the numeric factorization.
        let scratch_size = symbolic.factorize_numeric_llt_scratch::<f32>(
            faer::Par::Seq,
            faer::Spec::default(),
        );
        let mut scratch_mem: Vec<std::mem::MaybeUninit<u8>> = vec![std::mem::MaybeUninit::uninit(); scratch_size.unaligned_bytes_required()];
        let mut stack = faer::dyn_stack::MemStack::new(&mut scratch_mem);
        let llt = symbolic.factorize_numeric_llt(
            &mut l_vals,
            mat_ref,
            faer::Side::Upper,
            faer::linalg::cholesky::llt::factor::LltRegularization::default(),
            faer::Par::Seq,
            &mut stack,
            faer::Spec::default(),
        );
        // Numeric failure: not positive definite; driver increases lambda.
        let llt = match llt {
            Ok(l) => l,
            Err(_) => return false,
        };
        // delta doubles as RHS and in-place solution.
        delta.copy_from_slice(grad);
        let rhs = faer::col::ColMut::from_slice_mut(delta);
        let solve_scratch_size = symbolic.solve_in_place_scratch::<f32>(1, faer::Par::Seq);
        let mut solve_mem: Vec<std::mem::MaybeUninit<u8>> = vec![std::mem::MaybeUninit::uninit(); solve_scratch_size.unaligned_bytes_required()];
        let mut solve_stack = faer::dyn_stack::MemStack::new(&mut solve_mem);
        llt.solve_in_place_with_conj(
            faer::Conj::No,
            rhs.as_mat_mut(),
            faer::Par::Seq,
            &mut solve_stack,
        );
        true
    }
}
/// Sparse LM solve using faer (f32).
pub fn solve_sparse_faer_f32(x0: &[f32], problem: &mut impl LmProblem<f32>, config: &LmConfig<f32>) -> LmResult<f32> {
    lm_solve(x0, &mut SparseFaerF32::new(), problem, config)
}
/// Schur-complement sparse backend for problems whose parameters split into
/// `np` leading parameters (grouped in 6-wide blocks) followed by 3-wide
/// "landmark" blocks — bundle-adjustment shaped systems.
pub struct SparseSchur {
    // Number of leading parameters kept in the reduced (Schur) system.
    np: usize,
    // Value-position map from the first COO assembly.
    positions: std::cell::RefCell<Option<Vec<usize>>>,
}
impl SparseSchur {
    /// `np`: number of leading parameters kept in the Schur-reduced system.
    pub fn new(np: usize) -> Self {
        SparseSchur { np, positions: std::cell::RefCell::new(None) }
    }
}
/// Invert a symmetric positive-definite 3x3 matrix given its upper triangle
/// packed row-wise as `[a00, a01, a02, a11, a12, a22]`. Returns the full
/// row-major 3x3 inverse, or `None` when the matrix is not positive definite.
fn invert_spd_3x3(u: &[f64; 6]) -> Option<[f64; 9]> {
    // Cholesky A = L L^T. Each pivot radicand must be strictly positive.
    // BUGFIX: the previous code computed sqrt first and tested `l <= 0.0`,
    // which is false for NaN — so a non-SPD input (negative radicand) slipped
    // through and produced Some([NaN; 9]). `!(x > 0.0)` also rejects NaN.
    if !(u[0] > 0.0) { return None; }
    let l00 = u[0].sqrt();
    let l10 = u[1] / l00;
    let d11 = u[3] - l10 * l10;
    if !(d11 > 0.0) { return None; }
    let l11 = d11.sqrt();
    let l20 = u[2] / l00;
    let l21 = (u[4] - l20 * l10) / l11;
    let d22 = u[5] - l20 * l20 - l21 * l21;
    if !(d22 > 0.0) { return None; }
    let l22 = d22.sqrt();
    // Entries of L^{-1} (lower triangular).
    let i00 = 1.0 / l00;
    let i11 = 1.0 / l11;
    let i22 = 1.0 / l22;
    let i10 = -l10 * i00 * i11;
    let i20 = -(l20 * i00 + l21 * i10) * i22;
    let i21 = -l21 * i11 * i22;
    // A^{-1} = L^{-T} L^{-1}, written out as a full symmetric 3x3.
    Some([
        i00 * i00 + i10 * i10 + i20 * i20,
        i10 * i11 + i20 * i21,
        i20 * i22,
        i10 * i11 + i20 * i21,
        i11 * i11 + i21 * i21,
        i21 * i22,
        i20 * i22,
        i21 * i22,
        i22 * i22,
    ])
}
impl LmSolver<f64> for SparseSchur {
    type Matrix = SparseMatrix<f64>;
    fn new_matrix(&self, n: usize) -> SparseMatrix<f64> {
        SparseMatrix { csc: CscMatrix::empty(n) }
    }
    /// First call assembles via COO and caches a value-position map; later
    /// calls write values directly through that map.
    fn compute(&self, problem: &mut dyn LmProblem<f64>, params: &[f64], grad: &mut [f64], matrix: &mut SparseMatrix<f64>) {
        let has_positions = self.positions.borrow().is_some();
        if has_positions {
            let positions = self.positions.borrow();
            let positions = positions.as_ref().unwrap();
            problem.calc_grad_hessian_sparse_indexed(params, grad, &mut matrix.csc.vals, positions);
        } else {
            let n = matrix.csc.n;
            let mut coo = CooMatrix::new(n);
            problem.calc_grad_hessian_sparse(params, grad, &mut coo);
            let (csc, positions) = coo.to_csc_with_map();
            *self.positions.borrow_mut() = Some(positions);
            matrix.csc = csc;
        }
    }
    fn extract_diagonal(&self, matrix: &SparseMatrix<f64>, diagonal: &mut [f64]) {
        // diag_pos caches the index of each diagonal entry within vals.
        for i in 0..diagonal.len() {
            diagonal[i] = matrix.csc.vals[matrix.csc.diag_pos[i]];
        }
    }
    /// Schur-complement solve: eliminate the trailing 3-wide landmark blocks,
    /// solve the reduced dense system over the `np` leading parameters, then
    /// back-substitute the landmark deltas.
    /// NOTE(review): assumes leading parameters are grouped in 6-wide blocks
    /// (see the `/ 6 * 6` grouping and `[f64; 18]` = 6x3 coupling blocks) and
    /// landmarks in 3-wide blocks — confirm with the problem assembly.
    fn solve_damped(&mut self, n: usize, matrix: &mut SparseMatrix<f64>, diagonal: &[f64], lambda: f64, grad: &[f64], delta: &mut [f64]) -> bool {
        let np = self.np;
        let nl = n - np;
        // No landmarks: plain dense damped solve over the leading block.
        if nl == 0 {
            let mut dense = vec![0.0; np * np];
            let csc = &matrix.csc;
            for j in 0..np {
                for idx in csc.col_ptr[j]..csc.col_ptr[j + 1] {
                    let i = csc.row_idx[idx] as usize;
                    let v = csc.vals[idx];
                    dense[i * np + j] += v;
                    if i != j { dense[j * np + i] += v; }
                }
            }
            // Damping overwrites the accumulated diagonal entries.
            for i in 0..np { dense[i * np + i] = (1.0 + lambda) * diagonal[i]; }
            return solve_spd(np, &dense, grad, delta);
        }
        // Write the damped diagonal back into the sparse values.
        for i in 0..n {
            matrix.csc.vals[matrix.csc.diag_pos[i]] = (1.0 + lambda) * diagonal[i];
        }
        let csc = &matrix.csc;
        // S starts as the dense pose-pose block; landmark contributions are
        // subtracted below to form the Schur complement S = A - C B^{-1} C^T.
        let mut s = vec![0.0; np * np];
        for j in 0..np {
            for idx in csc.col_ptr[j]..csc.col_ptr[j + 1] {
                let i = csc.row_idx[idx] as usize;
                let v = csc.vals[idx];
                s[i * np + j] += v;
                if i != j { s[j * np + i] += v; }
            }
        }
        let n_lm = nl / 3;
        // Reduced RHS for the pose system: g_p - C B^{-1} g_l.
        let mut rhs_p: Vec<f64> = grad[..np].to_vec();
        // Per-landmark bookkeeping: 6x3 coupling blocks C for each touched
        // pose block, and the inverted 3x3 landmark block B^{-1}.
        struct PoseBlock { offset: usize, c: [f64; 18] }
        struct LmInfo { poses: Vec<PoseBlock>, b_inv: [f64; 9] }
        let mut lm_info: Vec<LmInfo> = Vec::with_capacity(n_lm);
        for k in 0..n_lm {
            // Gather this landmark's packed upper-triangle 3x3 block and its
            // couplings to pose parameters.
            let mut ll_block = [0.0f64; 6];
            let mut raw_entries: Vec<(usize, usize, f64)> = Vec::new();
            for local_col in 0..3_usize {
                let j = np + k * 3 + local_col;
                for idx in csc.col_ptr[j]..csc.col_ptr[j + 1] {
                    let i = csc.row_idx[idx] as usize;
                    let v = csc.vals[idx];
                    if i < np {
                        raw_entries.push((i, local_col, v));
                    } else {
                        let li = i - np - k * 3;
                        if li <= local_col {
                            // Map (row, col) of the 3x3 block to the packed
                            // upper-triangle index.
                            let ti = match (li, local_col) {
                                (0, 0) => 0, (0, 1) => 1, (0, 2) => 2,
                                (1, 1) => 3, (1, 2) => 4, (2, 2) => 5,
                                _ => continue,
                            };
                            ll_block[ti] += v;
                        }
                    }
                }
            }
            // A non-invertible landmark block means the damped system is not
            // positive definite; the driver retries with a larger lambda.
            let b_inv = match invert_spd_3x3(&ll_block) {
                Some(inv) => inv,
                None => return false,
            };
            // Group coupling entries by their 6-wide pose block.
            raw_entries.sort_by_key(|&(r, _, _)| r);
            let mut poses: Vec<PoseBlock> = Vec::new();
            let mut i = 0;
            while i < raw_entries.len() {
                let pose_base = raw_entries[i].0 / 6 * 6;
                let mut c = [0.0f64; 18];
                while i < raw_entries.len() && raw_entries[i].0 / 6 * 6 == pose_base {
                    let (row, col, val) = raw_entries[i];
                    let local_row = row - pose_base;
                    c[local_row * 3 + col] = val;
                    i += 1;
                }
                poses.push(PoseBlock { offset: pose_base, c });
            }
            // rhs_p -= C * (B^{-1} g_l) for each coupled pose block.
            let g_l = [grad[np + k * 3], grad[np + k * 3 + 1], grad[np + k * 3 + 2]];
            let bg = [
                b_inv[0] * g_l[0] + b_inv[1] * g_l[1] + b_inv[2] * g_l[2],
                b_inv[3] * g_l[0] + b_inv[4] * g_l[1] + b_inv[5] * g_l[2],
                b_inv[6] * g_l[0] + b_inv[7] * g_l[1] + b_inv[8] * g_l[2],
            ];
            for pb in &poses {
                for r in 0..6 {
                    rhs_p[pb.offset + r] -= pb.c[r*3] * bg[0] + pb.c[r*3+1] * bg[1] + pb.c[r*3+2] * bg[2];
                }
            }
            // D = C B^{-1}, one 6x3 block per coupled pose.
            let mut d_blocks: Vec<[f64; 18]> = Vec::with_capacity(poses.len());
            for pb in &poses {
                let mut d = [0.0f64; 18];
                for r in 0..6 {
                    for c2 in 0..3 {
                        d[r * 3 + c2] = pb.c[r*3] * b_inv[c2]
                            + pb.c[r*3+1] * b_inv[3 + c2]
                            + pb.c[r*3+2] * b_inv[6 + c2];
                    }
                }
                d_blocks.push(d);
            }
            // S -= D C^T over every pair of coupled pose blocks.
            for (ii, pi) in poses.iter().enumerate() {
                let di = &d_blocks[ii];
                for pj in poses.iter() {
                    let oi = pi.offset;
                    let oj = pj.offset;
                    for r in 0..6 {
                        for c2 in 0..6 {
                            s[(oi + r) * np + oj + c2] -=
                                di[r*3] * pj.c[c2*3] + di[r*3+1] * pj.c[c2*3+1] + di[r*3+2] * pj.c[c2*3+2];
                        }
                    }
                }
            }
            lm_info.push(LmInfo { poses, b_inv });
        }
        // Solve the reduced pose system S * delta_p = rhs_p.
        let mut delta_p = vec![0.0; np];
        if !solve_spd(np, &s, &rhs_p, &mut delta_p) {
            return false;
        }
        delta[..np].copy_from_slice(&delta_p);
        // Back-substitute each landmark: delta_l = B^{-1} (g_l - C^T delta_p).
        for (k, li) in lm_info.iter().enumerate() {
            let mut g_l = [grad[np + k * 3], grad[np + k * 3 + 1], grad[np + k * 3 + 2]];
            for pb in &li.poses {
                for c2 in 0..3 {
                    for r in 0..6 {
                        g_l[c2] -= pb.c[r * 3 + c2] * delta_p[pb.offset + r];
                    }
                }
            }
            delta[np + k * 3] = li.b_inv[0] * g_l[0] + li.b_inv[1] * g_l[1] + li.b_inv[2] * g_l[2];
            delta[np + k * 3 + 1] = li.b_inv[3] * g_l[0] + li.b_inv[4] * g_l[1] + li.b_inv[5] * g_l[2];
            delta[np + k * 3 + 2] = li.b_inv[6] * g_l[0] + li.b_inv[7] * g_l[1] + li.b_inv[8] * g_l[2];
        }
        true
    }
}
/// Schur-complement sparse LM solve: the `np` leading parameters form the
/// reduced system; the rest are eliminated as 3-wide landmark blocks.
pub fn solve_sparse_schur(x0: &[f64], np: usize, problem: &mut impl LmProblem<f64>, config: &LmConfig<f64>) -> LmResult<f64> {
    lm_solve(x0, &mut SparseSchur::new(np), problem, config)
}
// FFI to a C/C++ shim exposing an Eigen LLT sparse solver (per the symbol
// names). `create` returns an opaque handle released by the matching
// `destroy` (see SparseEigen's Drop).
// NOTE(review): the index types here (`col_ptr: *const i64`,
// `row_idx: *const i32`) must match the casts in `eigen_ffi_solve` and the
// C++ signatures — confirm against the shim.
#[cfg(feature = "eigen")]
unsafe extern "C" {
    fn eigen_llt_f64_create() -> *mut std::ffi::c_void;
    fn eigen_llt_f64_destroy(handle: *mut std::ffi::c_void);
    // Solves the CSC system described by the pointers, writing into
    // `solution`; returns success.
    fn eigen_llt_f64_solve(
        handle: *mut std::ffi::c_void, n: i32, nnz: i32,
        col_ptr: *const i64, row_idx: *const i32,
        vals: *const f64, rhs: *const f64, solution: *mut f64,
    ) -> bool;
    fn eigen_llt_f32_create() -> *mut std::ffi::c_void;
    fn eigen_llt_f32_destroy(handle: *mut std::ffi::c_void);
    fn eigen_llt_f32_solve(
        handle: *mut std::ffi::c_void, n: i32, nnz: i32,
        col_ptr: *const i64, row_idx: *const i32,
        vals: *const f32, rhs: *const f32, solution: *mut f32,
    ) -> bool;
}
// FFI to a CHOLMOD-backed variant of the same shim interface (per the symbol
// names). No Rust wrapper is visible in this part of the file — presumably it
// lives elsewhere; same handle/solve contract as the Eigen LLT bindings.
#[cfg(feature = "cholmod")]
unsafe extern "C" {
    fn eigen_cholmod_f64_create() -> *mut std::ffi::c_void;
    fn eigen_cholmod_f64_destroy(handle: *mut std::ffi::c_void);
    fn eigen_cholmod_f64_solve(
        handle: *mut std::ffi::c_void, n: i32, nnz: i32,
        col_ptr: *const i64, row_idx: *const i32,
        vals: *const f64, rhs: *const f64, solution: *mut f64,
    ) -> bool;
}
/// Shared driver for the Eigen FFI solvers: passes the CSC matrix and the RHS
/// (`grad`) to the C side, which writes the solution into `delta`.
#[cfg(feature = "eigen")]
fn eigen_ffi_solve<T>(
    ffi_fn: unsafe extern "C" fn(*mut std::ffi::c_void, i32, i32, *const i64, *const i32, *const T, *const T, *mut T) -> bool,
    handle: *mut std::ffi::c_void,
    csc: &CscMatrix<T>,
    grad: &[T],
    delta: &mut [T],
) -> bool {
    let n = csc.n as i32;
    let nnz = csc.vals.len() as i32;
    // NOTE(review): these casts reinterpret the element types of
    // col_ptr/row_idx; they are only sound if CscMatrix stores col_ptr as
    // 64-bit and row_idx as 32-bit integers — confirm against CscMatrix.
    let col_ptr = csc.col_ptr.as_ptr() as *const i64;
    let row_idx = csc.row_idx.as_ptr() as *const i32;
    // SAFETY: relies on the C side reading exactly n/nnz elements and writing
    // n elements through `delta` — the shim's contract.
    unsafe {
        ffi_fn(handle, n, nnz, col_ptr, row_idx,
            csc.vals.as_ptr(), grad.as_ptr(), delta.as_mut_ptr())
    }
}
#[cfg(feature = "eigen")]
pub struct SparseEigen {
/// Opaque handle to the native Eigen LLT solver object created by
/// `eigen_llt_f64_create` and released in `Drop`.
handle: *mut std::ffi::c_void,
/// COO-to-CSC scatter map, cached after the first `compute` call discovers
/// the sparsity pattern; `RefCell` because `LmSolver::compute` takes `&self`.
positions: std::cell::RefCell<Option<Vec<usize>>>,
}
#[cfg(feature = "eigen")]
impl SparseEigen {
    /// Allocates a native Eigen LLT solver object; it is released by `Drop`.
    pub fn new() -> Self {
        let handle = unsafe { eigen_llt_f64_create() };
        Self { handle, positions: std::cell::RefCell::new(None) }
    }
}
#[cfg(feature = "eigen")]
impl Drop for SparseEigen {
// Releases the native solver object allocated in `new`.
fn drop(&mut self) { unsafe { eigen_llt_f64_destroy(self.handle); } }
}
#[cfg(feature = "eigen")]
impl LmSolver<f64> for SparseEigen {
type Matrix = SparseMatrix<f64>;
// Allocates an empty n x n matrix; the sparsity pattern is filled in
// lazily by the first `compute` call.
fn new_matrix(&self, n: usize) -> SparseMatrix<f64> {
SparseMatrix { csc: CscMatrix::empty(n) }
}
// Evaluates gradient and Hessian. First call: assemble in COO form,
// convert to CSC, and cache the COO->CSC scatter map. Subsequent calls:
// write values directly into the fixed CSC layout via the cached map.
fn compute(&self, problem: &mut dyn LmProblem<f64>, params: &[f64], grad: &mut [f64], matrix: &mut SparseMatrix<f64>) {
// Probed in a separate statement so the immutable borrow is dropped
// before `borrow_mut` runs in the else branch.
let has_positions = self.positions.borrow().is_some();
if has_positions {
let positions = self.positions.borrow();
problem.calc_grad_hessian_sparse_indexed(params, grad, &mut matrix.csc.vals, positions.as_ref().unwrap());
} else {
let n = matrix.csc.n;
let mut coo = CooMatrix::new(n);
problem.calc_grad_hessian_sparse(params, grad, &mut coo);
let (csc, positions) = coo.to_csc_with_map();
*self.positions.borrow_mut() = Some(positions);
matrix.csc = csc;
}
}
// Copies the Hessian diagonal out of the CSC value array using the
// precomputed diagonal positions.
fn extract_diagonal(&self, matrix: &SparseMatrix<f64>, diagonal: &mut [f64]) {
for i in 0..diagonal.len() { diagonal[i] = matrix.csc.vals[matrix.csc.diag_pos[i]]; }
}
// Overwrites the diagonal with the Marquardt-damped value
// (1 + lambda) * d_i, then solves the damped system through the FFI.
fn solve_damped(&mut self, n: usize, matrix: &mut SparseMatrix<f64>, diagonal: &[f64], lambda: f64, grad: &[f64], delta: &mut [f64]) -> bool {
for i in 0..n { matrix.csc.vals[matrix.csc.diag_pos[i]] = (1.0 + lambda) * diagonal[i]; }
eigen_ffi_solve(eigen_llt_f64_solve, self.handle, &matrix.csc, grad, delta)
}
}
#[cfg(feature = "eigen")]
pub struct SparseEigenF32 {
/// Opaque handle to the native single-precision Eigen LLT solver object;
/// created by `eigen_llt_f32_create`, released in `Drop`.
handle: *mut std::ffi::c_void,
/// Cached COO-to-CSC scatter map (set by the first `compute` call).
positions: std::cell::RefCell<Option<Vec<usize>>>,
}
#[cfg(feature = "eigen")]
impl SparseEigenF32 {
    /// Allocates a native single-precision Eigen LLT solver object;
    /// it is released by `Drop`.
    pub fn new() -> Self {
        let handle = unsafe { eigen_llt_f32_create() };
        Self { handle, positions: std::cell::RefCell::new(None) }
    }
}
#[cfg(feature = "eigen")]
impl Drop for SparseEigenF32 {
// Releases the native solver object allocated in `new`.
fn drop(&mut self) { unsafe { eigen_llt_f32_destroy(self.handle); } }
}
#[cfg(feature = "eigen")]
impl LmSolver<f32> for SparseEigenF32 {
type Matrix = SparseMatrix<f32>;
// Allocates an empty n x n matrix; the sparsity pattern is filled in
// lazily by the first `compute` call.
fn new_matrix(&self, n: usize) -> SparseMatrix<f32> {
SparseMatrix { csc: CscMatrix::empty(n) }
}
// Same strategy as the f64 solver: COO assembly + pattern caching on the
// first call, indexed in-place fills afterwards.
fn compute(&self, problem: &mut dyn LmProblem<f32>, params: &[f32], grad: &mut [f32], matrix: &mut SparseMatrix<f32>) {
// Probed separately so the immutable borrow ends before `borrow_mut`.
let has_positions = self.positions.borrow().is_some();
if has_positions {
let positions = self.positions.borrow();
problem.calc_grad_hessian_sparse_indexed(params, grad, &mut matrix.csc.vals, positions.as_ref().unwrap());
} else {
let n = matrix.csc.n;
let mut coo = CooMatrix::new(n);
problem.calc_grad_hessian_sparse(params, grad, &mut coo);
let (csc, positions) = coo.to_csc_with_map();
*self.positions.borrow_mut() = Some(positions);
matrix.csc = csc;
}
}
// Copies the Hessian diagonal out of the CSC value array.
fn extract_diagonal(&self, matrix: &SparseMatrix<f32>, diagonal: &mut [f32]) {
for i in 0..diagonal.len() { diagonal[i] = matrix.csc.vals[matrix.csc.diag_pos[i]]; }
}
// Applies multiplicative damping (1 + lambda) to the stored diagonal and
// solves the damped system through the FFI.
fn solve_damped(&mut self, n: usize, matrix: &mut SparseMatrix<f32>, diagonal: &[f32], lambda: f32, grad: &[f32], delta: &mut [f32]) -> bool {
for i in 0..n { matrix.csc.vals[matrix.csc.diag_pos[i]] = (1.0 + lambda) * diagonal[i]; }
eigen_ffi_solve(eigen_llt_f32_solve, self.handle, &matrix.csc, grad, delta)
}
}
#[cfg(feature = "cholmod")]
pub struct SparseCholmod {
/// Opaque handle to the native CHOLMOD solver object; created by
/// `eigen_cholmod_f64_create`, released in `Drop`.
handle: *mut std::ffi::c_void,
/// Cached COO-to-CSC scatter map (set by the first `compute` call).
positions: std::cell::RefCell<Option<Vec<usize>>>,
}
#[cfg(feature = "cholmod")]
impl SparseCholmod {
    /// Allocates a native CHOLMOD solver object; it is released by `Drop`.
    pub fn new() -> Self {
        let handle = unsafe { eigen_cholmod_f64_create() };
        Self { handle, positions: std::cell::RefCell::new(None) }
    }
}
#[cfg(feature = "cholmod")]
impl Drop for SparseCholmod {
// Releases the native solver object allocated in `new`.
fn drop(&mut self) { unsafe { eigen_cholmod_f64_destroy(self.handle); } }
}
#[cfg(feature = "cholmod")]
impl LmSolver<f64> for SparseCholmod {
type Matrix = SparseMatrix<f64>;
// Allocates an empty n x n matrix; the sparsity pattern is filled in
// lazily by the first `compute` call.
fn new_matrix(&self, n: usize) -> SparseMatrix<f64> {
SparseMatrix { csc: CscMatrix::empty(n) }
}
// Same strategy as the Eigen solvers: COO assembly + pattern caching on
// the first call, indexed in-place fills afterwards.
fn compute(&self, problem: &mut dyn LmProblem<f64>, params: &[f64], grad: &mut [f64], matrix: &mut SparseMatrix<f64>) {
// Probed separately so the immutable borrow ends before `borrow_mut`.
let has_positions = self.positions.borrow().is_some();
if has_positions {
let positions = self.positions.borrow();
problem.calc_grad_hessian_sparse_indexed(params, grad, &mut matrix.csc.vals, positions.as_ref().unwrap());
} else {
let n = matrix.csc.n;
let mut coo = CooMatrix::new(n);
problem.calc_grad_hessian_sparse(params, grad, &mut coo);
let (csc, positions) = coo.to_csc_with_map();
*self.positions.borrow_mut() = Some(positions);
matrix.csc = csc;
}
}
// Copies the Hessian diagonal out of the CSC value array.
fn extract_diagonal(&self, matrix: &SparseMatrix<f64>, diagonal: &mut [f64]) {
for i in 0..diagonal.len() { diagonal[i] = matrix.csc.vals[matrix.csc.diag_pos[i]]; }
}
// Applies multiplicative damping (1 + lambda) to the stored diagonal and
// solves the damped system through the FFI.
fn solve_damped(&mut self, n: usize, matrix: &mut SparseMatrix<f64>, diagonal: &[f64], lambda: f64, grad: &[f64], delta: &mut [f64]) -> bool {
for i in 0..n { matrix.csc.vals[matrix.csc.diag_pos[i]] = (1.0 + lambda) * diagonal[i]; }
eigen_ffi_solve(eigen_cholmod_f64_solve, self.handle, &matrix.csc, grad, delta)
}
}
/// Convenience entry point: runs Levenberg-Marquardt with the Eigen LLT
/// FFI backend (double precision).
#[cfg(feature = "eigen")]
pub fn solve_sparse_eigen(x0: &[f64], problem: &mut impl LmProblem<f64>, config: &LmConfig<f64>) -> LmResult<f64> {
    let mut solver = SparseEigen::new();
    lm_solve(x0, &mut solver, problem, config)
}
/// Convenience entry point: runs Levenberg-Marquardt with the Eigen LLT
/// FFI backend (single precision).
#[cfg(feature = "eigen")]
pub fn solve_sparse_eigen_f32(x0: &[f32], problem: &mut impl LmProblem<f32>, config: &LmConfig<f32>) -> LmResult<f32> {
    let mut solver = SparseEigenF32::new();
    lm_solve(x0, &mut solver, problem, config)
}
/// Convenience entry point: runs Levenberg-Marquardt with the CHOLMOD
/// FFI backend.
#[cfg(feature = "cholmod")]
pub fn solve_sparse_cholmod(x0: &[f64], problem: &mut impl LmProblem<f64>, config: &LmConfig<f64>) -> LmResult<f64> {
    let mut solver = SparseCholmod::new();
    lm_solve(x0, &mut solver, problem, config)
}
/// Sparse matrix in coordinate (triplet) form, used for assembly before
/// conversion to CSC. Duplicate coordinates are permitted.
pub struct CooMatrix<T> {
/// Matrix dimension (the matrix is n x n).
pub n: usize,
/// Parallel triplet arrays: entry k is at (rows[k], cols[k]) with value vals[k].
pub rows: Vec<u32>,
pub cols: Vec<u32>,
pub vals: Vec<T>,
}
impl<T: Float> CooMatrix<T> {
    /// Creates an empty `n` x `n` COO matrix.
    pub fn new(n: usize) -> Self {
        CooMatrix { n, rows: Vec::new(), cols: Vec::new(), vals: Vec::new() }
    }
    /// Removes all triplets while keeping the allocated capacity.
    pub fn clear(&mut self) {
        self.rows.clear();
        self.cols.clear();
        self.vals.clear();
    }
    /// Appends one `(row, col, val)` triplet. Duplicate coordinates are
    /// allowed; they are summed during CSC conversion.
    pub fn push(&mut self, row: u32, col: u32, val: T) {
        self.rows.push(row);
        self.cols.push(col);
        self.vals.push(val);
    }
    /// Number of stored triplets (duplicates counted individually).
    pub fn nnz(&self) -> usize { self.rows.len() }
    /// Returns triplet indices ordered column-major (by column, then row).
    /// The sort is stable, so duplicate coordinates keep insertion order.
    /// Shared by `to_csc` and `build_scatter_map` so both see the identical
    /// ordering (a requirement for the scatter map to be valid).
    fn sorted_order(&self) -> Vec<usize> {
        let mut order: Vec<usize> = (0..self.nnz()).collect();
        order.sort_by(|&a, &b| {
            self.cols[a].cmp(&self.cols[b]).then(self.rows[a].cmp(&self.rows[b]))
        });
        order
    }
    /// For each column, finds the index into the value array of the diagonal
    /// entry. Columns without a stored diagonal keep position 0.
    fn locate_diagonal(n: usize, col_ptr: &[usize], row_idx: &[u32]) -> Vec<usize> {
        let mut diag_pos = vec![0usize; n];
        for j in 0..n {
            for k in col_ptr[j]..col_ptr[j + 1] {
                if row_idx[k] as usize == j {
                    diag_pos[j] = k;
                    break;
                }
            }
        }
        diag_pos
    }
    /// Converts to compressed sparse column format, summing duplicate
    /// coordinates into a single stored value.
    pub fn to_csc(&self) -> CscMatrix<T> {
        let n = self.n;
        let order = self.sorted_order();
        let mut col_ptr = vec![0usize; n + 1];
        let mut row_idx = Vec::with_capacity(order.len());
        let mut vals = Vec::with_capacity(order.len());
        let mut prev = (u32::MAX, u32::MAX);
        for &idx in &order {
            let (r, c, v) = (self.rows[idx], self.cols[idx], self.vals[idx]);
            if (c, r) == prev {
                // Duplicate coordinate: accumulate into the last stored value.
                let last = vals.last_mut().unwrap();
                *last = *last + v;
            } else {
                row_idx.push(r);
                vals.push(v);
                col_ptr[c as usize + 1] += 1;
                prev = (c, r);
            }
        }
        // Prefix-sum the per-column counts into column start offsets.
        for j in 1..=n {
            col_ptr[j] += col_ptr[j - 1];
        }
        let diag_pos = Self::locate_diagonal(n, &col_ptr, &row_idx);
        CscMatrix { n, col_ptr, row_idx, vals, diag_pos }
    }
    /// Accumulates this COO's values into `csc.vals` using a precomputed
    /// scatter map (`coo_to_csc[i]` is the CSC value slot of triplet `i`).
    /// Existing values are zeroed first; duplicates sum into one slot.
    pub fn scatter_into(&self, csc: &mut CscMatrix<T>, coo_to_csc: &[usize]) {
        csc.vals.iter_mut().for_each(|v| *v = T::zero());
        for (i, &csc_idx) in coo_to_csc.iter().enumerate() {
            csc.vals[csc_idx] = csc.vals[csc_idx] + self.vals[i];
        }
    }
    /// Builds the scatter map for `scatter_into`: for each triplet, the index
    /// of the CSC value slot it contributes to. Valid only for a `csc` with
    /// the same pattern `to_csc` would produce from this COO.
    pub fn build_scatter_map(&self, csc: &CscMatrix<T>) -> Vec<usize> {
        let mut map = vec![0usize; self.nnz()];
        let mut csc_idx = 0usize;
        let mut prev = (u32::MAX, u32::MAX);
        for &orig_idx in &self.sorted_order() {
            let key = (self.cols[orig_idx], self.rows[orig_idx]);
            if key == prev {
                // Duplicate of the previous coordinate: same CSC slot.
                map[orig_idx] = csc_idx - 1;
            } else {
                map[orig_idx] = csc_idx;
                csc_idx += 1;
                prev = key;
            }
        }
        debug_assert_eq!(csc_idx, csc.vals.len());
        map
    }
    /// Converts to CSC and builds the scatter map in one pass.
    ///
    /// Uses a counting sort by column followed by a stable per-column sort by
    /// row, which is cheaper than the full comparison sort in `to_csc` and
    /// yields the same ordering (stability keeps duplicates in insertion
    /// order in both variants).
    pub fn to_csc_with_map(&self) -> (CscMatrix<T>, Vec<usize>) {
        let n = self.n;
        let nnz_raw = self.nnz();
        // Counting pass: col_ptr[j] becomes the start offset of column j.
        let mut col_ptr = vec![0usize; n + 1];
        for i in 0..nnz_raw {
            col_ptr[self.cols[i] as usize + 1] += 1;
        }
        for j in 0..n {
            col_ptr[j + 1] += col_ptr[j];
        }
        // Bucket the triplets by column, preserving insertion order.
        let mut order = vec![0usize; nnz_raw];
        let mut col_pos = col_ptr[..n].to_vec();
        for i in 0..nnz_raw {
            let c = self.cols[i] as usize;
            order[col_pos[c]] = i;
            col_pos[c] += 1;
        }
        // Stable sort each column bucket by row.
        for j in 0..n {
            order[col_ptr[j]..col_ptr[j + 1]].sort_by_key(|&idx| self.rows[idx]);
        }
        // Merge duplicates while recording each triplet's CSC value slot.
        let mut row_idx = Vec::with_capacity(nnz_raw);
        let mut vals = Vec::with_capacity(nnz_raw);
        let mut map = vec![0usize; nnz_raw];
        let mut new_col_ptr = vec![0usize; n + 1];
        let mut prev = (u32::MAX, u32::MAX);
        let mut csc_idx = 0usize;
        for &orig_idx in &order {
            let (r, c, v) = (self.rows[orig_idx], self.cols[orig_idx], self.vals[orig_idx]);
            if (c, r) == prev {
                let last = vals.last_mut().unwrap();
                *last = *last + v;
                map[orig_idx] = csc_idx - 1;
            } else {
                row_idx.push(r);
                vals.push(v);
                new_col_ptr[c as usize + 1] += 1;
                map[orig_idx] = csc_idx;
                csc_idx += 1;
                prev = (c, r);
            }
        }
        for j in 1..=n {
            new_col_ptr[j] += new_col_ptr[j - 1];
        }
        let diag_pos = Self::locate_diagonal(n, &new_col_ptr, &row_idx);
        (CscMatrix { n, col_ptr: new_col_ptr, row_idx, vals, diag_pos }, map)
    }
}
/// Sparse matrix in compressed sparse column format.
pub struct CscMatrix<T> {
/// Matrix dimension (the matrix is n x n).
pub n: usize,
/// col_ptr[j]..col_ptr[j+1] is the range of stored entries of column j.
pub col_ptr: Vec<usize>,
/// Row index of each stored value; sorted ascending within each column.
pub row_idx: Vec<u32>,
/// Stored values, column-major, parallel to `row_idx`.
pub vals: Vec<T>,
/// Index into `vals` of each column's diagonal entry (0 if not stored).
pub diag_pos: Vec<usize>,
}
impl<T: Float> CscMatrix<T> {
    /// Creates an `n` x `n` CSC matrix with no stored entries and a
    /// zero-initialized diagonal-position table.
    pub fn empty(n: usize) -> Self {
        CscMatrix {
            n,
            col_ptr: vec![0; n + 1],
            row_idx: Vec::new(),
            vals: Vec::new(),
            diag_pos: vec![0; n],
        }
    }
    /// Number of explicitly stored values.
    pub fn nnz(&self) -> usize {
        self.vals.len()
    }
    /// Binary-searches column `col` for `row`. Returns the index into the
    /// value array, or `None` when the coordinate is not stored. Relies on
    /// row indices being sorted within each column.
    pub fn find_pos(&self, row: usize, col: usize) -> Option<usize> {
        let start = self.col_ptr[col];
        let end = self.col_ptr[col + 1];
        match self.row_idx[start..end].binary_search(&(row as u32)) {
            Ok(offset) => Some(start + offset),
            Err(_) => None,
        }
    }
    /// Symmetric matrix-vector product `y = A x`, where only one triangle of
    /// the symmetric matrix is stored: every off-diagonal entry `(i, j)`
    /// contributes to both `y[i]` and `y[j]`.
    pub fn symv(&self, x: &[T], y: &mut [T]) {
        y.iter_mut().for_each(|out| *out = T::zero());
        for j in 0..self.n {
            let (start, end) = (self.col_ptr[j], self.col_ptr[j + 1]);
            for (&i, &a) in self.row_idx[start..end].iter().zip(&self.vals[start..end]) {
                let i = i as usize;
                y[i] = y[i] + a * x[j];
                if i != j {
                    y[j] = y[j] + a * x[i];
                }
            }
        }
    }
}
/// Solves the SPD banded system via LAPACK `dpbsv` (upper band storage, one
/// right-hand side). `band` is overwritten with the Cholesky factor and `b`
/// with the solution; returns true iff LAPACK reports success (info == 0).
#[cfg(feature = "lapack")]
pub fn solve_spd_band_lapack(n: usize, kd: usize, band: &mut [f64], b: &mut [f64]) -> bool {
let mut info: i32 = 0;
let ldab = (kd + 1) as i32;
unsafe {
// dpbsv(uplo='U', n, kd, nrhs=1, ab, ldab, b, ldb=n, info)
lapack_sys::dpbsv_(
&b'U', &(n as i32),
&(kd as i32),
&1i32, band.as_mut_ptr(),
&ldab,
b.as_mut_ptr(),
&(n as i32),
&mut info,
);
}
info == 0
}
/// Single-precision variant of `solve_spd_band_lapack`, backed by LAPACK
/// `spbsv`. `band` is overwritten with the factor and `b` with the solution.
#[cfg(feature = "lapack")]
pub fn solve_spd_band_lapack_f32(n: usize, kd: usize, band: &mut [f32], b: &mut [f32]) -> bool {
let mut info: i32 = 0;
let ldab = (kd + 1) as i32;
unsafe {
// spbsv(uplo='U', n, kd, nrhs=1, ab, ldab, b, ldb=n, info)
lapack_sys::spbsv_(
&b'U',
&(n as i32),
&(kd as i32),
&1i32,
band.as_mut_ptr(),
&ldab,
b.as_mut_ptr(),
&(n as i32),
&mut info,
);
}
info == 0
}
/// Solves the symmetric positive definite banded system `A x = b` in place
/// with an unblocked band Cholesky factorization followed by forward and
/// backward substitution; `b` is overwritten with the solution.
///
/// `band` uses LAPACK-style upper band storage: entry `(i, j)` with
/// `j - kd <= i <= j` lives at `band[(kd + i - j) + j * (kd + 1)]`, and the
/// factor overwrites it. Returns `false` when a non-positive pivot is
/// encountered (matrix not positive definite).
pub fn solve_spd_band(n: usize, kd: usize, band: &mut [f64], b: &mut [f64]) -> bool {
    let ldab = kd + 1;
    // Factorization, column by column, staying inside the band.
    for j in 0..n {
        let lo = j.saturating_sub(kd);
        let mut diag = band[kd + j * ldab];
        for k in lo..j {
            let rkj = band[(kd + k - j) + j * ldab];
            diag -= rkj * rkj;
        }
        if diag <= 0.0 {
            return false; // not positive definite
        }
        let rjj = diag.sqrt();
        band[kd + j * ldab] = rjj;
        for i in (j + 1)..(j + kd + 1).min(n) {
            let mut acc = band[(kd + j - i) + i * ldab];
            for k in lo.max(i.saturating_sub(kd))..j {
                acc -= band[(kd + k - j) + j * ldab] * band[(kd + k - i) + i * ldab];
            }
            band[(kd + j - i) + i * ldab] = acc / rjj;
        }
    }
    // Forward substitution.
    for i in 0..n {
        let mut acc = b[i];
        for k in i.saturating_sub(kd)..i {
            acc -= band[(kd + k - i) + i * ldab] * b[k];
        }
        b[i] = acc / band[kd + i * ldab];
    }
    // Backward substitution.
    for i in (0..n).rev() {
        let mut acc = b[i];
        for k in (i + 1)..(i + kd + 1).min(n) {
            acc -= band[(kd + i - k) + k * ldab] * b[k];
        }
        b[i] = acc / band[kd + i * ldab];
    }
    true
}
/// Single-precision variant of `solve_spd_band`: in-place band Cholesky
/// factorization plus forward/backward substitution; `b` is overwritten
/// with the solution. Uses LAPACK-style upper band storage (entry `(i, j)`
/// at `band[(kd + i - j) + j * (kd + 1)]`). Returns `false` when a
/// non-positive pivot is encountered.
pub fn solve_spd_band_f32(n: usize, kd: usize, band: &mut [f32], b: &mut [f32]) -> bool {
    let ldab = kd + 1;
    // Factorization, column by column, staying inside the band.
    for j in 0..n {
        let lo = j.saturating_sub(kd);
        let mut diag = band[kd + j * ldab];
        for k in lo..j {
            let rkj = band[(kd + k - j) + j * ldab];
            diag -= rkj * rkj;
        }
        if diag <= 0.0 {
            return false; // not positive definite
        }
        let rjj = diag.sqrt();
        band[kd + j * ldab] = rjj;
        for i in (j + 1)..(j + kd + 1).min(n) {
            let mut acc = band[(kd + j - i) + i * ldab];
            for k in lo.max(i.saturating_sub(kd))..j {
                acc -= band[(kd + k - j) + j * ldab] * band[(kd + k - i) + i * ldab];
            }
            band[(kd + j - i) + i * ldab] = acc / rjj;
        }
    }
    // Forward substitution.
    for i in 0..n {
        let mut acc = b[i];
        for k in i.saturating_sub(kd)..i {
            acc -= band[(kd + k - i) + i * ldab] * b[k];
        }
        b[i] = acc / band[kd + i * ldab];
    }
    // Backward substitution.
    for i in (0..n).rev() {
        let mut acc = b[i];
        for k in (i + 1)..(i + kd + 1).min(n) {
            acc -= band[(kd + i - k) + k * ldab] * b[k];
        }
        b[i] = acc / band[kd + i * ldab];
    }
    true
}
#[cfg(test)]
mod tests {
use super::*;
// Dense SPD solve against a hand-checked solution of [[4,2],[2,3]] x = [1,2].
#[test]
fn cholesky_solve_2x2() {
let a = [4.0, 2.0, 2.0, 3.0];
let b = [1.0, 2.0];
let mut x = [0.0; 2];
assert!(solve_spd(2, &a, &b, &mut x));
assert!((x[0] - (-0.125)).abs() < 1e-10, "x[0]={}", x[0]);
assert!((x[1] - 0.75).abs() < 1e-10, "x[1]={}", x[1]);
}
// Builds an SPD matrix A = M^T M + 4I, solves, and verifies the residual A x = b.
#[test]
fn cholesky_solve_4x4() {
let m = [1.0, 0.5, 0.2, 0.1,
0.3, 1.2, 0.4, 0.6,
0.7, 0.1, 0.9, 0.3,
0.2, 0.8, 0.5, 1.1];
let mut a = [0.0f64; 16];
for i in 0..4 {
for j in 0..4 {
let mut sum = 0.0;
for k in 0..4 { sum += m[k * 4 + i] * m[k * 4 + j]; }
a[i * 4 + j] = sum;
}
a[i * 4 + i] += 4.0;
}
let b = [1.0, 2.0, 3.0, 4.0];
let mut x = [0.0f64; 4];
assert!(solve_spd(4, &a, &b, &mut x));
for i in 0..4 {
let mut ax_i = 0.0;
for j in 0..4 { ax_i += a[i * 4 + j] * x[j]; }
assert!((ax_i - b[i]).abs() < 1e-10, "row {}: Ax={}, b={}", i, ax_i, b[i]);
}
}
// Dense LM on a separable quadratic with minimum at (3, 7); should converge
// essentially exactly.
#[test]
fn solve_quadratic() {
let mut p = FnProblem {
cost: |x: &[f64]| (x[0] - 3.0) * (x[0] - 3.0) + (x[1] - 7.0) * (x[1] - 7.0),
grad_hessian: |x: &[f64], grad: &mut [f64], hess: &mut [f64]| {
grad[0] = 2.0 * (x[0] - 3.0);
grad[1] = 2.0 * (x[1] - 7.0);
hess[0] = 2.0; hess[1] = 0.0;
hess[2] = 0.0; hess[3] = 2.0;
},
};
let result = solve(
&[0.0, 0.0],
&mut p,
&LmConfig { abs_precision: 1e-10, max_iters: 100, initial_lambda: 0.001, ..Default::default() },
);
assert!((result.x[0] - 3.0).abs() < 1e-6, "x={}", result.x[0]);
assert!((result.x[1] - 7.0).abs() < 1e-6, "y={}", result.x[1]);
assert!(result.end_cost < 1e-10);
}
// Classic non-convex Rosenbrock benchmark with minimum at (1, 1); uses the
// analytic gradient and Gauss-Newton-style Hessian.
#[test]
fn solve_rosenbrock() {
let mut p = FnProblem {
cost: |p: &[f64]| {
let (x, y) = (p[0], p[1]);
(1.0 - x).powi(2) + 100.0 * (y - x * x).powi(2)
},
grad_hessian: |p: &[f64], grad: &mut [f64], hess: &mut [f64]| {
let (x, y) = (p[0], p[1]);
grad[0] = -2.0 * (1.0 - x) - 400.0 * x * (y - x * x);
grad[1] = 200.0 * (y - x * x);
hess[0] = 2.0 + 1200.0 * x * x - 400.0 * y;
hess[1] = -400.0 * x;
hess[2] = -400.0 * x;
hess[3] = 200.0;
},
};
let result = solve(
&[-1.0, 1.0],
&mut p,
&LmConfig { abs_precision: 1e-12, max_iters: 500, initial_lambda: 0.001, ..Default::default() },
);
assert!((result.x[0] - 1.0).abs() < 1e-4, "x={}", result.x[0]);
assert!((result.x[1] - 1.0).abs() < 1e-4, "y={}", result.x[1]);
}
// Same quadratic as above through the f32 solver path, with looser tolerances.
#[test]
fn solve_f32_quadratic() {
let mut p = FnProblem {
cost: |x: &[f32]| (x[0] - 3.0) * (x[0] - 3.0) + (x[1] - 7.0) * (x[1] - 7.0),
grad_hessian: |x: &[f32], grad: &mut [f32], hess: &mut [f32]| {
grad[0] = 2.0 * (x[0] - 3.0);
grad[1] = 2.0 * (x[1] - 7.0);
hess[0] = 2.0; hess[1] = 0.0;
hess[2] = 0.0; hess[3] = 2.0;
},
};
let result = solve_f32(
&[0.0f32, 0.0],
&mut p,
&LmConfig { abs_precision: 1e-4, max_iters: 100, initial_lambda: 0.001, ..Default::default() },
);
assert!((result.x[0] - 3.0).abs() < 1e-3, "x={}", result.x[0]);
assert!((result.x[1] - 7.0).abs() < 1e-3, "y={}", result.x[1]);
}
// Band Cholesky on the same 2x2 system as cholesky_solve_2x2 (kd = 1,
// upper band storage with a padding slot in column 0).
#[test]
fn band_cholesky_2x2() {
let mut band = vec![0.0, 4.0, 2.0, 3.0]; let mut b = vec![1.0, 2.0];
assert!(solve_spd_band(2, 1, &mut band, &mut b));
assert!((b[0] - (-0.125)).abs() < 1e-10, "x[0]={}", b[0]);
assert!((b[1] - 0.75).abs() < 1e-10, "x[1]={}", b[1]);
}
// Tridiagonal 4x4: band solver must agree with the dense solver.
#[test]
fn band_cholesky_tridiag_4x4() {
let mut band = vec![
0.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, ];
let mut b = vec![1.0, 2.0, 3.0, 4.0];
assert!(solve_spd_band(4, 1, &mut band, &mut b));
let a = [4.0, 1.0, 0.0, 0.0,
1.0, 4.0, 1.0, 0.0,
0.0, 1.0, 4.0, 1.0,
0.0, 0.0, 1.0, 4.0];
let rhs = [1.0, 2.0, 3.0, 4.0];
let mut x_dense = [0.0; 4];
assert!(solve_spd(4, &a, &rhs, &mut x_dense));
for i in 0..4 {
assert!((b[i] - x_dense[i]).abs() < 1e-10,
"i={}: band={}, dense={}", i, b[i], x_dense[i]);
}
}
// 6x6 diagonally dominant matrix with bandwidth 3: build dense and band
// representations of the same matrix and check the solvers agree.
#[test]
fn band_cholesky_block_tridiag_6x6() {
let n = 6;
let kd = 3;
let ldab = kd + 1;
let mut dense = vec![0.0f64; n * n];
for i in 0..n { dense[i * n + i] = 10.0; }
let offdiag = [
(0,1, 1.0), (0,2, 0.5), (0,3, 0.2),
(1,2, 1.5), (1,3, 0.8), (1,4, 0.3),
(2,3, 2.0), (2,4, 1.0), (2,5, 0.4),
(3,4, 1.2), (3,5, 0.6),
(4,5, 1.8),
];
for &(i, j, v) in &offdiag {
dense[i * n + j] += v;
dense[j * n + i] += v;
}
// Pack the upper triangle into LAPACK-style band storage.
let mut band = vec![0.0f64; ldab * n];
for j in 0..n {
let i_start = if j > kd { j - kd } else { 0 };
for i in i_start..=j {
band[(kd + i - j) + j * ldab] = dense[i * n + j];
}
}
let rhs = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
let mut band_copy = band.clone();
let mut b_band = rhs.clone();
assert!(solve_spd_band(n, kd, &mut band_copy, &mut b_band));
let mut x_dense = vec![0.0; n];
assert!(solve_spd(n, &dense, &rhs, &mut x_dense));
for i in 0..n {
assert!((b_band[i] - x_dense[i]).abs() < 1e-8,
"i={}: band={}, dense={}", i, b_band[i], x_dense[i]);
}
}
// Same 6x6 fixture as band_cholesky_block_tridiag_6x6: the pure-Rust band
// solver must match LAPACK dpbsv to near machine precision.
#[test]
#[cfg(feature = "lapack")]
fn band_cholesky_lapack_vs_rust() {
let n = 6;
let kd = 3;
let ldab = kd + 1;
let mut dense = vec![0.0f64; n * n];
for i in 0..n { dense[i * n + i] = 10.0; }
let offdiag = [
(0,1, 1.0), (0,2, 0.5), (0,3, 0.2),
(1,2, 1.5), (1,3, 0.8), (1,4, 0.3),
(2,3, 2.0), (2,4, 1.0), (2,5, 0.4),
(3,4, 1.2), (3,5, 0.6),
(4,5, 1.8),
];
for &(i, j, v) in &offdiag {
dense[i * n + j] += v;
dense[j * n + i] += v;
}
let mut band = vec![0.0f64; ldab * n];
for j in 0..n {
let i_start = if j > kd { j - kd } else { 0 };
for i in i_start..=j {
band[(kd + i - j) + j * ldab] = dense[i * n + j];
}
}
let rhs = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
let mut band_rust = band.clone();
let mut b_rust = rhs.clone();
assert!(solve_spd_band(n, kd, &mut band_rust, &mut b_rust));
let mut band_lapack = band.clone();
let mut b_lapack = rhs.clone();
assert!(solve_spd_band_lapack(n, kd, &mut band_lapack, &mut b_lapack));
for i in 0..n {
assert!((b_rust[i] - b_lapack[i]).abs() < 1e-12,
"i={}: rust={}, lapack={}", i, b_rust[i], b_lapack[i]);
}
}
/// Builds a random, seeded, symmetric banded test matrix of bandwidth `kd`
/// plus a random right-hand side. The matrix is formed from a banded
/// B^T B-style product restricted to the band, with `n` added to the
/// diagonal to keep it safely positive definite. Returns
/// `(dense row-major matrix, LAPACK-style band storage, rhs)`.
fn random_spd_band(n: usize, kd: usize, seed: u64) -> (Vec<f64>, Vec<f64>, Vec<f64>) {
use rand::prelude::*;
use rand::rngs::StdRng;
let mut rng = StdRng::seed_from_u64(seed);
let ldab = kd + 1;
let mut dense = vec![0.0f64; n * n];
let mut b_mat = vec![0.0f64; n * n];
// Random upper-banded factor with entries in [-1, 1).
for j in 0..n {
let i_start = if j > kd { j - kd } else { 0 };
for i in i_start..=j {
b_mat[i * n + j] = rng.random::<f64>() * 2.0 - 1.0;
}
}
// Symmetric product, keeping only entries inside the band.
for i in 0..n {
for j in i..n {
if j - i > kd { continue; }
let mut sum = 0.0;
for k in 0..n {
sum += b_mat[k * n + i] * b_mat[k * n + j];
}
dense[i * n + j] = sum;
dense[j * n + i] = sum;
}
}
// Diagonal shift to guarantee positive definiteness after truncation.
for i in 0..n {
dense[i * n + i] += n as f64;
}
// Pack the upper triangle into band storage.
let mut band = vec![0.0f64; ldab * n];
for j in 0..n {
let i_start = if j > kd { j - kd } else { 0 };
for i in i_start..=j {
band[(kd + i - j) + j * ldab] = dense[i * n + j];
}
}
let rhs: Vec<f64> = (0..n).map(|_| rng.random::<f64>() * 10.0 - 5.0).collect();
(dense, band, rhs)
}
// Residual check over a range of sizes/bandwidths: solve, then verify
// max_i |(A x - b)_i| / ||b|| is near machine precision.
#[test]
fn band_cholesky_residual_check() {
for &(n, kd) in &[(10, 3), (20, 5), (50, 11), (100, 11), (200, 15)] {
let (dense, band, rhs) = random_spd_band(n, kd, 42 + n as u64);
let mut band_copy = band.clone();
let mut x = rhs.clone();
assert!(solve_spd_band(n, kd, &mut band_copy, &mut x),
"band Cholesky failed for n={}, kd={}", n, kd);
let mut max_residual = 0.0f64;
let b_norm: f64 = rhs.iter().map(|v| v * v).sum::<f64>().sqrt();
for i in 0..n {
let mut ax_i = 0.0;
for j in 0..n {
ax_i += dense[i * n + j] * x[j];
}
let r = (ax_i - rhs[i]).abs();
max_residual = max_residual.max(r);
}
let rel_residual = max_residual / b_norm;
assert!(rel_residual < 1e-10,
"n={}, kd={}: relative residual {:.2e} too large", n, kd, rel_residual);
}
}
// Same residual check for the f32 solver; fixtures are the f64 ones cast
// down, and the tolerance is relaxed accordingly.
#[test]
fn band_cholesky_f32_residual_check() {
for &(n, kd) in &[(10, 3), (20, 5), (50, 11), (100, 11)] {
let (dense_f64, band_f64, rhs_f64) = random_spd_band(n, kd, 99 + n as u64);
let dense: Vec<f32> = dense_f64.iter().map(|&v| v as f32).collect();
let band: Vec<f32> = band_f64.iter().map(|&v| v as f32).collect();
let rhs: Vec<f32> = rhs_f64.iter().map(|&v| v as f32).collect();
let mut band_copy = band.clone();
let mut x = rhs.clone();
assert!(solve_spd_band_f32(n, kd, &mut band_copy, &mut x),
"f32 band Cholesky failed for n={}, kd={}", n, kd);
let mut max_residual = 0.0f32;
let b_norm: f32 = rhs.iter().map(|v| v * v).sum::<f32>().sqrt();
for i in 0..n {
let mut ax_i = 0.0f32;
for j in 0..n {
ax_i += dense[i * n + j] * x[j];
}
let r = (ax_i - rhs[i]).abs();
max_residual = max_residual.max(r);
}
let rel_residual = max_residual / b_norm;
assert!(rel_residual < 1e-4,
"f32 n={}, kd={}: relative residual {:.2e} too large", n, kd, rel_residual);
}
}
// Larger randomized cross-check: the Rust band solver and LAPACK must
// produce nearly identical solutions across sizes and bandwidths.
#[test]
#[cfg(feature = "lapack")]
fn band_cholesky_lapack_vs_rust_large() {
for &(n, kd) in &[(20, 5), (50, 11), (100, 11), (200, 15)] {
let (_dense, band, rhs) = random_spd_band(n, kd, 77 + n as u64);
let mut band_rust = band.clone();
let mut x_rust = rhs.clone();
assert!(solve_spd_band(n, kd, &mut band_rust, &mut x_rust));
let mut band_lapack = band.clone();
let mut x_lapack = rhs.clone();
assert!(solve_spd_band_lapack(n, kd, &mut band_lapack, &mut x_lapack));
let mut max_diff = 0.0f64;
let x_norm: f64 = x_rust.iter().map(|v| v * v).sum::<f64>().sqrt();
for i in 0..n {
max_diff = max_diff.max((x_rust[i] - x_lapack[i]).abs());
}
let rel_diff = max_diff / x_norm;
assert!(rel_diff < 1e-12,
"n={}, kd={}: Rust vs LAPACK relative diff {:.2e}", n, kd, rel_diff);
}
}
// COO -> CSC conversion of a small upper-triangular pattern: checks the
// column pointers, row indices, values, and diagonal positions exactly.
#[test]
fn coo_to_csc_basic() {
let mut coo = CooMatrix::<f64>::new(3);
coo.push(0, 0, 4.0);
coo.push(0, 1, 1.0);
coo.push(1, 1, 3.0);
coo.push(1, 2, 2.0);
coo.push(2, 2, 5.0);
let csc = coo.to_csc();
assert_eq!(csc.n, 3);
assert_eq!(csc.col_ptr, vec![0, 1, 3, 5]);
assert_eq!(csc.row_idx, vec![0, 0, 1, 1, 2]);
assert_eq!(csc.vals, vec![4.0, 1.0, 3.0, 2.0, 5.0]);
assert_eq!(csc.diag_pos, vec![0, 2, 4]);
}
// Duplicate coordinates (two pushes at (0,0)) must be summed into one slot.
#[test]
fn coo_to_csc_duplicates() {
let mut coo = CooMatrix::<f64>::new(2);
coo.push(0, 0, 3.0);
coo.push(0, 1, 1.0);
coo.push(0, 0, 2.0); coo.push(1, 1, 4.0);
let csc = coo.to_csc();
assert_eq!(csc.vals, vec![5.0, 1.0, 4.0]);
}
// Build a scatter map once, refill the COO with new values in the same
// pattern, and scatter into a CSC with the original structure.
#[test]
fn coo_scatter_roundtrip() {
let mut coo = CooMatrix::<f64>::new(3);
coo.push(0, 0, 4.0);
coo.push(0, 1, 1.0);
coo.push(1, 1, 3.0);
coo.push(1, 2, 2.0);
coo.push(2, 2, 5.0);
let csc = coo.to_csc();
let scatter_map = coo.build_scatter_map(&csc);
coo.clear();
coo.push(0, 0, 10.0);
coo.push(0, 1, 20.0);
coo.push(1, 1, 30.0);
coo.push(1, 2, 40.0);
coo.push(2, 2, 50.0);
let mut csc2 = CscMatrix::<f64>::empty(3);
csc2.col_ptr = csc.col_ptr.clone();
csc2.row_idx = csc.row_idx.clone();
csc2.vals = vec![0.0; csc.nnz()];
csc2.diag_pos = csc.diag_pos.clone();
coo.scatter_into(&mut csc2, &scatter_map);
assert_eq!(csc2.vals, vec![10.0, 20.0, 30.0, 40.0, 50.0]);
}
// End-to-end LM through the built-in sparse path on a separable quadratic
// with minimum at (3, 7). Only the COO assembly callback is implemented;
// the other problem hooks are unused by this solver and left unimplemented.
#[test]
fn sparse_solver_quadratic() {
struct QuadProblem;
impl LmProblem<f64> for QuadProblem {
fn calc_cost(&mut self, x: &[f64]) -> f64 {
(x[0] - 3.0) * (x[0] - 3.0) + (x[1] - 7.0) * (x[1] - 7.0)
}
fn calc_grad_hessian_dense(&mut self, x: &[f64], grad: &mut [f64], hess: &mut [f64]) {
grad[0] = 2.0 * (x[0] - 3.0); grad[1] = 2.0 * (x[1] - 7.0);
hess[0] = 2.0; hess[1] = 0.0; hess[2] = 0.0; hess[3] = 2.0;
}
fn calc_grad_hessian_band(&mut self, _: &[f64], _: &mut [f64], _: &mut [f64], _: usize) -> Result<(), BandError> {
unimplemented!()
}
fn calc_grad_hessian_sparse_direct(&mut self, _: &[f64], _: &mut [f64], _: &mut CscMatrix<f64>) { unimplemented!() }
fn calc_grad_hessian_sparse_indexed(&mut self, _: &[f64], _: &mut [f64], _: &mut [f64], _: &[usize]) { unimplemented!() }
fn calc_grad_hessian_sparse(&mut self, x: &[f64], grad: &mut [f64], coo: &mut CooMatrix<f64>) {
grad[0] = 2.0 * (x[0] - 3.0); grad[1] = 2.0 * (x[1] - 7.0);
coo.clear();
coo.push(0, 0, 2.0);
coo.push(1, 1, 2.0);
}
}
let result = solve_sparse(
&[0.0, 0.0],
&mut QuadProblem,
&LmConfig { abs_precision: 1e-10, max_iters: 100, initial_lambda: 0.001, ..Default::default() },
);
assert!((result.x[0] - 3.0).abs() < 1e-6, "x={}", result.x[0]);
assert!((result.x[1] - 7.0).abs() < 1e-6, "y={}", result.x[1]);
assert!(result.end_cost < 1e-10);
}
// Same quadratic through the faer-backed sparse path; this solver also
// exercises the indexed (pattern-cached) Hessian fill callback.
#[test]
fn sparse_faer_quadratic() {
struct QuadProblem;
impl LmProblem<f64> for QuadProblem {
fn calc_cost(&mut self, x: &[f64]) -> f64 {
(x[0] - 3.0) * (x[0] - 3.0) + (x[1] - 7.0) * (x[1] - 7.0)
}
fn calc_grad_hessian_dense(&mut self, x: &[f64], grad: &mut [f64], hess: &mut [f64]) {
grad[0] = 2.0 * (x[0] - 3.0); grad[1] = 2.0 * (x[1] - 7.0);
hess[0] = 2.0; hess[1] = 0.0; hess[2] = 0.0; hess[3] = 2.0;
}
fn calc_grad_hessian_band(&mut self, _: &[f64], _: &mut [f64], _: &mut [f64], _: usize) -> Result<(), BandError> { unimplemented!() }
fn calc_grad_hessian_sparse_direct(&mut self, _: &[f64], _: &mut [f64], _: &mut CscMatrix<f64>) { unimplemented!() }
fn calc_grad_hessian_sparse_indexed(&mut self, x: &[f64], grad: &mut [f64], vals: &mut [f64], positions: &[usize]) {
grad[0] = 2.0 * (x[0] - 3.0); grad[1] = 2.0 * (x[1] - 7.0);
vals.iter_mut().for_each(|v| *v = 0.0);
vals[positions[0]] += 2.0;
vals[positions[1]] += 2.0;
}
fn calc_grad_hessian_sparse(&mut self, x: &[f64], grad: &mut [f64], coo: &mut CooMatrix<f64>) {
grad[0] = 2.0 * (x[0] - 3.0); grad[1] = 2.0 * (x[1] - 7.0);
coo.clear();
coo.push(0, 0, 2.0);
coo.push(1, 1, 2.0);
}
}
let result = solve_sparse_faer(
&[0.0, 0.0],
&mut QuadProblem,
&LmConfig { abs_precision: 1e-10, max_iters: 100, initial_lambda: 0.001, ..Default::default() },
);
assert!((result.x[0] - 3.0).abs() < 1e-6, "x={}", result.x[0]);
assert!((result.x[1] - 7.0).abs() < 1e-6, "y={}", result.x[1]);
assert!(result.end_cost < 1e-10);
}
// Symmetric matvec with upper-triangle-only storage: A = [[4,1],[1,3]],
// x = [1,2] => y = [6,7] (the off-diagonal contributes to both rows).
#[test]
fn csc_symv() {
let mut coo = CooMatrix::<f64>::new(2);
coo.push(0, 0, 4.0);
coo.push(0, 1, 1.0);
coo.push(1, 1, 3.0);
let csc = coo.to_csc();
let mut y = vec![0.0; 2];
csc.symv(&[1.0, 2.0], &mut y);
assert!((y[0] - 6.0).abs() < 1e-12);
assert!((y[1] - 7.0).abs() < 1e-12);
}
// A 4-variable problem with cross-coupling terms: verifies that the dense
// and sparse (upper-triangle COO) Hessian formulations agree entry-by-entry
// and that both solver paths reach the same minimum.
#[test]
fn dense_vs_sparse_hessian_equal() {
struct CoupledProblem;
impl LmProblem<f64> for CoupledProblem {
fn calc_cost(&mut self, x: &[f64]) -> f64 {
(x[0]-1.0).powi(2) + (x[1]-2.0).powi(2) + (x[2]-3.0).powi(2) + (x[3]-4.0).powi(2)
+ (x[0]-x[2]).powi(2) + (x[1]-x[3]).powi(2)
}
fn calc_grad_hessian_dense(&mut self, x: &[f64], g: &mut [f64], h: &mut [f64]) {
g[0] = 2.0*(x[0]-1.0) + 2.0*(x[0]-x[2]);
g[1] = 2.0*(x[1]-2.0) + 2.0*(x[1]-x[3]);
g[2] = 2.0*(x[2]-3.0) - 2.0*(x[0]-x[2]);
g[3] = 2.0*(x[3]-4.0) - 2.0*(x[1]-x[3]);
h[0*4+0]=4.0; h[0*4+1]=0.0; h[0*4+2]=-2.0; h[0*4+3]=0.0;
h[1*4+0]=0.0; h[1*4+1]=4.0; h[1*4+2]=0.0; h[1*4+3]=-2.0;
h[2*4+0]=-2.0;h[2*4+1]=0.0; h[2*4+2]=4.0; h[2*4+3]=0.0;
h[3*4+0]=0.0; h[3*4+1]=-2.0;h[3*4+2]=0.0; h[3*4+3]=4.0;
}
fn calc_grad_hessian_band(&mut self, _: &[f64], _: &mut [f64], _: &mut [f64], _: usize) -> Result<(), BandError> { unimplemented!() }
fn calc_grad_hessian_sparse_direct(&mut self, _: &[f64], _: &mut [f64], _: &mut CscMatrix<f64>) { unimplemented!() }
fn calc_grad_hessian_sparse_indexed(&mut self, _: &[f64], _: &mut [f64], _: &mut [f64], _: &[usize]) { unimplemented!() }
fn calc_grad_hessian_sparse(&mut self, x: &[f64], g: &mut [f64], coo: &mut CooMatrix<f64>) {
g[0] = 2.0*(x[0]-1.0) + 2.0*(x[0]-x[2]);
g[1] = 2.0*(x[1]-2.0) + 2.0*(x[1]-x[3]);
g[2] = 2.0*(x[2]-3.0) - 2.0*(x[0]-x[2]);
g[3] = 2.0*(x[3]-4.0) - 2.0*(x[1]-x[3]);
coo.clear();
coo.push(0, 0, 4.0);
coo.push(0, 2, -2.0);
coo.push(1, 1, 4.0);
coo.push(1, 3, -2.0);
coo.push(2, 2, 4.0);
coo.push(3, 3, 4.0);
}
}
let x = [0.0, 0.0, 0.0, 0.0];
let n = 4;
let mut grad_dense = vec![0.0; n];
let mut hessian = vec![0.0; n * n];
CoupledProblem.calc_grad_hessian_dense(&x, &mut grad_dense, &mut hessian);
let mut grad_sparse = vec![0.0; n];
let mut coo = CooMatrix::new(n);
CoupledProblem.calc_grad_hessian_sparse(&x, &mut grad_sparse, &mut coo);
let csc = coo.to_csc();
for i in 0..n {
assert!((grad_dense[i] - grad_sparse[i]).abs() < 1e-12,
"grad[{}]: dense={}, sparse={}", i, grad_dense[i], grad_sparse[i]);
}
// Expand the upper-triangle CSC into a full dense matrix by mirroring.
let mut sparse_as_dense = vec![0.0; n * n];
for j in 0..n {
for k in csc.col_ptr[j]..csc.col_ptr[j + 1] {
let i = csc.row_idx[k] as usize;
let v = csc.vals[k];
sparse_as_dense[i * n + j] = v;
if i != j { sparse_as_dense[j * n + i] = v; }
}
}
for i in 0..n {
for j in 0..n {
assert!((hessian[i * n + j] - sparse_as_dense[i * n + j]).abs() < 1e-12,
"H[{},{}]: dense={}, sparse={}", i, j, hessian[i*n+j], sparse_as_dense[i*n+j]);
}
}
let config = LmConfig { abs_precision: 1e-10, max_iters: 100, initial_lambda: 0.001, ..Default::default() };
let r_dense = solve(&x, &mut CoupledProblem, &config);
let r_sparse = solve_sparse(&x, &mut CoupledProblem, &config);
for i in 0..n {
assert!((r_dense.x[i] - r_sparse.x[i]).abs() < 1e-6,
"x[{}]: dense={}, sparse={}", i, r_dense.x[i], r_sparse.x[i]);
}
}
// Quadratic with minimum at (3, 7) through the Eigen FFI solver path.
// Bug fix: the LmConfig literal previously listed only 4 of the struct's
// fields with no `..Default::default()`, which does not compile when the
// "eigen" feature is enabled; it now matches the other tests in this module.
#[test]
#[cfg(feature = "eigen")]
fn sparse_eigen_quadratic() {
struct QP;
impl LmProblem<f64> for QP {
fn calc_cost(&mut self, x: &[f64]) -> f64 {
(x[0]-3.0)*(x[0]-3.0) + (x[1]-7.0)*(x[1]-7.0)
}
fn calc_grad_hessian_dense(&mut self, x: &[f64], g: &mut [f64], h: &mut [f64]) {
g[0]=2.0*(x[0]-3.0); g[1]=2.0*(x[1]-7.0);
h[0]=2.0; h[1]=0.0; h[2]=0.0; h[3]=2.0;
}
fn calc_grad_hessian_band(&mut self,_:&[f64],_:&mut[f64],_:&mut[f64],_:usize)->Result<(),BandError>{unimplemented!()}
fn calc_grad_hessian_sparse_direct(&mut self,_:&[f64],_:&mut[f64],_:&mut CscMatrix<f64>){unimplemented!()}
fn calc_grad_hessian_sparse_indexed(&mut self, x: &[f64], g: &mut [f64], vals: &mut [f64], pos: &[usize]) {
g[0]=2.0*(x[0]-3.0); g[1]=2.0*(x[1]-7.0);
vals.iter_mut().for_each(|v| *v = 0.0);
vals[pos[0]] += 2.0; vals[pos[1]] += 2.0;
}
fn calc_grad_hessian_sparse(&mut self, x: &[f64], g: &mut [f64], coo: &mut CooMatrix<f64>) {
g[0]=2.0*(x[0]-3.0); g[1]=2.0*(x[1]-7.0);
coo.clear(); coo.push(0,0,2.0); coo.push(1,1,2.0);
}
}
let r = solve_sparse_eigen(&[0.0,0.0], &mut QP,
&LmConfig{abs_precision:1e-10, max_iters:100, initial_lambda:0.001, ..Default::default()});
assert!((r.x[0]-3.0).abs()<1e-6, "x={}", r.x[0]);
assert!((r.x[1]-7.0).abs()<1e-6, "y={}", r.x[1]);
assert!(r.end_cost < 1e-10);
}
// Coupled 4-variable problem: the Eigen-backed solver must agree with the
// dense solver. Bug fix: the LmConfig literal previously listed only 4 of
// the struct's fields with no `..Default::default()`, which does not
// compile when the "eigen" feature is enabled.
#[test]
#[cfg(feature = "eigen")]
fn sparse_eigen_coupled() {
struct CP;
impl LmProblem<f64> for CP {
fn calc_cost(&mut self, x: &[f64]) -> f64 {
(x[0]-1.0).powi(2)+(x[1]-2.0).powi(2)+(x[2]-3.0).powi(2)+(x[3]-4.0).powi(2)
+(x[0]-x[2]).powi(2)+(x[1]-x[3]).powi(2)
}
fn calc_grad_hessian_dense(&mut self, x: &[f64], g: &mut [f64], h: &mut [f64]) {
g[0]=2.0*(x[0]-1.0)+2.0*(x[0]-x[2]); g[1]=2.0*(x[1]-2.0)+2.0*(x[1]-x[3]);
g[2]=2.0*(x[2]-3.0)-2.0*(x[0]-x[2]); g[3]=2.0*(x[3]-4.0)-2.0*(x[1]-x[3]);
h[0]=4.0;h[1]=0.0;h[2]=-2.0;h[3]=0.0;
h[4]=0.0;h[5]=4.0;h[6]=0.0;h[7]=-2.0;
h[8]=-2.0;h[9]=0.0;h[10]=4.0;h[11]=0.0;
h[12]=0.0;h[13]=-2.0;h[14]=0.0;h[15]=4.0;
}
fn calc_grad_hessian_band(&mut self,_:&[f64],_:&mut[f64],_:&mut[f64],_:usize)->Result<(),BandError>{unimplemented!()}
fn calc_grad_hessian_sparse_direct(&mut self,_:&[f64],_:&mut[f64],_:&mut CscMatrix<f64>){unimplemented!()}
fn calc_grad_hessian_sparse_indexed(&mut self, x: &[f64], g: &mut [f64], vals: &mut [f64], pos: &[usize]) {
g[0]=2.0*(x[0]-1.0)+2.0*(x[0]-x[2]); g[1]=2.0*(x[1]-2.0)+2.0*(x[1]-x[3]);
g[2]=2.0*(x[2]-3.0)-2.0*(x[0]-x[2]); g[3]=2.0*(x[3]-4.0)-2.0*(x[1]-x[3]);
vals.iter_mut().for_each(|v| *v = 0.0);
vals[pos[0]] += 4.0; vals[pos[1]] += -2.0;
vals[pos[2]] += 4.0; vals[pos[3]] += -2.0;
vals[pos[4]] += 4.0; vals[pos[5]] += 4.0;
}
fn calc_grad_hessian_sparse(&mut self, x: &[f64], g: &mut [f64], coo: &mut CooMatrix<f64>) {
g[0]=2.0*(x[0]-1.0)+2.0*(x[0]-x[2]); g[1]=2.0*(x[1]-2.0)+2.0*(x[1]-x[3]);
g[2]=2.0*(x[2]-3.0)-2.0*(x[0]-x[2]); g[3]=2.0*(x[3]-4.0)-2.0*(x[1]-x[3]);
coo.clear();
coo.push(0,0,4.0); coo.push(0,2,-2.0);
coo.push(1,1,4.0); coo.push(1,3,-2.0);
coo.push(2,2,4.0); coo.push(3,3,4.0);
}
}
let cfg = LmConfig{abs_precision:1e-10, max_iters:100, initial_lambda:0.001, ..Default::default()};
let rd = solve(&[0.0;4], &mut CP, &cfg);
let re = solve_sparse_eigen(&[0.0;4], &mut CP, &cfg);
for i in 0..4 {
assert!((rd.x[i]-re.x[i]).abs()<1e-6, "x[{}]: dense={}, eigen={}", i, rd.x[i], re.x[i]);
}
}
#[test]
#[cfg(feature = "cholmod")]
fn sparse_cholmod_quadratic() {
    // Decoupled quadratic (x0 - 3)^2 + (x1 - 7)^2: the CHOLMOD sparse
    // solver must converge to (3, 7) with (near-)zero cost.
    struct QP;
    impl LmProblem<f64> for QP {
        fn calc_cost(&mut self, x: &[f64]) -> f64 {
            (x[0] - 3.0) * (x[0] - 3.0) + (x[1] - 7.0) * (x[1] - 7.0)
        }
        fn calc_grad_hessian_dense(&mut self, x: &[f64], g: &mut [f64], h: &mut [f64]) {
            g[0] = 2.0 * (x[0] - 3.0);
            g[1] = 2.0 * (x[1] - 7.0);
            // Constant row-major 2x2 Hessian: diag(2, 2).
            h[0] = 2.0; h[1] = 0.0; h[2] = 0.0; h[3] = 2.0;
        }
        // Unused solver paths for this test.
        fn calc_grad_hessian_band(&mut self, _: &[f64], _: &mut [f64], _: &mut [f64], _: usize) -> Result<(), BandError> {
            unimplemented!()
        }
        fn calc_grad_hessian_sparse_direct(&mut self, _: &[f64], _: &mut [f64], _: &mut CscMatrix<f64>) {
            unimplemented!()
        }
        fn calc_grad_hessian_sparse_indexed(&mut self, x: &[f64], g: &mut [f64], vals: &mut [f64], pos: &[usize]) {
            g[0] = 2.0 * (x[0] - 3.0);
            g[1] = 2.0 * (x[1] - 7.0);
            // Zero the value buffer, then scatter the two diagonal
            // entries via the index map.
            vals.iter_mut().for_each(|v| *v = 0.0);
            vals[pos[0]] += 2.0;
            vals[pos[1]] += 2.0;
        }
        fn calc_grad_hessian_sparse(&mut self, x: &[f64], g: &mut [f64], coo: &mut CooMatrix<f64>) {
            g[0] = 2.0 * (x[0] - 3.0);
            g[1] = 2.0 * (x[1] - 7.0);
            // Rebuild the sparse Hessian from scratch: diag(2, 2).
            coo.clear();
            coo.push(0, 0, 2.0);
            coo.push(1, 1, 2.0);
        }
    }
    // BUG FIX: the original literal listed only four of LmConfig's
    // eight fields, which is a compile error (E0063); the rest are
    // now filled from Default.
    let r = solve_sparse_cholmod(
        &[0.0, 0.0],
        &mut QP,
        &LmConfig {
            abs_precision: 1e-10,
            max_iters: 100,
            initial_lambda: 0.001,
            verbose: false,
            ..Default::default()
        },
    );
    assert!((r.x[0] - 3.0).abs() < 1e-6, "x={}", r.x[0]);
    assert!((r.x[1] - 7.0).abs() < 1e-6, "y={}", r.x[1]);
    assert!(r.end_cost < 1e-10);
}
}