echidna-optim 0.13.0

//! Sparse implicit differentiation via the Implicit Function Theorem.
//!
//! Exploits structural sparsity in the Jacobian `∂F/∂z` (the state block) to compute
//! implicit derivatives more efficiently for large-scale systems where F_z is sparse
//! (e.g., PDE discretizations, banded systems).
//!
//! Uses echidna's sparse Jacobian infrastructure (sparsity detection, graph coloring,
//! compressed evaluation) combined with faer's sparse LU solver.
//!
//! # Overview
//!
//! Given a residual function `F: R^(m+n) → R^m` with `F(z*, x) = 0`, the three public
//! functions mirror their dense counterparts in [`crate::implicit`]:
//!
//! - [`implicit_tangent_sparse`]: `dz*/dx · ẋ` via sparse LU solve
//! - [`implicit_adjoint_sparse`]: `(dz*/dx)^T · z̄` via sparse transpose solve
//! - [`implicit_jacobian_sparse`]: full `dz*/dx` matrix via column-wise solves
//!
//! The [`SparseImplicitContext`] precomputes sparsity pattern, graph coloring, and
//! COO index partitions so they can be reused across multiple evaluation points.

use std::fmt;

use echidna::sparse::{column_coloring, row_coloring, JacobianSparsityPattern};
use echidna::BytecodeTape;

use faer::linalg::solvers::Solve;
use faer::sparse::SparseColMat;
use faer::Col;

/// Reason a sparse implicit-differentiation solve failed.
///
/// Marked `#[non_exhaustive]` so future variants can be added without
/// breaking exhaustive `match`es. Numeric fields use `f64` to match the
/// monomorphic `f64` API of this module.
#[non_exhaustive]
#[derive(Debug)]
pub enum SparseImplicitError {
    /// Constructing the sparse F_z matrix from triplets failed (e.g.
    /// duplicate entries or pattern mismatch). Almost always indicates
    /// a bug in the sparsity-pattern computation rather than user input.
    /// The underlying `faer::sparse::CreationError` is available via
    /// [`std::error::Error::source`].
    StructuralSingular {
        source: Box<dyn std::error::Error + Send + Sync + 'static>,
    },
    /// faer's sparse LU factorization itself returned an error
    /// (typically signals an exact zero pivot detected during
    /// elimination — i.e. F_z is exactly singular). The underlying
    /// `faer::sparse::linalg::LuError` is available via
    /// [`std::error::Error::source`] and typically surfaces as
    /// `SymbolicSingular { index }`, pinpointing the failing column.
    FactorSingular {
        source: Box<dyn std::error::Error + Send + Sync + 'static>,
    },
    /// LU factorization succeeded but the solve produced a non-finite
    /// solution to the mixed-sign probe — F_z is numerically singular
    /// to working precision in the probed direction.
    NumericSingular,
    /// LU solve produced a finite solution, but
    /// `||F_z·x − rhs|| / ||rhs|| = relative_residual` exceeds
    /// `tolerance` (which is `sqrt(eps) · sqrt(dimension)`) — F_z is
    /// finite but effectively rank-deficient or extremely
    /// ill-conditioned.
    ResidualExceeded {
        relative_residual: f64,
        tolerance: f64,
        dimension: usize,
    },
    /// A runtime-supplied vector argument to a public `implicit_*_sparse`
    /// fn had an unexpected length. `field` names the argument (e.g.
    /// `"z_star"`, `"x"`, `"x_dot"`, `"z_bar"`), `expected` is the
    /// length the API requires (typically `ctx.num_states()` or
    /// `ctx.num_params()`), `actual` is the length supplied.
    ///
    /// Construction-time mismatches (`SparseImplicitContext::new`)
    /// continue to panic — those are programmer-contract violations
    /// at context build time, not recoverable runtime failures.
    DimensionMismatch {
        field: &'static str,
        expected: usize,
        actual: usize,
    },
}

impl fmt::Display for SparseImplicitError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            SparseImplicitError::StructuralSingular { .. } => {
                write!(f, "sparse_implicit: failed to build sparse F_z matrix")
            }
            SparseImplicitError::FactorSingular { .. } => {
                write!(
                    f,
                    "sparse_implicit: sparse LU factorization failed (F_z singular)"
                )
            }
            SparseImplicitError::NumericSingular => {
                write!(
                    f,
                    "sparse_implicit: probe solve produced non-finite solution (F_z numerically singular)"
                )
            }
            SparseImplicitError::ResidualExceeded {
                relative_residual,
                tolerance,
                dimension,
            } => {
                write!(
                    f,
                    "sparse_implicit: probe solve residual {relative_residual:.3e} exceeds tolerance {tolerance:.3e} (F_z ill-conditioned, dim = {dimension})"
                )
            }
            SparseImplicitError::DimensionMismatch {
                field,
                expected,
                actual,
            } => {
                write!(
                    f,
                    "sparse_implicit: dimension mismatch for `{field}` (expected {expected}, got {actual})"
                )
            }
        }
    }
}

impl std::error::Error for SparseImplicitError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        // `&**source` double-derefs `&Box<dyn Trait>` to `&(dyn Trait)`,
        // coercing through the `Send + Sync` bounds to the plain
        // `dyn Error + 'static` that `source()` expects. Equivalent to
        // relying on trait-upcasting (stable since Rust 1.86) but robust
        // across toolchain quirks.
        //
        // Non-sourced variants are spelled out explicitly (rather than a
        // wildcard `_`) so that a future `#[non_exhaustive]` variant
        // carrying a `source` field produces a compiler error here — the
        // wildcard would silently swallow its chain and defeat the
        // whole point of `source()`.
        match self {
            Self::StructuralSingular { source } | Self::FactorSingular { source } => {
                Some(&**source)
            }
            Self::NumericSingular
            | Self::ResidualExceeded { .. }
            | Self::DimensionMismatch { .. } => None,
        }
    }
}

echidna::assert_send_sync!(SparseImplicitError);

/// Precomputed context for sparse implicit differentiation.
///
/// Stores the sparsity pattern, graph coloring, and index partitions for the full
/// m×(m+n) Jacobian. Construct once per tape, reuse across evaluation points.
pub struct SparseImplicitContext {
    /// Full m×(m+n) Jacobian sparsity pattern.
    pattern: JacobianSparsityPattern,
    /// Column colors (forward mode) or row colors (reverse mode).
    colors: Vec<u32>,
    /// Number of distinct colors.
    num_colors: u32,
    /// Whether to use forward mode (column coloring) for Jacobian computation.
    forward_mode: bool,
    /// Number of state variables (m).
    num_states: usize,
    /// Number of parameters (n = num_inputs - num_states).
    num_params: usize,
    /// COO indices where col < m (F_z entries).
    fz_indices: Vec<usize>,
    /// COO indices where col >= m (F_x entries).
    fx_indices: Vec<usize>,
    /// `fx_indices` grouped by parameter column: `fx_by_col[j]` contains indices
    /// into the COO arrays where `col == m + j`.
    fx_by_col: Vec<Vec<usize>>,
}

impl SparseImplicitContext {
    /// Create a new sparse implicit context from a multi-output tape.
    ///
    /// # Arguments
    ///
    /// * `tape` - A bytecode tape representing `F: R^(m+n) → R^m`
    /// * `num_states` - Number of state variables `m` (first `m` inputs are states)
    ///
    /// # Panics
    ///
    /// Panics if `tape.num_outputs() != num_states` or `tape.num_inputs() <= num_states`.
    pub fn new(tape: &BytecodeTape<f64>, num_states: usize) -> Self {
        let m = num_states;
        assert_eq!(
            tape.num_outputs(),
            m,
            "tape.num_outputs() ({}) must equal num_states ({})",
            tape.num_outputs(),
            m
        );
        assert!(
            tape.num_inputs() > m,
            "tape.num_inputs() ({}) must be greater than num_states ({})",
            tape.num_inputs(),
            m
        );
        let n = tape.num_inputs() - m;

        // Detect full Jacobian sparsity
        let pattern = tape.detect_jacobian_sparsity();

        // Try both colorings, pick fewer colors
        let (col_colors, col_ncolors) = column_coloring(&pattern);
        let (row_colors, row_ncolors) = row_coloring(&pattern);

        let (colors, num_colors, forward_mode) = if col_ncolors <= row_ncolors {
            (col_colors, col_ncolors, true)
        } else {
            (row_colors, row_ncolors, false)
        };

        // Partition COO indices into F_z and F_x blocks
        let mut fz_indices = Vec::new();
        let mut fx_indices = Vec::new();
        let mut fx_by_col: Vec<Vec<usize>> = vec![Vec::new(); n];

        for k in 0..pattern.nnz() {
            let col = pattern.cols[k] as usize;
            if col < m {
                fz_indices.push(k);
            } else {
                fx_indices.push(k);
                fx_by_col[col - m].push(k);
            }
        }

        SparseImplicitContext {
            pattern,
            colors,
            num_colors,
            forward_mode,
            num_states: m,
            num_params: n,
            fz_indices,
            fx_indices,
            fx_by_col,
        }
    }

    /// Number of state variables (m).
    pub fn num_states(&self) -> usize {
        self.num_states
    }

    /// Number of parameters (n).
    pub fn num_params(&self) -> usize {
        self.num_params
    }

    /// Number of structural non-zeros in the full Jacobian pattern.
    pub fn nnz(&self) -> usize {
        self.pattern.nnz()
    }

    /// Number of structural non-zeros in the F_z block.
    pub fn fz_nnz(&self) -> usize {
        self.fz_indices.len()
    }

    /// Number of structural non-zeros in the F_x block.
    pub fn fx_nnz(&self) -> usize {
        self.fx_indices.len()
    }
}

// ══════════════════════════════════════════════
//  Private helpers
// ══════════════════════════════════════════════

/// Extract F_z triplets from COO Jacobian values.
fn extract_fz_triplets(
    ctx: &SparseImplicitContext,
    jac_values: &[f64],
) -> Vec<faer::sparse::Triplet<usize, usize, f64>> {
    ctx.fz_indices
        .iter()
        .map(|&k| faer::sparse::Triplet {
            row: ctx.pattern.rows[k] as usize,
            col: ctx.pattern.cols[k] as usize,
            val: jac_values[k],
        })
        .collect()
}

/// Build sparse F_z and compute LU factorization.
///
/// Returns `Err` if matrix construction or LU factorization fails, if
/// the probe solve produces non-finite output, or if the relative
/// residual exceeds the tolerance. Numeric singularity is detected by
/// solving a test system with a mixed-sign RHS — this avoids the null-
/// space direction an all-ones vector would hit for matrices like
/// `diag(1,-1)` rotated into the state basis.
fn build_fz_and_factor(
    ctx: &SparseImplicitContext,
    jac_values: &[f64],
) -> Result<faer::sparse::linalg::solvers::Lu<usize, f64>, SparseImplicitError> {
    let m = ctx.num_states;
    let triplets = extract_fz_triplets(ctx, jac_values);
    let mat = SparseColMat::<usize, f64>::try_new_from_triplets(m, m, &triplets).map_err(|e| {
        SparseImplicitError::StructuralSingular {
            source: Box::new(e),
        }
    })?;
    let lu = mat
        .sp_lu()
        .map_err(|e| SparseImplicitError::FactorSingular {
            source: Box::new(e),
        })?;

    // Mixed-sign probe RHS — avoids the null-space direction an all-ones
    // vector would hit for matrices like `diag(1,-1)` rotated into the
    // state basis.
    let test_rhs_vec: Vec<f64> = (0..m)
        .map(|i| if i % 2 == 0 { 1.0 } else { -1.0 })
        .collect();
    let test_rhs = Col::<f64>::from_fn(m, |i| test_rhs_vec[i]);
    let test_sol = lu.solve(&test_rhs);
    if (0..m).any(|i| !test_sol[i].is_finite()) {
        return Err(SparseImplicitError::NumericSingular);
    }

    // Residual check: compute `F_z · test_sol` via the COO F_z entries and
    // compare against `test_rhs`. This is a defensive check that catches
    // LU-solver failures which produce a finite-but-wrong solution
    // (e.g., internal numerical issues that bypass faer's own singularity
    // detection). For faer's well-tested sparse LU the residual is
    // generally near machine-eps even on ill-conditioned matrices — the
    // primary signal against singularity is the `sp_lu()` error above.
    let test_sol_slice: Vec<f64> = (0..m).map(|i| test_sol[i]).collect();
    let mut applied = vec![0.0_f64; m];
    for &k in &ctx.fz_indices {
        let row = ctx.pattern.rows[k] as usize;
        let col = ctx.pattern.cols[k] as usize;
        applied[row] += jac_values[k] * test_sol_slice[col];
    }
    let mut resid_sq = 0.0_f64;
    let mut rhs_sq = 0.0_f64;
    for i in 0..m {
        let r = applied[i] - test_rhs_vec[i];
        resid_sq += r * r;
        rhs_sq += test_rhs_vec[i] * test_rhs_vec[i];
    }
    // Tolerance: sqrt of machine epsilon scaled by dimension. Permissive
    // enough that a correctly factored faer LU passes, strict enough that
    // a solve producing residuals many orders of magnitude above machine
    // precision is caught.
    let tol = (f64::EPSILON.sqrt()) * (m as f64).sqrt();
    if !resid_sq.is_finite() {
        return Err(SparseImplicitError::NumericSingular);
    }
    if resid_sq > tol * tol * rhs_sq {
        let relative_residual = (resid_sq / rhs_sq).sqrt();
        return Err(SparseImplicitError::ResidualExceeded {
            relative_residual,
            tolerance: tol,
            dimension: m,
        });
    }

    Ok(lu)
}

/// Compute F_x · v by iterating COO entries.
fn fx_matvec(ctx: &SparseImplicitContext, jac_values: &[f64], v: &[f64]) -> Vec<f64> {
    let m = ctx.num_states;
    let mut result = vec![0.0; m];
    for &k in &ctx.fx_indices {
        let row = ctx.pattern.rows[k] as usize;
        let col = ctx.pattern.cols[k] as usize - m;
        result[row] += jac_values[k] * v[col];
    }
    result
}

/// Compute F_x^T · v by iterating COO entries.
fn fx_transpose_matvec(ctx: &SparseImplicitContext, jac_values: &[f64], v: &[f64]) -> Vec<f64> {
    let m = ctx.num_states;
    let n = ctx.num_params;
    let mut result = vec![0.0; n];
    for &k in &ctx.fx_indices {
        let row = ctx.pattern.rows[k] as usize;
        let col = ctx.pattern.cols[k] as usize - m;
        result[col] += jac_values[k] * v[row];
    }
    result
}

/// Compute the sparse Jacobian at the given evaluation point.
///
/// Returns `(outputs, jac_values)` where `jac_values` are in COO order matching `ctx.pattern`.
fn compute_sparse_jacobian(
    tape: &mut BytecodeTape<f64>,
    z_star: &[f64],
    x: &[f64],
    ctx: &SparseImplicitContext,
) -> (Vec<f64>, Vec<f64>) {
    let mut inputs = Vec::with_capacity(ctx.num_states + ctx.num_params);
    inputs.extend_from_slice(z_star);
    inputs.extend_from_slice(x);

    let (outputs, jac_values) = tape.sparse_jacobian_with_pattern(
        &inputs,
        &ctx.pattern,
        &ctx.colors,
        ctx.num_colors,
        ctx.forward_mode,
    );

    // Debug check: warn if residual is not near zero
    #[cfg(debug_assertions)]
    {
        let norm_sq: f64 = outputs.iter().map(|v| v * v).sum();
        let norm = norm_sq.sqrt();
        if norm > 1e-6 {
            eprintln!(
                "WARNING: sparse implicit differentiation called with ||F(z*, x)|| = {:.6e} > 1e-6. \
                 Derivatives may be meaningless if z* is not a root.",
                norm
            );
        }
    }

    (outputs, jac_values)
}

/// Convert a `Col<f64>` to a `Vec<f64>`.
fn col_to_vec(col: &Col<f64>, len: usize) -> Vec<f64> {
    (0..len).map(|i| col[i]).collect()
}

// ══════════════════════════════════════════════
//  Public API
// ══════════════════════════════════════════════

/// Compute the implicit tangent `dz*/dx · ẋ` using sparse Jacobian evaluation and sparse LU.
///
/// This is the sparse analogue of [`crate::implicit_tangent`]. It solves:
///
///   `F_z · ż = -(F_x · ẋ)`
///
/// where F_z and F_x are computed via graph-coloring-compressed forward/reverse passes,
/// and F_z is factorized using faer's sparse LU.
///
/// Returns `Err(SparseImplicitError)` if F_z is singular or
/// numerically degenerate; the variant pinpoints which check failed.
///
/// Runtime vector-length mismatches (`z_star`, `x`, `x_dot`, or
/// `z_bar` not matching `ctx.num_states()` / `ctx.num_params()`)
/// surface as `Err(SparseImplicitError::DimensionMismatch)`. The
/// tape-shape contract is still checked at `SparseImplicitContext::new`
/// construction time, where mismatches panic — those are programmer
/// errors, not recoverable runtime failures.
pub fn implicit_tangent_sparse(
    tape: &mut BytecodeTape<f64>,
    z_star: &[f64],
    x: &[f64],
    x_dot: &[f64],
    ctx: &SparseImplicitContext,
) -> Result<Vec<f64>, SparseImplicitError> {
    let m = ctx.num_states;
    let n = ctx.num_params;
    if z_star.len() != m {
        return Err(SparseImplicitError::DimensionMismatch {
            field: "z_star",
            expected: m,
            actual: z_star.len(),
        });
    }
    if x.len() != n {
        return Err(SparseImplicitError::DimensionMismatch {
            field: "x",
            expected: n,
            actual: x.len(),
        });
    }
    if x_dot.len() != n {
        return Err(SparseImplicitError::DimensionMismatch {
            field: "x_dot",
            expected: n,
            actual: x_dot.len(),
        });
    }

    let (_outputs, jac_values) = compute_sparse_jacobian(tape, z_star, x, ctx);

    // F_x · ẋ
    let fx_xdot = fx_matvec(ctx, &jac_values, x_dot);

    // Sparse LU factorize F_z
    let lu = build_fz_and_factor(ctx, &jac_values)?;

    // Solve F_z · ż = -(F_x · ẋ)
    let rhs = Col::<f64>::from_fn(m, |i| -fx_xdot[i]);
    let sol = lu.solve(&rhs);

    Ok(col_to_vec(&sol, m))
}

/// Compute the implicit adjoint `(dz*/dx)^T · z̄` using sparse Jacobian evaluation and sparse LU.
///
/// This is the sparse analogue of [`crate::implicit_adjoint`]. It solves:
///
///   `F_z^T · λ = z̄`
///
/// then computes `x̄ = -F_x^T · λ`.
///
/// Returns `Err(SparseImplicitError)` if F_z is singular or
/// numerically degenerate; the variant pinpoints which check failed.
///
/// Runtime vector-length mismatches (`z_star`, `x`, `x_dot`, or
/// `z_bar` not matching `ctx.num_states()` / `ctx.num_params()`)
/// surface as `Err(SparseImplicitError::DimensionMismatch)`. The
/// tape-shape contract is still checked at `SparseImplicitContext::new`
/// construction time, where mismatches panic — those are programmer
/// errors, not recoverable runtime failures.
pub fn implicit_adjoint_sparse(
    tape: &mut BytecodeTape<f64>,
    z_star: &[f64],
    x: &[f64],
    z_bar: &[f64],
    ctx: &SparseImplicitContext,
) -> Result<Vec<f64>, SparseImplicitError> {
    let m = ctx.num_states;
    let n = ctx.num_params;
    if z_star.len() != m {
        return Err(SparseImplicitError::DimensionMismatch {
            field: "z_star",
            expected: m,
            actual: z_star.len(),
        });
    }
    if x.len() != n {
        return Err(SparseImplicitError::DimensionMismatch {
            field: "x",
            expected: n,
            actual: x.len(),
        });
    }
    if z_bar.len() != m {
        return Err(SparseImplicitError::DimensionMismatch {
            field: "z_bar",
            expected: m,
            actual: z_bar.len(),
        });
    }

    let (_outputs, jac_values) = compute_sparse_jacobian(tape, z_star, x, ctx);

    // Sparse LU factorize F_z
    let lu = build_fz_and_factor(ctx, &jac_values)?;

    // Solve F_z^T · λ = z̄
    let rhs = Col::<f64>::from_fn(m, |i| z_bar[i]);
    let lambda = lu.solve_transpose(&rhs);
    let lambda_vec = col_to_vec(&lambda, m);

    // x̄ = -F_x^T · λ
    let fx_t_lambda = fx_transpose_matvec(ctx, &jac_values, &lambda_vec);
    let x_bar: Vec<f64> = fx_t_lambda.iter().map(|&v| -v).collect();

    Ok(x_bar)
}

/// Compute the full implicit Jacobian `dz*/dx` (m × n) using sparse LU.
///
/// This is the sparse analogue of [`crate::implicit_jacobian`]. It solves:
///
///   `F_z · col_j = -F_x[:, j]`   for j = 0..n
///
/// using a single sparse LU factorization of F_z, with column extraction via
/// the pre-grouped `fx_by_col` index.
///
/// Returns a dense m×n matrix since `dz*/dx` has no sparsity guarantee.
/// Returns `Err(SparseImplicitError)` if F_z is singular or
/// numerically degenerate; the variant pinpoints which check failed.
///
/// Runtime vector-length mismatches (`z_star`, `x`, `x_dot`, or
/// `z_bar` not matching `ctx.num_states()` / `ctx.num_params()`)
/// surface as `Err(SparseImplicitError::DimensionMismatch)`. The
/// tape-shape contract is still checked at `SparseImplicitContext::new`
/// construction time, where mismatches panic — those are programmer
/// errors, not recoverable runtime failures.
pub fn implicit_jacobian_sparse(
    tape: &mut BytecodeTape<f64>,
    z_star: &[f64],
    x: &[f64],
    ctx: &SparseImplicitContext,
) -> Result<Vec<Vec<f64>>, SparseImplicitError> {
    let m = ctx.num_states;
    let n = ctx.num_params;
    if z_star.len() != m {
        return Err(SparseImplicitError::DimensionMismatch {
            field: "z_star",
            expected: m,
            actual: z_star.len(),
        });
    }
    if x.len() != n {
        return Err(SparseImplicitError::DimensionMismatch {
            field: "x",
            expected: n,
            actual: x.len(),
        });
    }

    let (_outputs, jac_values) = compute_sparse_jacobian(tape, z_star, x, ctx);

    // Single factorization
    let lu = build_fz_and_factor(ctx, &jac_values)?;

    // Solve for each column of -F_x
    let mut result = vec![vec![0.0; n]; m];
    for (j, fx_col_indices) in ctx.fx_by_col.iter().enumerate() {
        // Build -F_x[:, j] using pre-grouped indices
        let mut neg_col = vec![0.0; m];
        for &k in fx_col_indices {
            let row = ctx.pattern.rows[k] as usize;
            neg_col[row] -= jac_values[k];
        }

        let rhs = Col::<f64>::from_fn(m, |i| neg_col[i]);
        let sol = lu.solve(&rhs);

        for i in 0..m {
            result[i][j] = sol[i];
        }
    }

    Ok(result)
}