eunoia 0.8.0 - Docs.rs

//! Final layout optimization.
//!
//! This module implements the second optimization step that refines the initial
//! layout by minimizing the difference between target exclusive areas and actual
//! fitted areas in the diagram.

use argmin::core::{CostFunction, Error, Executor, Gradient, Hessian, State};
use argmin::solver::linesearch::MoreThuenteLineSearch;
use argmin::solver::neldermead::NelderMead;
use argmin::solver::quasinewton::LBFGS;
use finitediff::vec;
use levenberg_marquardt::{LeastSquaresProblem, LevenbergMarquardt};
use nalgebra::storage::Owned;
use nalgebra::{DMatrix, DVector, Dyn};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use std::cell::RefCell;
use std::collections::HashMap;

use crate::geometry::diagram::RegionMask;
use crate::geometry::traits::DiagramShape;
use crate::loss::LossType;
use crate::spec::PreprocessedSpec;

/// Optimizer to use for final layout optimization.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum Optimizer {
    /// Levenberg-Marquardt with analytic Jacobian. Specialised for the
    /// sum-of-squares loss `LossType::SumSquared`. When configured with a
    /// non-LSQ loss, the dispatch falls back to `Lbfgs` for smooth losses
    /// (RMSE, Stress, …) and to `NelderMead` for non-smooth ones (`Max*`,
    /// `SumAbsolute`, `DiagError`, `SumAbsoluteRegionError`) — the latter
    /// produce zero/discontinuous gradients that stall L-BFGS (issue #45).
    ///
    /// Uses the existing analytical region-area gradients to assemble a
    /// per-residual Jacobian (one row per region mask), then approximates
    /// the Hessian as `JᵀJ`. The Jacobian comes from
    /// [`crate::geometry::traits::DiagramShape::compute_exclusive_regions_with_gradient`];
    /// boundary builders for Circle and Ellipse are documented in `AGENTS.md`.
    LevenbergMarquardt,
    /// L-BFGS. Used as the non-LSQ fallback under the LM dispatch arm and
    /// available directly for losses where LM doesn't apply (RMSE, Stress,
    /// DiagError, …). With analytical gradients it's competitive with LM on
    /// circles, but ~20 orders of magnitude worse on ellipse median loss in
    /// `examples/quality_report` — keep it as a fallback, not as a default.
    Lbfgs,
    /// Nelder-Mead simplex (derivative-free). Last-resort option for losses
    /// where neither LM nor a meaningful gradient is available; kept mostly
    /// because the harness still uses it as a quality lower-bound sentinel.
    NelderMead,
    /// Threshold-fired CMA-ES global escape followed by Levenberg-Marquardt
    /// polish. Default for [`Fitter`].
    ///
    /// Each restart runs plain LM first; if the result is at or below
    /// `Fitter::cmaes_fallback_threshold` (default `1e-3` on the default
    /// `SumSquared` loss) the CMA-ES step is skipped entirely
    /// and the LM result is returned.
    /// Easy specs that LM already crushes pay zero extra wall time.
    ///
    /// When LM stalls above the threshold (e.g. `issue92_3_set_dropped_pair`
    /// at `1.3e-4`, `eulerape_3_set` at `4.4e-4`, `random_4_set` at
    /// `8.5e-3` under plain LM), CMA-ES fires: it samples in a feasible
    /// box around the MDS init (centroid ± span on positions,
    /// `[eps, k·max_radius]` on radii / semi-axes; angles unbounded) and
    /// hands its best point to the same Levenberg-Marquardt residual
    /// problem the `LevenbergMarquardt` arm uses, so the analytical
    /// Jacobian still drives the final tightening. The lower-loss of
    /// {plain LM, CMA-ES → LM polish} is returned, so the path is
    /// strictly non-regressing vs `LevenbergMarquardt`.
    ///
    /// Restricted to the `SumSquared` loss (the LM polish requires it);
    /// non-LSQ losses fall back to L-BFGS for the polish step.
    ///
    /// Cost: when CMA-ES fires, ~λ·max_iters extra function evaluations
    /// on top of LM, with λ = `4 + floor(3 ln n)` for an n-parameter
    /// problem and `max_iters = 100`. On `issue91_6_set` (n=30) that's
    /// ~1400 extra region-area evals per restart. The threshold gate keeps
    /// this off the easy-spec budget.
    ///
    /// [`Fitter`]: crate::Fitter
    #[default]
    CmaEsLm,
}

/// Configuration for final layout optimization.
#[derive(Debug, Clone)]
pub(crate) struct FinalLayoutConfig {
    /// Maximum number of optimization iterations
    pub max_iterations: usize,
    /// Loss function
    pub loss_type: crate::loss::LossType,
    /// Optimizer to use
    pub optimizer: Optimizer,
    /// Cost-change convergence tolerance. Honored by every solver except
    /// Nelder-Mead:
    /// - **L-BFGS**: passed as both `tol_grad` and `tol_cost`. Squaring the
    ///   cost tolerance backfires with central-difference gradients (FD noise
    ///   floor on cost evals is ~`sqrt(EPSILON) × |cost|`), so both share
    ///   `config.tolerance`.
    /// - **Levenberg-Marquardt**: passed as `ftol` (cost-change exit) only.
    ///   `xtol` and `gtol` use the fixed `1e-6` defaults from the
    ///   per-knob fields below.
    /// - **CmaEsLm**: passed as the CMA-ES `fn_tol` (clamped to `≥ 1e-12`),
    ///   plus the LM polish step uses it as above.
    /// - **Nelder-Mead**: ignored — argmin 0.11 doesn't expose a tolerance
    ///   setter, so it runs to `max_iterations`.
    ///
    /// Default `1e-3`, validated against the full 27-spec corpus × 16 seeds
    /// × {Circle, Ellipse} via the `final_tolerance` bench (zero regressions
    /// vs. the prior `1e-6`, with up to ~170× wall-time wins on slow ellipse
    /// specs because LM was previously cost-converged but still grinding the
    /// `xtol`/`gtol` tail). The LM `xtol`/`gtol` knobs stay at `1e-6` to
    /// preserve parameter-space precision.
    pub tolerance: f64,
    /// Per-knob LM stopping tolerance overrides. `None` falls back to the
    /// LM-specific defaults: `xtol = 1e-6`, `ftol = config.tolerance`,
    /// `gtol = 1e-6`. Only honoured by `Optimizer::LevenbergMarquardt` and
    /// the LM polish step of `Optimizer::CmaEsLm`; other optimizers ignore
    /// these.
    pub xtol: Option<f64>,
    pub ftol: Option<f64>,
    pub gtol: Option<f64>,
    /// Seed used for stochastic operations: simulated annealing, and
    /// position-perturbation restarts.
    pub seed: u64,
    /// Number of optimization restarts from perturbed initial circle layouts.
    /// Attempt 0 always uses the unperturbed MDS init; attempts `1..n_restarts`
    /// perturb the circle positions before converting to shape parameters.
    /// Mirrors eulerr's `n_restarts = 10` strategy.
    pub n_restarts: usize,
    /// Loss threshold above which `Optimizer::CmaEsLm` fires the global
    /// CMA-ES escape stage. When plain LM lands at or below this value the
    /// CMA-ES step is skipped entirely, so the easy-spec wall-time cost
    /// drops to that of `Optimizer::LevenbergMarquardt`. Only the
    /// `CmaEsLm` arm consults this; other optimizers ignore it.
    pub cmaes_fallback_threshold: f64,
}

impl Default for FinalLayoutConfig {
    fn default() -> Self {
        Self {
            max_iterations: 200,
            loss_type: crate::loss::LossType::default(),
            optimizer: Optimizer::LevenbergMarquardt,
            tolerance: 1e-3,
            xtol: None,
            ftol: None,
            gtol: None,
            seed: 0xDEAD_BEEF,
            n_restarts: 10,
            // 1e-3 sits well above the ~1e-20 / ~1e-30 floor LM crushes
            // easy specs to and well below the ~1e-2 / ~1e-1 plateaus the
            // hard specs (issue91, issue92, eulerape, …) get stuck on.
            // Picked empirically from `examples/quality_report`: every
            // spec where lm_full lands at machine precision sits at or
            // below 1e-20, every spec where it stalls in a wrong basin
            // sits at or above 1e-4.
            cmaes_fallback_threshold: 1e-3,
        }
    }
}

/// Optimize the final layout by minimizing region error.
///
/// Runs the configured optimizer up to `config.n_restarts` times — attempt 0
/// from the unperturbed MDS initial layout, then attempts `1..n_restarts` from
/// MDS positions perturbed with seeded Gaussian noise. The lowest-loss result
/// across all restarts is returned. This mirrors eulerr's `n_restarts = 10`
/// strategy and helps escape topologically wrong basins where a local optimizer
/// gets stuck (e.g. shapes that should overlap landing disjoint, where the loss
/// is locally flat — see issue #28).
///
/// Returns the optimized parameters as a flat vector along with the loss.
pub(crate) fn optimize_layout<S: DiagramShape + Copy + 'static>(
    spec: &PreprocessedSpec,
    initial_positions: &[f64], // [x0, y0, x1, y1, ..., xn, yn]
    initial_radii: &[f64],     // [r0, r1, ..., rn]
    config: FinalLayoutConfig,
) -> Result<(Vec<f64>, f64), Error> {
    let n_sets = spec.n_sets;
    let params_per_shape = S::n_params();

    // Mean radius drives the perturbation scale; positions are perturbed by
    // ~N(0, mean_radius) on each axis, which is large enough to land in a
    // different basin but small enough to stay near the MDS solution.
    let mean_radius = if !initial_radii.is_empty() {
        initial_radii.iter().sum::<f64>() / initial_radii.len() as f64
    } else {
        1.0
    };

    let mut best: Option<(Vec<f64>, f64)> = None;
    let mut last_err: Option<Error> = None;
    let n_restarts = config.n_restarts.max(1);
    let mut rng = StdRng::seed_from_u64(config.seed ^ 0x52455354_41525453); // "RESTARTS"

    for attempt in 0..n_restarts {
        let mut positions = initial_positions.to_vec();
        if attempt > 0 {
            // Perturb positions with Gaussian noise. Box-Muller two-at-a-time
            // keeps this dependency-free.
            for i in 0..n_sets {
                let u1: f64 = rng.random_range(f64::EPSILON..1.0);
                let u2: f64 = rng.random_range(0.0..1.0);
                let mag = (-2.0 * u1.ln()).sqrt();
                let dx = mag * (2.0 * std::f64::consts::PI * u2).cos();
                let dy = mag * (2.0 * std::f64::consts::PI * u2).sin();
                positions[i * 2] += dx * mean_radius;
                positions[i * 2 + 1] += dy * mean_radius;
            }
        }

        let mut initial_params = Vec::with_capacity(n_sets * params_per_shape);
        for i in 0..n_sets {
            let x = positions[i * 2];
            let y = positions[i * 2 + 1];
            let r = initial_radii[i];
            initial_params.extend(S::optimizer_params_from_circle(x, y, r));
        }
        let initial_param = DVector::from_vec(initial_params);

        // A perturbed start can drive `compute_exclusive_regions` into a
        // degenerate geometry that panics inside the numeric stack
        // (e.g. cubic-equation solver with α≈0). Catch panics so one bad
        // restart doesn't abort the whole fit; a restart that panics or
        // errors is just skipped.
        let attempt_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
            optimize_from_initial::<S>(spec, &initial_param, &config)
        }));

        match attempt_result {
            Ok(Ok((params, loss))) => {
                let params_vec = params.as_slice().to_vec();
                match &best {
                    None => best = Some((params_vec, loss)),
                    Some((_, best_loss)) if loss < *best_loss => best = Some((params_vec, loss)),
                    _ => {}
                }
            }
            Ok(Err(e)) => last_err = Some(e),
            Err(_) => {
                // Panic in optimizer / cost function; skip this restart.
            }
        }
    }

    match best {
        Some(b) => Ok(b),
        None => Err(last_err.unwrap_or_else(|| {
            Error::msg("all optimization restarts failed (panicked or errored)")
        })),
    }
}

/// Run the configured optimizer once from a given initial parameter vector.
///
/// Exposed `pub(crate)` so the [`Fitter`]'s outer-loop attempts can dispatch
/// the final-stage solver directly when they have full shape parameters in
/// hand and want to skip the standard `(positions, radii) → optimizer_params_from_circle`
/// path. The Venn warm-start uses this entry point to feed canonical-Venn
/// shape params (with non-circular ellipses for n ∈ {4, 5}) straight to the
/// optimizer.
///
/// [`Fitter`]: crate::Fitter
pub(crate) fn optimize_from_initial<S: DiagramShape + Copy + 'static>(
    spec: &PreprocessedSpec,
    initial_param: &DVector<f64>,
    config: &FinalLayoutConfig,
) -> Result<(DVector<f64>, f64), Error> {
    let params_per_shape = S::n_params();

    // Choose optimizer and run based on configuration
    let (final_params, loss) = match config.optimizer {
        Optimizer::NelderMead => {
            run_nelder_mead::<S>(spec, params_per_shape, initial_param, config)?
        }
        Optimizer::Lbfgs => {
            // L-BFGS with numerical gradients. On non-smooth losses
            // (`MaxAbsolute`, `SumAbsoute`, …) gradients are unreliable —
            // pick a `LossType::Smooth*` surrogate variant instead.
            let cost_function_lbfgs = DiagramCost::<S> {
                spec,
                loss_type: config.loss_type,
                params_per_shape,
                _shape: std::marker::PhantomData,
            };
            let line_search = MoreThuenteLineSearch::new();
            // Both tolerances at `config.tolerance`. Squaring the cost
            // tolerance (the obvious "be stricter on cost than gradient"
            // trick) backfires: with central-difference gradients the FD
            // noise floor on cost evals is ~`sqrt(EPSILON) × |cost|`, well
            // above any tolerance < ~1e-9. A too-tight cost tolerance means
            // the optimizer never declares "no progress" and grinds to
            // `max_iters` even when sitting at the optimum (issue #34).
            let solver = LBFGS::new(line_search, 10)
                .with_tolerance_grad(config.tolerance)?
                .with_tolerance_cost(config.tolerance)?;
            let result = Executor::new(cost_function_lbfgs, solver)
                .configure(|state| {
                    state
                        .param(initial_param.clone())
                        .max_iters(config.max_iterations as u64)
                })
                .run()?;
            (
                result.state().get_best_param().unwrap().clone(),
                result.state().get_cost(),
            )
        }
        Optimizer::LevenbergMarquardt => {
            run_lm_or_lbfgs::<S>(spec, params_per_shape, initial_param, config)?
        }
        Optimizer::CmaEsLm => {
            // Threshold-fired global escape:
            //   1. Run plain LM. If it lands at or below
            //      `config.cmaes_fallback_threshold`, return immediately —
            //      CMA-ES wouldn't beat that and would only burn ~1k–2k
            //      extra evals per restart.
            //   2. Otherwise LM is stuck in a wrong basin (issue92, eulerape,
            //      …). Run bounded CMA-ES → LM polish, and return the
            //      lower-loss of {LM, CMA-ES → LM polish}. Keeping the LM
            //      result around as a guard makes the path strictly
            //      non-regressing if CMA-ES happens to wander into a worse
            //      basin (observed empirically on `issue47_3_set_huge_triple`
            //      before the guard was in place).
            let lm_only = run_lm_or_lbfgs::<S>(spec, params_per_shape, initial_param, config)?;
            if lm_only.1 <= config.cmaes_fallback_threshold {
                lm_only
            } else {
                let (lower, upper, initial_std) =
                    cmaes_bounds_for(initial_param.as_slice(), params_per_shape);
                let cma_cost = DiagramCost::<S> {
                    spec,
                    loss_type: config.loss_type,
                    params_per_shape,
                    // CMA-ES is derivative-free, but evaluating the smooth
                    // surrogate keeps the cost landscape consistent with
                    // the LM polish step that follows.
                    _shape: std::marker::PhantomData,
                };
                let cma_config = crate::fitter::cmaes::CmaEsConfig {
                    lower,
                    upper,
                    initial_mean: initial_param.as_slice().to_vec(),
                    initial_std,
                    // 100 generations × default λ ≈ 1k–2k evals on hard
                    // 6-set ellipse fits. Empirically enough to escape the
                    // wrong-basin issue92_3_set_dropped_pair / eulerape_3_set
                    // cases without dominating wall time when the user has
                    // dialled n_restarts down.
                    max_iters: 100,
                    fn_tol: config.tolerance.max(1e-12),
                    seed: config.seed.wrapping_mul(0x9E37_79B9_7F4A_7C15),
                    lambda: None,
                    max_evals: None,
                };
                let cma_result = crate::fitter::cmaes::minimize(&cma_config, |x| {
                    let p = DVector::from_vec(x.to_vec());
                    match cma_cost.cost(&p) {
                        Ok(c) if c.is_finite() => c,
                        _ => f64::INFINITY,
                    }
                });
                let polish_init = DVector::from_vec(cma_result.best_x);
                // LM polish reuses the existing dispatch so we get the
                // analytical Jacobian for free.
                let cma_lm = run_lm_or_lbfgs::<S>(spec, params_per_shape, &polish_init, config)?;
                if cma_lm.1 < lm_only.1 {
                    cma_lm
                } else {
                    lm_only
                }
            }
        }
    };

    Ok((final_params, loss))
}

/// Run Levenberg-Marquardt for `SumSquared`, falling back to L-BFGS on other
/// smooth losses and to Nelder-Mead on non-smooth ones (`Max*`,
/// `SumAbsolute`, `DiagError`, `SumAbsoluteRegionError`). Non-smooth losses
/// produce zero or discontinuous gradients almost everywhere, which makes
/// L-BFGS thrash against the line search — see issue #45 and
/// [`LossType::is_smooth`].
///
/// Shared between [`Optimizer::LevenbergMarquardt`] and the polish step of
/// [`Optimizer::CmaEsLm`] so both go through the same numerical path.
///
/// [`LossType::is_smooth`]: crate::loss::LossType::is_smooth
fn run_lm_or_lbfgs<S: DiagramShape + Copy + 'static>(
    spec: &PreprocessedSpec,
    params_per_shape: usize,
    initial_param: &DVector<f64>,
    config: &FinalLayoutConfig,
) -> Result<(DVector<f64>, f64), Error> {
    match LmDiagramProblem::<S>::new(spec, params_per_shape, config.loss_type, initial_param) {
        Ok(problem) => {
            // `tolerance` targets the LM `ftol` knob exclusively. `xtol` and
            // `gtol` keep their own fixed `1e-6` defaults so loosening
            // `tolerance` only shortens the cost-converged tail (where
            // ~170× wall-time wins live on slow ellipse specs) without
            // trading off parameter-space or gradient precision. Override
            // either independently via `Fitter::xtol` / `Fitter::gtol`.
            let solver = LevenbergMarquardt::new()
                .with_ftol(config.ftol.unwrap_or(config.tolerance))
                .with_xtol(config.xtol.unwrap_or(1e-6))
                .with_gtol(config.gtol.unwrap_or(1e-6))
                .with_patience(config.max_iterations.max(1));
            let (problem_after, report) = solver.minimize(problem);
            // LM minimises ½·Σrᵢ²; existing loss is Σrᵢ², so ×2.
            let final_loss = report.objective_function * 2.0;
            Ok((problem_after.params.clone(), final_loss))
        }
        Err(_incompatible_loss) if !config.loss_type.is_smooth() => {
            // Non-smooth loss: gradient methods thrash here, so use
            // derivative-free Nelder-Mead instead. The user can switch
            // to a `LossType::Smooth*` variant to take the L-BFGS path
            // through a C¹ surrogate. See issue #45 and `LossType::is_smooth`.
            run_nelder_mead::<S>(spec, params_per_shape, initial_param, config)
        }
        Err(_incompatible_loss) => {
            // Smooth loss (including `Smooth*` surrogate variants). The
            // cost landscape is C¹, so L-BFGS with numerical gradients
            // applies.
            let cost_function_lbfgs = DiagramCost::<S> {
                spec,
                loss_type: config.loss_type,
                params_per_shape,
                _shape: std::marker::PhantomData,
            };
            let line_search = MoreThuenteLineSearch::new();
            let solver = LBFGS::new(line_search, 10)
                .with_tolerance_grad(config.tolerance)?
                .with_tolerance_cost(config.tolerance)?;
            let result = Executor::new(cost_function_lbfgs, solver)
                .configure(|state| {
                    state
                        .param(initial_param.clone())
                        .max_iters(config.max_iterations as u64)
                })
                .run()?;
            Ok((
                result.state().get_best_param().unwrap().clone(),
                result.state().get_cost(),
            ))
        }
    }
}

/// Run derivative-free Nelder-Mead from the given initial parameters.
///
/// Used directly by `Optimizer::NelderMead` and as the non-smooth-loss
/// fallback inside [`run_lm_or_lbfgs`]. Builds the initial simplex by
/// perturbing each parameter by 5% of its magnitude (with shape-aware
/// fallbacks for rotations and near-zero values).
fn run_nelder_mead<S: DiagramShape + Copy + 'static>(
    spec: &PreprocessedSpec,
    params_per_shape: usize,
    initial_param: &DVector<f64>,
    config: &FinalLayoutConfig,
) -> Result<(DVector<f64>, f64), Error> {
    let n_params = initial_param.len();
    let mut simplex = Vec::with_capacity(n_params + 1);
    simplex.push(initial_param.clone());
    for i in 0..n_params {
        let mut perturbed = initial_param.clone();
        let x0_i = initial_param[i];
        let param_idx = i % params_per_shape;
        let is_rotation = params_per_shape == 5 && param_idx == 4;

        let delta = if x0_i.abs() > 1e-10 {
            0.05 * x0_i.abs()
        } else if is_rotation {
            0.2 // ~11.5 degrees for rotation parameters
        } else {
            0.00025
        };
        perturbed[i] += delta;
        simplex.push(perturbed);
    }

    let cost_function = DiagramCost::<S> {
        spec,
        loss_type: config.loss_type,
        params_per_shape,
        // NM doesn't need smoothing for its own sake, but if the caller
        // opted into the surrogate we honour it for cost-landscape
        // consistency with other dispatch arms.
        _shape: std::marker::PhantomData,
    };
    let solver = NelderMead::new(simplex);
    let result = Executor::new(cost_function, solver)
        .configure(|state| state.max_iters(config.max_iterations as u64))
        .run()?;
    Ok((
        result.state().get_best_param().unwrap().clone(),
        result.state().get_cost(),
    ))
}

/// Build per-parameter (lower, upper, initial_std) triples for CMA-ES from
/// an MDS-initialised parameter vector.
///
/// Layout-dependent on the shape:
/// - Circle (`params_per_shape == 3`): `[x, y, r]` per shape. Positions
///   bounded to centroid ± `4 · max(span, max_radius)`; radii bounded to
///   `[1e-6 · max_radius, 5 · max_radius]`. Initial std on x/y is
///   `max(span, max_radius)/2`, on r it's `max_radius/2`.
/// - Ellipse (`params_per_shape == 5`): `[x, y, ln(a), ln(b), angle]`.
///   Same x/y bounds; the semi-axis envelope `[1e-6·max_radius,
///   5·max_radius]` is mapped into log space, so the bound width
///   (~ln(5e6) ≈ 15.4) and the std (~1.54) are scale-invariant. Angle is
///   unbounded with std `π/4` (periodic; hard caps would just force
///   CMA-ES against an artificial wall).
///
/// Other `params_per_shape` values fall back to wide unbounded bounds with
/// std proportional to the parameter magnitude — fine for the rest of the
/// algorithm to run, but new shape kinds should be added here so bounds
/// reflect their geometry.
fn cmaes_bounds_for(
    initial_param: &[f64],
    params_per_shape: usize,
) -> (Vec<f64>, Vec<f64>, Vec<f64>) {
    let n = initial_param.len();
    let n_shapes = n / params_per_shape.max(1);

    // Pull centroid + spread from the position dims (always indices 0, 1
    // within each shape's parameter block, by convention shared across
    // every `DiagramShape` implementation).
    let mut xs = Vec::with_capacity(n_shapes);
    let mut ys = Vec::with_capacity(n_shapes);
    let mut radii = Vec::with_capacity(n_shapes);
    for k in 0..n_shapes {
        let base = k * params_per_shape;
        xs.push(initial_param[base]);
        ys.push(initial_param[base + 1]);
        match params_per_shape {
            3 => radii.push(initial_param[base + 2]),
            5 => {
                // Ellipse semi-axes live in log space (`u = ln(a)`,
                // `v = ln(b)`); exponentiate to get the linear
                // semi-axis magnitudes used for position scale.
                radii.push(initial_param[base + 2].exp());
                radii.push(initial_param[base + 3].exp());
            }
            _ => {}
        }
    }
    let mean_x: f64 = xs.iter().sum::<f64>() / xs.len().max(1) as f64;
    let mean_y: f64 = ys.iter().sum::<f64>() / ys.len().max(1) as f64;
    let max_radius: f64 = radii.iter().cloned().fold(0.0_f64, f64::max).max(1e-6);
    let span_x: f64 = xs
        .iter()
        .map(|x| (x - mean_x).abs())
        .fold(0.0_f64, f64::max);
    let span_y: f64 = ys
        .iter()
        .map(|y| (y - mean_y).abs())
        .fold(0.0_f64, f64::max);
    let pos_scale: f64 = span_x.max(span_y).max(max_radius);
    // Margin: 4× span gives CMA-ES room to globally rearrange shapes
    // without bumping into the bounds before it has a chance to refine.
    let pos_margin: f64 = 4.0 * pos_scale;

    let mut lower = Vec::with_capacity(n);
    let mut upper = Vec::with_capacity(n);
    let mut std_dev = Vec::with_capacity(n);
    for k in 0..n_shapes {
        // x
        lower.push(mean_x - pos_margin);
        upper.push(mean_x + pos_margin);
        std_dev.push((pos_scale * 0.5).max(1e-6));
        // y
        lower.push(mean_y - pos_margin);
        upper.push(mean_y + pos_margin);
        std_dev.push((pos_scale * 0.5).max(1e-6));
        // shape-specific dims
        match params_per_shape {
            3 => {
                // r
                lower.push(1e-6 * max_radius);
                upper.push(5.0 * max_radius);
                std_dev.push((max_radius * 0.5).max(1e-6));
            }
            5 => {
                // u = ln(a), v = ln(b). Bounds map the linear interval
                // `[1e-6·max_radius, 5·max_radius]` (same envelope as
                // before) into log space; the width is constant in log
                // space (~15.4), so the std is independent of scale.
                let u_lower = (1e-6 * max_radius).ln();
                let u_upper = (5.0 * max_radius).ln();
                let log_std = (u_upper - u_lower) * 0.1; // ~1.54
                                                         // u = ln(semi-major)
                lower.push(u_lower);
                upper.push(u_upper);
                std_dev.push(log_std);
                // v = ln(semi-minor)
                lower.push(u_lower);
                upper.push(u_upper);
                std_dev.push(log_std);
                // angle (periodic; no hard caps)
                lower.push(f64::NEG_INFINITY);
                upper.push(f64::INFINITY);
                std_dev.push(std::f64::consts::FRAC_PI_4);
            }
            other => {
                // Unknown shape kind: leave the trailing dims unbounded and
                // pick a generic std from the parameter magnitude. New shape
                // kinds should add an explicit branch above.
                let extra = other.saturating_sub(2);
                let _ = (k, extra); // keep `k` live for future per-shape logic
                for j in 2..other {
                    let v = initial_param[k * other + j];
                    lower.push(f64::NEG_INFINITY);
                    upper.push(f64::INFINITY);
                    std_dev.push(v.abs().max(0.5));
                }
            }
        }
    }
    (lower, upper, std_dev)
}

/// Cost function for region error optimization.
///
/// Computes the discrepancy between target exclusive areas and actual fitted areas.
struct DiagramCost<'a, S: DiagramShape + Copy + 'static> {
    spec: &'a PreprocessedSpec,
    loss_type: crate::loss::LossType,
    params_per_shape: usize,
    _shape: std::marker::PhantomData<S>,
}

impl<S: DiagramShape + Copy + 'static> DiagramCost<'_, S> {
    /// Extract shapes from parameter vector.
    fn params_to_shapes(&self, params: &DVector<f64>) -> Vec<S> {
        let n_sets = self.spec.n_sets;

        (0..n_sets)
            .map(|i| {
                let start = i * self.params_per_shape;
                let end = start + self.params_per_shape;
                S::from_optimizer_params(&params.as_slice()[start..end])
            })
            .collect()
    }
}

impl<S: DiagramShape + Copy + 'static> CostFunction for DiagramCost<'_, S> {
    type Param = DVector<f64>;
    type Output = f64;

    fn cost(&self, param: &Self::Param) -> Result<Self::Output, Error> {
        let shapes = self.params_to_shapes(param);

        // Compute exclusive regions using shape-specific exact computation
        let exclusive_areas = S::compute_exclusive_regions(&shapes);

        // `LossType::compute` evaluates the right thing for both true and
        // smooth-surrogate variants — smoothing is now expressed by picking
        // a `Smooth*` variant rather than via a runtime flag.
        let error = self
            .loss_type
            .compute(&exclusive_areas, &self.spec.exclusive_areas);

        Ok(error)
    }
}

impl<S: DiagramShape + Copy + 'static> Gradient for DiagramCost<'_, S> {
    type Param = DVector<f64>;
    type Gradient = DVector<f64>;

    fn gradient(&self, param: &Self::Param) -> Result<Self::Gradient, Error> {
        // Try the analytical path: requires both the shape (region geometry)
        // and the loss to provide analytical gradients. Falls back to central
        // finite differences when either piece isn't available.
        let shapes = self.params_to_shapes(param);
        if let Some((fitted, fitted_grads)) = S::compute_exclusive_regions_with_gradient(&shapes) {
            if let Some((_loss, loss_grad)) = self
                .loss_type
                .compute_with_gradient(&fitted, &self.spec.exclusive_areas)
            {
                let n_params = param.len();
                let mut grad = vec![0.0; n_params];
                for (mask, &dl_df) in loss_grad.iter() {
                    if let Some(df_dtheta) = fitted_grads.get(mask) {
                        for (k, item) in grad.iter_mut().enumerate().take(n_params) {
                            *item += dl_df * df_dtheta[k];
                        }
                    }
                }
                return Ok(DVector::from_vec(grad));
            }
        }

        // Fallback: central finite differences.
        let param_vec = param.as_slice().to_vec();
        let f = |x: &Vec<f64>| {
            let p = DVector::from_vec(x.to_vec());
            Ok(self.cost(&p).unwrap_or(f64::INFINITY))
        };
        let g_central = vec::central_diff(&f);
        let grad_vec = g_central(&param_vec)?;
        Ok(DVector::from_vec(grad_vec))
    }
}

impl<S: DiagramShape + Copy + 'static> Hessian for DiagramCost<'_, S> {
    type Param = DVector<f64>;
    type Hessian = Vec<Vec<f64>>;

    fn hessian(&self, param: &Self::Param) -> Result<Self::Hessian, Error> {
        // Use central finite differences for numerical Hessian
        // central_hessian expects a gradient function and returns a closure
        let param_vec = param.as_slice().to_vec();

        let grad_fn = |x: &Vec<f64>| -> Result<Vec<f64>, argmin::core::Error> {
            let p = DVector::from_vec(x.to_vec());
            let grad = self.gradient(&p)?;
            Ok(grad.as_slice().to_vec())
        };

        let h_central = finitediff::vec::central_hessian(&grad_fn);
        let hessian = h_central(&param_vec)?;
        Ok(hessian)
    }
}

/// Levenberg-Marquardt least-squares adapter around the diagram cost.
///
/// Decomposes the existing `Σ(fitted_mask − target_mask)²` loss into per-region
/// residuals so LM can build `JᵀJ` directly from the analytical
/// region-area gradients exposed by
/// [`DiagramShape::compute_exclusive_regions_with_gradient`].
///
/// Residual ordering is fixed at construction to all non-empty subset masks
/// `1..(1 << n_sets)`. Masks not produced by the geometry have implicit area 0
/// (zero residual contribution, zero Jacobian row), matching the existing
/// loss exactly.
///
/// Supported loss: [`LossType::SumSquared`]. Other losses are rejected at
/// construction with an [`Error`].
struct LmDiagramProblem<'a, S: DiagramShape + Copy + 'static> {
    spec: &'a PreprocessedSpec,
    params_per_shape: usize,
    /// Fixed canonical residual ordering: all non-empty subset masks.
    masks: Vec<RegionMask>,
    /// `1/sqrt(Σ tᵢ²)`. Stored residuals get multiplied by this so
    /// `Σ rᵢ²` equals the configured loss `Σ(f-t)² / Σt²`.
    norm_factor: f64,
    /// Current parameter vector. Owned so the trait's `params(&self)` can
    /// return a clone.
    params: DVector<f64>,
    /// Lazy cache of `(fitted, fitted_grads)` keyed by the most recent
    /// `set_params` call. Cleared on every `set_params`.
    cache: RefCell<Option<DiagramCache>>,
    _shape: std::marker::PhantomData<S>,
}

struct DiagramCache {
    fitted: HashMap<RegionMask, f64>,
    fitted_grads: HashMap<RegionMask, Vec<f64>>,
}

impl<'a, S: DiagramShape + Copy + 'static> LmDiagramProblem<'a, S> {
    fn new(
        spec: &'a PreprocessedSpec,
        params_per_shape: usize,
        loss_type: LossType,
        initial_param: &DVector<f64>,
    ) -> Result<Self, Error> {
        let norm_factor = match loss_type {
            LossType::SumSquared => {
                let sum_t2: f64 = spec.exclusive_areas.values().map(|&v| v * v).sum();
                if sum_t2 < 1e-20 {
                    return Err(Error::msg(
                        "Levenberg-Marquardt: target area norm is zero, cannot normalise",
                    ));
                }
                1.0 / sum_t2.sqrt()
            }
            other => {
                return Err(Error::msg(format!(
                    "Levenberg-Marquardt only supports SumSquared loss, got {other:?}"
                )));
            }
        };
        // Enumerate every non-empty subset mask once; residual ordering is
        // fixed for the whole solve so the Jacobian shape stays constant
        // (LM rejects residual-count changes).
        let n_sets = spec.n_sets;
        let masks: Vec<RegionMask> = (1..(1usize << n_sets)).collect();
        Ok(Self {
            spec,
            params_per_shape,
            masks,
            norm_factor,
            params: initial_param.clone(),
            cache: RefCell::new(None),
            _shape: std::marker::PhantomData,
        })
    }

    fn ensure_cache(&self) {
        let mut slot = self.cache.borrow_mut();
        if slot.is_some() {
            return;
        }
        let n_sets = self.spec.n_sets;
        let shapes: Vec<S> = (0..n_sets)
            .map(|i| {
                let start = i * self.params_per_shape;
                let end = start + self.params_per_shape;
                S::from_optimizer_params(&self.params.as_slice()[start..end])
            })
            .collect();
        let (fitted, fitted_grads) = match S::compute_exclusive_regions_with_gradient(&shapes) {
            Some(pair) => pair,
            None => {
                // Should not happen for shapes that wire LM in (Circle/Ellipse
                // both implement the gradient path). Fall back to areas-only
                // with empty gradient map; the Jacobian becomes all-zeros
                // and LM will terminate immediately on its orthogonality
                // check, surfacing the issue.
                let fitted = S::compute_exclusive_regions(&shapes);
                (fitted, HashMap::new())
            }
        };
        *slot = Some(DiagramCache {
            fitted,
            fitted_grads,
        });
    }
}

impl<S: DiagramShape + Copy + 'static> LeastSquaresProblem<f64, Dyn, Dyn>
    for LmDiagramProblem<'_, S>
{
    type ResidualStorage = Owned<f64, Dyn>;
    type JacobianStorage = Owned<f64, Dyn, Dyn>;
    type ParameterStorage = Owned<f64, Dyn>;

    fn set_params(&mut self, x: &DVector<f64>) {
        self.params.copy_from(x);
        self.cache.borrow_mut().take();
    }

    fn params(&self) -> DVector<f64> {
        self.params.clone()
    }

    fn residuals(&self) -> Option<DVector<f64>> {
        self.ensure_cache();
        let cache = self.cache.borrow();
        let cache = cache.as_ref().expect("cache populated by ensure_cache");
        let mut residuals = DVector::zeros(self.masks.len());
        for (i, &mask) in self.masks.iter().enumerate() {
            let f = cache.fitted.get(&mask).copied().unwrap_or(0.0);
            let t = self.spec.exclusive_areas.get(&mask).copied().unwrap_or(0.0);
            residuals[i] = self.norm_factor * (f - t);
        }
        Some(residuals)
    }

    fn jacobian(&self) -> Option<DMatrix<f64>> {
        self.ensure_cache();
        let cache = self.cache.borrow();
        let cache = cache.as_ref().expect("cache populated by ensure_cache");
        let n_params = self.params.len();
        let mut jacobian = DMatrix::zeros(self.masks.len(), n_params);
        for (i, &mask) in self.masks.iter().enumerate() {
            if let Some(grad) = cache.fitted_grads.get(&mask) {
                for (j, &g) in grad.iter().enumerate().take(n_params) {
                    jacobian[(i, j)] = self.norm_factor * g;
                }
            }
        }
        Some(jacobian)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::geometry::diagram;
    use crate::geometry::primitives::Point;
    use crate::geometry::shapes::Circle;
    use crate::spec::{DiagramSpec, DiagramSpecBuilder};

    /// Test helper utilities for final layout testing
    mod helpers {
        use super::*;
        use crate::Combination;
        use std::collections::HashMap;

        /// Generate a random diagram specification with the given number of sets.
        ///
        /// This creates random circles, computes their overlaps, and returns
        /// a DiagramSpec that can be used for testing the fitter.
        ///
        /// Returns: (spec, original_circles) for validation
        pub fn generate_random_diagram(n_sets: usize, seed: u64) -> (DiagramSpec, Vec<Circle>) {
            let (circles, set_names) = random_circle_layout(n_sets, seed);
            let exclusive_areas =
                diagram::compute_exclusive_areas_from_layout(&circles, &set_names);
            let spec = create_spec_from_exclusive(exclusive_areas);
            (spec, circles)
        }

        /// Generate a random circle layout for testing
        pub fn random_circle_layout(n_sets: usize, seed: u64) -> (Vec<Circle>, Vec<String>) {
            use rand::Rng;
            use rand::SeedableRng;

            let mut rng = rand::rngs::StdRng::seed_from_u64(seed);

            let set_names: Vec<String> = (0..n_sets).map(|i| format!("Set{}", i)).collect();

            let mut circles = Vec::new();

            for _ in 0..n_sets {
                // Random position in [-5, 5] x [-5, 5]
                let x = rng.random_range(-5.0..5.0);
                let y = rng.random_range(-5.0..5.0);
                // Random radius in [0.5, 2.0]
                let r = rng.random_range(0.5..2.0);

                circles.push(Circle::new(Point::new(x, y), r));
            }

            (circles, set_names)
        }

        /// Create a DiagramSpec from exclusive areas
        pub fn create_spec_from_exclusive(
            exclusive_areas: HashMap<Combination, f64>,
        ) -> DiagramSpec {
            let mut builder = DiagramSpecBuilder::new();

            // Add single sets in sorted order so set-to-bit index assignment is
            // deterministic regardless of HashMap iteration order.
            let mut singles: Vec<_> = exclusive_areas
                .iter()
                .filter(|(combo, _)| combo.sets().len() == 1)
                .collect();
            singles.sort_by_key(|(combo, _)| combo.sets()[0].clone());
            for (combo, &area) in &singles {
                builder = builder.set(combo.sets()[0].as_str(), area);
            }

            // Add all intersections (order doesn't affect bit assignment)
            for (combo, &area) in &exclusive_areas {
                if combo.sets().len() > 1 {
                    let sets: Vec<&str> = combo.sets().iter().map(|s| s.as_str()).collect();
                    builder = builder.intersection(&sets, area);
                }
            }

            builder.build().unwrap()
        }
    }

    // ========== Basic Functionality Tests ==========

    #[test]
    fn test_optimize_layout_simple() {
        let spec = DiagramSpecBuilder::new()
            .set("A", 3.0)
            .set("B", 4.0)
            .intersection(&["A", "B"], 1.0)
            .build()
            .unwrap();

        let preprocessed = spec.preprocess().unwrap();

        // Start with circles that are too far apart
        let positions = vec![0.0, 0.0, 5.0, 0.0];
        let radii = vec![
            (3.0 / std::f64::consts::PI).sqrt(),
            (4.0 / std::f64::consts::PI).sqrt(),
        ];

        let config = FinalLayoutConfig {
            optimizer: Optimizer::NelderMead,
            loss_type: crate::loss::LossType::sse(),
            max_iterations: 50,
            tolerance: 1e-4,
            seed: 0,
            n_restarts: 1,
            cmaes_fallback_threshold: 1e-3,
            xtol: None,
            ftol: None,
            gtol: None,
        };

        let result = optimize_layout::<Circle>(&preprocessed, &positions, &radii, config);
        assert!(result.is_ok());

        let (final_params, loss) = result.unwrap();
        // For circles, params are [x0, y0, r0, x1, y1, r1, ...]
        assert_eq!(final_params.len(), 2 * Circle::n_params()); // 2 circles * 3 params each
        assert!(loss >= 0.0);
    }

    #[test]
    fn test_cost_function_computes() {
        let spec = DiagramSpecBuilder::new()
            .set("A", 5.0)
            .set("B", 5.0)
            .intersection(&["A", "B"], 2.0)
            .build()
            .unwrap();

        let preprocessed = spec.preprocess().unwrap();

        let cost_fn = DiagramCost::<Circle> {
            spec: &preprocessed,
            loss_type: crate::loss::LossType::sse(),
            params_per_shape: Circle::n_params(),
            _shape: std::marker::PhantomData,
        };

        // Create parameter vector
        let positions = vec![0.0, 0.0, 2.0, 0.0];
        let radii = vec![
            (5.0 / std::f64::consts::PI).sqrt(),
            (5.0 / std::f64::consts::PI).sqrt(),
        ];

        let mut params = positions.clone();
        params.extend_from_slice(&radii);
        let param_vec = DVector::from_vec(params);

        let result = cost_fn.cost(&param_vec);
        assert!(result.is_ok(), "Cost function should compute successfully");

        let error = result.unwrap();
        assert!(error >= 0.0, "Error should be non-negative");
    }

    // ========== Layout Reproduction Tests ==========

    #[test]
    fn test_reproduce_simple_two_circle_layout() {
        use helpers::*;

        // Create a simple known layout
        let c1 = Circle::new(Point::new(0.0, 0.0), 1.0);
        let c2 = Circle::new(Point::new(1.5, 0.0), 1.0);
        let circles = vec![c1, c2];
        let set_names = vec!["A".to_string(), "B".to_string()];

        // Compute exclusive areas from this layout
        let exclusive = diagram::compute_exclusive_areas_from_layout(&circles, &set_names);

        // Create spec from these areas
        let spec = create_spec_from_exclusive(exclusive);
        let preprocessed = spec.preprocess().unwrap();

        // Try to fit with initial guess close to original
        let positions = vec![0.0, 0.0, 1.5, 0.0];
        let radii = vec![1.0, 1.0];

        let config = FinalLayoutConfig {
            optimizer: Optimizer::NelderMead,
            loss_type: crate::loss::LossType::sse(),
            max_iterations: 100,
            tolerance: 1e-6,
            seed: 0,
            n_restarts: 1,
            cmaes_fallback_threshold: 1e-3,
            xtol: None,
            ftol: None,
            gtol: None,
        };

        let result = optimize_layout::<Circle>(&preprocessed, &positions, &radii, config);
        assert!(result.is_ok());

        let (_, loss) = result.unwrap();

        // Should be able to reproduce the layout with very low error
        assert!(
            loss < 1e-2,
            "Should reproduce layout with low error, got: {}",
            loss
        );
    }

    #[test]
    fn test_reproduce_random_three_circle_layout() {
        use helpers::*;

        // Generate random layout
        let (circles, set_names) = random_circle_layout(3, 42);

        // Compute exclusive areas
        let exclusive_areas = diagram::compute_exclusive_areas_from_layout(&circles, &set_names);

        // Create spec
        let spec = create_spec_from_exclusive(exclusive_areas);
        let preprocessed = spec.preprocess().unwrap();

        // Extract initial positions and radii
        let mut positions = Vec::new();
        let mut radii = Vec::new();
        for c in &circles {
            positions.push(c.center().x());
            positions.push(c.center().y());
            radii.push(c.radius());
        }

        let config = FinalLayoutConfig {
            optimizer: Optimizer::NelderMead,
            loss_type: crate::loss::LossType::sse(),
            max_iterations: 200,
            tolerance: 1e-6,
            seed: 0,
            n_restarts: 1,
            cmaes_fallback_threshold: 1e-3,
            xtol: None,
            ftol: None,
            gtol: None,
        };

        let result = optimize_layout::<Circle>(&preprocessed, &positions, &radii, config);
        assert!(result.is_ok());

        let (_final_pos, loss) = result.unwrap();

        // Should be able to reproduce with reasonable error
        // Note: 3-way intersections are harder, and optimizer may converge to local minima
        // Relaxed tolerance to account for optimization challenges
        assert!(
            loss < 5.0,
            "Should reproduce random layout reasonably, got: {}",
            loss
        );
    }

    #[test]
    fn test_reproduce_multiple_random_diagrams() {
        use helpers::*;

        // Test multiple random configurations. Seeds picked so Nelder-Mead
        // converges within the per-arity tolerances on all supported
        // platforms; seed 500 for n=4 was swapped after observed divergence on
        // macOS CI (loss ~57 vs. tolerance 10). If you bump seeds, re-run on
        // every CI target before committing.
        let test_configs = [
            (2, 100), // 2 circles
            (2, 200), // 2 circles
            (3, 300), // 3 circles
            (3, 400), // 3 circles
            (4, 501), // 4 circles
        ];

        let mut results = Vec::new();

        for (i, &(n_sets, seed)) in test_configs.iter().enumerate() {
            // Generate random diagram
            let (spec, original_circles) = generate_random_diagram(n_sets, seed);
            let preprocessed = spec.preprocess().unwrap();

            // Extract initial positions and radii from original circles
            let mut positions = Vec::new();
            let mut radii = Vec::new();
            for c in &original_circles {
                positions.push(c.center().x());
                positions.push(c.center().y());
                radii.push(c.radius());
            }

            let config = FinalLayoutConfig {
                optimizer: Optimizer::NelderMead,
                loss_type: crate::loss::LossType::sse(),
                max_iterations: 200,
                tolerance: 1e-6,
                seed: 0,
                n_restarts: 1,
                cmaes_fallback_threshold: 1e-3,
                xtol: None,
                ftol: None,
                gtol: None,
            };

            let result = optimize_layout::<Circle>(&preprocessed, &positions, &radii, config);
            assert!(result.is_ok(), "Optimization failed for config {}", i);

            let (_, loss) = result.unwrap();

            results.push((n_sets, seed, loss));
        }

        // Relaxed tolerances - the algorithm should do reasonably well
        // but we're starting from the exact solution, so it should converge.
        // On failure, report the exact (n_sets, seed, loss) so CI breakage is
        // diagnosable without having to re-run locally.
        for &(n_sets, seed, loss) in &results {
            let tol: f64 = match n_sets {
                2 => 2.0,  // 2-way: should be very good
                3 => 5.0,  // 3-way: harder but doable
                4 => 10.0, // 4-way: quite difficult
                _ => 20.0, // Higher order: very challenging
            };
            assert!(
                loss < tol,
                "configuration n_sets={}, seed={} produced loss={:.6}, \
                 exceeds tolerance {:.1} — optimizer may have regressed, \
                 or the seed lands in a bad basin on this platform",
                n_sets,
                seed,
                loss,
                tol
            );
        }
    }

    /// Compare the analytical gradient produced by `DiagramCost::gradient` to
    /// the central-difference gradient of the cost function. Asserts a relative
    /// error below `tol`. This is the primary correctness check for the
    /// analytical-gradient path: equality (up to FD noise) on smooth interior
    /// configurations.
    fn assert_analytic_matches_fd<F>(
        spec: &PreprocessedSpec,
        params: &[f64],
        loss_type: crate::loss::LossType,
        h: f64,
        tol: f64,
        label: F,
    ) where
        F: FnOnce() -> String,
    {
        let cost = DiagramCost::<Circle> {
            spec,
            loss_type,
            params_per_shape: Circle::n_params(),
            _shape: std::marker::PhantomData,
        };
        let p = DVector::from_vec(params.to_vec());

        let analytic = cost.gradient(&p).expect("analytic gradient");

        // Central differences with explicit step size.
        let n = params.len();
        let mut fd = vec![0.0; n];
        for i in 0..n {
            let mut plus = params.to_vec();
            let mut minus = params.to_vec();
            plus[i] += h;
            minus[i] -= h;
            let f_plus = cost.cost(&DVector::from_vec(plus)).expect("cost +");
            let f_minus = cost.cost(&DVector::from_vec(minus)).expect("cost -");
            fd[i] = (f_plus - f_minus) / (2.0 * h);
        }

        let analytic_slice: Vec<f64> = analytic.as_slice().to_vec();
        let diff_norm: f64 = analytic_slice
            .iter()
            .zip(fd.iter())
            .map(|(a, b)| (a - b).powi(2))
            .sum::<f64>()
            .sqrt();
        let fd_norm: f64 = fd.iter().map(|b| b * b).sum::<f64>().sqrt();
        let rel = if fd_norm > 1e-12 {
            diff_norm / fd_norm
        } else {
            diff_norm
        };
        assert!(
            rel < tol,
            "{}: analytic vs FD gradient mismatch (rel={:.3e}, |fd|={:.3e})\n  analytic={:?}\n  fd      ={:?}",
            label(),
            rel,
            fd_norm,
            analytic_slice,
            fd
        );
    }

    /// Build a PreprocessedSpec from a known circle layout — the target areas
    /// are simply the layout's own exclusive areas, so the gradient at the
    /// layout's own params is well-defined and the loss at that point is zero.
    /// Perturbing the params lets us probe the gradient on the smooth interior
    /// without sitting at the optimum.
    fn spec_from_circles(circles: &[Circle]) -> PreprocessedSpec {
        use helpers::create_spec_from_exclusive;
        let names: Vec<String> = (0..circles.len()).map(|i| format!("S{}", i)).collect();
        let exclusive = diagram::compute_exclusive_areas_from_layout(circles, &names);
        create_spec_from_exclusive(exclusive).preprocess().unwrap()
    }

    fn flat_params(circles: &[Circle]) -> Vec<f64> {
        let mut v = Vec::with_capacity(3 * circles.len());
        for c in circles {
            v.push(c.center().x());
            v.push(c.center().y());
            v.push(c.radius());
        }
        v
    }

    #[test]
    fn analytic_gradient_two_circle_overlap() {
        // Classic 2-circle lens: this is the case I derived analytically.
        let circles = vec![
            Circle::new(Point::new(0.0, 0.0), 1.0),
            Circle::new(Point::new(1.0, 0.0), 1.0),
        ];
        let spec = spec_from_circles(&circles);
        // Probe at a perturbed config so the loss isn't exactly zero
        // (gradient at L=0 minimum is also zero, which is uninformative).
        let params = vec![0.0, 0.0, 1.0, 1.2, 0.0, 0.95];
        assert_analytic_matches_fd(
            &spec,
            &params,
            crate::loss::LossType::SumSquared,
            1e-6,
            1e-4,
            || "two_circle_overlap SumSquared".to_string(),
        );
    }

    #[test]
    fn analytic_gradient_three_circle_venn() {
        // Equilateral 3-Venn — exercises the polygon-boundary path.
        let circles = vec![
            Circle::new(Point::new(0.0, 0.0), 1.0),
            Circle::new(Point::new(1.0, 0.0), 1.0),
            Circle::new(Point::new(0.5, 0.866), 1.0),
        ];
        let spec = spec_from_circles(&circles);
        let mut params = flat_params(&circles);
        // perturb to leave the optimum and avoid trivial zero gradient.
        params[0] += 0.07;
        params[4] -= 0.05;
        params[8] += 0.03;
        assert_analytic_matches_fd(
            &spec,
            &params,
            crate::loss::LossType::SumSquared,
            1e-6,
            1e-3,
            || "three_circle_venn SumSquared".to_string(),
        );
    }

    #[test]
    fn analytic_gradient_disjoint() {
        // All-disjoint: exclusive areas are just the singletons; no
        // intersection-region gradients. Probes the n=1 boundary case.
        let circles = vec![
            Circle::new(Point::new(0.0, 0.0), 1.0),
            Circle::new(Point::new(5.0, 0.0), 1.2),
            Circle::new(Point::new(2.5, 5.0), 0.8),
        ];
        let spec = spec_from_circles(&circles);
        let params = flat_params(&circles);
        // Perturb radii (positions don't move the loss when shapes stay
        // disjoint, so the gradient on x/y is zero — trivially matches).
        let mut perturbed = params.clone();
        perturbed[2] += 0.1;
        perturbed[5] -= 0.05;
        perturbed[8] += 0.07;
        assert_analytic_matches_fd(
            &spec,
            &perturbed,
            crate::loss::LossType::SumSquared,
            1e-6,
            1e-4,
            || "disjoint".to_string(),
        );
    }

    #[test]
    fn analytic_gradient_nested() {
        // Containment: small circle entirely inside a larger one.
        let circles = vec![
            Circle::new(Point::new(0.0, 0.0), 3.0),
            Circle::new(Point::new(0.5, 0.2), 0.8),
        ];
        let spec = spec_from_circles(&circles);
        let mut params = flat_params(&circles);
        // Perturb the inner circle.
        params[3] += 0.1;
        params[5] += 0.05;
        assert_analytic_matches_fd(
            &spec,
            &params,
            crate::loss::LossType::SumSquared,
            1e-6,
            1e-4,
            || "nested".to_string(),
        );
    }

    /// Same idea as `benchmark_gradient_call_only` but for ellipses.
    #[test]
    #[ignore]
    fn benchmark_gradient_call_only_ellipse() {
        use crate::geometry::shapes::Ellipse;
        use crate::loss::LossType;
        use rand::{Rng, SeedableRng};
        use std::time::Instant;
        let cases: &[(usize, u64, usize)] = &[(3, 13, 500), (5, 100, 200), (8, 200, 100)];
        for &(n, seed, iters) in cases {
            let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
            let ellipses: Vec<Ellipse> = (0..n)
                .map(|_| {
                    Ellipse::new(
                        Point::new(rng.random_range(-2.5..2.5), rng.random_range(-2.5..2.5)),
                        rng.random_range(0.7..1.6),
                        rng.random_range(0.5..1.0),
                        rng.random_range(-0.5..0.5),
                    )
                })
                .collect();
            let spec = spec_from_ellipses(&ellipses);
            let mut params = flat_params_ellipse(&ellipses);
            for v in &mut params {
                *v += rng.random_range(-0.04..0.04);
            }
            let p = DVector::from_vec(params.clone());
            let cost = DiagramCost::<Ellipse> {
                spec: &spec,
                loss_type: LossType::SumSquared,
                params_per_shape: Ellipse::n_params(),
                _shape: std::marker::PhantomData,
            };
            let t0 = Instant::now();
            for _ in 0..iters {
                let _ = cost.gradient(&p).unwrap();
            }
            let dt_an = t0.elapsed();
            let t0 = Instant::now();
            for _ in 0..iters {
                let pv = p.as_slice().to_vec();
                let f = |x: &Vec<f64>| {
                    let pp = DVector::from_vec(x.to_vec());
                    Ok(cost.cost(&pp).unwrap_or(f64::INFINITY))
                };
                let g = vec::central_diff(&f);
                let _ = g(&pv).unwrap();
            }
            let dt_fd = t0.elapsed();
            eprintln!(
                "ellipse n={} ({} grads): analytic={:>7.2}ms ({:>5.1}us/call) | FD={:>7.2}ms ({:>5.1}us/call) | speedup={:.1}x",
                n,
                iters,
                dt_an.as_secs_f64() * 1000.0,
                dt_an.as_secs_f64() * 1e6 / iters as f64,
                dt_fd.as_secs_f64() * 1000.0,
                dt_fd.as_secs_f64() * 1e6 / iters as f64,
                dt_fd.as_secs_f64() / dt_an.as_secs_f64(),
            );
        }
    }

    /// Apples-to-apples: time the gradient function alone, with and without
    /// the analytical path, on the same loss. The FD comparison comes from
    /// running the same `DiagramCost::gradient` body but skipping the
    /// analytic short-circuit by using `finitediff::vec::central_diff`
    /// directly — what the old implementation did.
    #[test]
    #[ignore]
    fn benchmark_gradient_call_only() {
        use crate::loss::LossType;
        use rand::{Rng, SeedableRng};
        use std::time::Instant;

        let cases: &[(usize, u64, usize)] = &[(3, 11, 1000), (5, 33, 500), (8, 55, 200)];
        for &(n, seed, iters) in cases {
            let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
            let circles: Vec<Circle> = (0..n)
                .map(|_| {
                    Circle::new(
                        Point::new(rng.random_range(-3.0..3.0), rng.random_range(-3.0..3.0)),
                        rng.random_range(0.6..1.8),
                    )
                })
                .collect();
            let spec = spec_from_circles(&circles);
            let mut params = flat_params(&circles);
            for v in &mut params {
                *v += rng.random_range(-0.05..0.05);
            }
            let p = DVector::from_vec(params.clone());

            let cost = DiagramCost::<Circle> {
                spec: &spec,
                loss_type: LossType::SumSquared,
                params_per_shape: Circle::n_params(),
                _shape: std::marker::PhantomData,
            };

            // Analytical (current default for SumSquared + Circle).
            let t0 = Instant::now();
            for _ in 0..iters {
                let _ = cost.gradient(&p).unwrap();
            }
            let dt_an = t0.elapsed();

            // FD baseline: invoke the central_diff path directly (same as the
            // pre-analytic implementation).
            let cost_fd = || -> f64 {
                let dvec = p.clone();
                cost.cost(&dvec).unwrap()
            };
            let _warm = cost_fd();
            let t0 = Instant::now();
            for _ in 0..iters {
                let pv = p.as_slice().to_vec();
                let f = |x: &Vec<f64>| {
                    let pp = DVector::from_vec(x.to_vec());
                    Ok(cost.cost(&pp).unwrap_or(f64::INFINITY))
                };
                let g = vec::central_diff(&f);
                let _ = g(&pv).unwrap();
            }
            let dt_fd = t0.elapsed();

            eprintln!(
                "n={} ({} grads): analytic={:>7.2}ms ({:>5.1}us/call) | FD={:>7.2}ms ({:>5.1}us/call) | speedup={:.1}x",
                n,
                iters,
                dt_an.as_secs_f64() * 1000.0,
                dt_an.as_secs_f64() * 1e6 / iters as f64,
                dt_fd.as_secs_f64() * 1000.0,
                dt_fd.as_secs_f64() * 1e6 / iters as f64,
                dt_fd.as_secs_f64() / dt_an.as_secs_f64(),
            );
        }
    }

    /// Build an ellipse-only spec from a known layout, parallel to
    /// `spec_from_circles`.
    fn spec_from_ellipses(ellipses: &[crate::geometry::shapes::Ellipse]) -> PreprocessedSpec {
        use crate::geometry::traits::Area;
        use crate::Combination;
        use std::collections::HashMap;
        // Use the new boundary-based area for both target generation and
        // the cost-function side, so the FD baseline and analytical gradient
        // are consistent at the smooth interior.
        let exclusive =
            crate::geometry::shapes::ellipse::Ellipse::compute_exclusive_regions(ellipses);
        // Convert mask->area map into Combination-keyed map.
        let names: Vec<String> = (0..ellipses.len()).map(|i| format!("S{}", i)).collect();
        let mut combos: HashMap<Combination, f64> = HashMap::new();
        for (mask, area) in exclusive {
            if area > 0.0 {
                let sets: Vec<&str> = (0..ellipses.len())
                    .filter(|&i| (mask & (1 << i)) != 0)
                    .map(|i| names[i].as_str())
                    .collect();
                if !sets.is_empty() {
                    combos.insert(Combination::new(&sets), area);
                }
            }
        }
        // Even if some areas were 0, ensure singletons are present in the
        // builder so set indices line up.
        let single_areas: HashMap<usize, f64> = (0..ellipses.len())
            .map(|i| (i, ellipses[i].area()))
            .collect();
        let mut builder = crate::spec::DiagramSpecBuilder::new();
        for i in 0..ellipses.len() {
            let area = combos
                .get(&Combination::new(&[names[i].as_str()]))
                .copied()
                .unwrap_or(single_areas[&i]);
            builder = builder.set(names[i].as_str(), area);
        }
        for (combo, area) in &combos {
            if combo.sets().len() > 1 {
                let sets: Vec<&str> = combo.sets().iter().map(|s| s.as_str()).collect();
                builder = builder.intersection(&sets, *area);
            }
        }
        builder.build().unwrap().preprocess().unwrap()
    }

    fn flat_params_ellipse(ellipses: &[crate::geometry::shapes::Ellipse]) -> Vec<f64> {
        let mut v = Vec::with_capacity(5 * ellipses.len());
        for e in ellipses {
            v.push(e.center().x());
            v.push(e.center().y());
            v.push(e.semi_major());
            v.push(e.semi_minor());
            v.push(e.rotation());
        }
        v
    }

    /// Generic cost-function FD-vs-analytic comparison parameterised over the
    /// shape type. Mirrors `assert_analytic_matches_fd` for circles.
    fn assert_analytic_matches_fd_shape<S, F>(
        spec: &PreprocessedSpec,
        params: &[f64],
        loss_type: crate::loss::LossType,
        h: f64,
        tol: f64,
        label: F,
    ) where
        S: DiagramShape + Copy + 'static,
        F: FnOnce() -> String,
    {
        let cost = DiagramCost::<S> {
            spec,
            loss_type,
            params_per_shape: S::n_params(),
            _shape: std::marker::PhantomData,
        };
        let p = DVector::from_vec(params.to_vec());
        let analytic = cost.gradient(&p).expect("analytic gradient");
        let n = params.len();
        let mut fd = vec![0.0; n];
        for i in 0..n {
            let mut plus = params.to_vec();
            let mut minus = params.to_vec();
            plus[i] += h;
            minus[i] -= h;
            let f_plus = cost.cost(&DVector::from_vec(plus)).expect("cost +");
            let f_minus = cost.cost(&DVector::from_vec(minus)).expect("cost -");
            fd[i] = (f_plus - f_minus) / (2.0 * h);
        }
        let analytic_slice: Vec<f64> = analytic.as_slice().to_vec();
        let diff_norm: f64 = analytic_slice
            .iter()
            .zip(fd.iter())
            .map(|(a, b)| (a - b).powi(2))
            .sum::<f64>()
            .sqrt();
        let fd_norm: f64 = fd.iter().map(|b| b * b).sum::<f64>().sqrt();
        let rel = if fd_norm > 1e-12 {
            diff_norm / fd_norm
        } else {
            diff_norm
        };
        assert!(
            rel < tol,
            "{}: analytic vs FD gradient mismatch (rel={:.3e}, |fd|={:.3e})\n  analytic={:?}\n  fd      ={:?}",
            label(),
            rel,
            fd_norm,
            analytic_slice,
            fd
        );
    }

    #[test]
    fn analytic_gradient_ellipse_two_overlap() {
        use crate::geometry::shapes::Ellipse;
        let ellipses = vec![
            Ellipse::new(Point::new(0.0, 0.0), 1.0, 0.6, 0.0),
            Ellipse::new(Point::new(1.0, 0.2), 1.2, 0.5, 0.4),
        ];
        let spec = spec_from_ellipses(&ellipses);
        let mut params = flat_params_ellipse(&ellipses);
        params[0] += 0.05;
        params[4] += 0.07;
        assert_analytic_matches_fd_shape::<Ellipse, _>(
            &spec,
            &params,
            crate::loss::LossType::SumSquared,
            1e-6,
            5e-4,
            || "ellipse two_overlap SumSquared".to_string(),
        );
    }

    #[test]
    fn analytic_gradient_ellipse_three_venn() {
        use crate::geometry::shapes::Ellipse;
        let ellipses = vec![
            Ellipse::new(Point::new(0.0, 0.0), 1.2, 0.7, 0.0),
            Ellipse::new(Point::new(1.0, 0.0), 1.1, 0.65, 0.3),
            Ellipse::new(Point::new(0.5, 0.866), 1.0, 0.8, -0.2),
        ];
        let spec = spec_from_ellipses(&ellipses);
        let mut params = flat_params_ellipse(&ellipses);
        params[0] += 0.07;
        params[4] -= 0.05;
        params[8] += 0.03;
        params[14] += 0.04;
        assert_analytic_matches_fd_shape::<Ellipse, _>(
            &spec,
            &params,
            crate::loss::LossType::SumSquared,
            1e-6,
            5e-3,
            || "ellipse three_venn".to_string(),
        );
    }

    #[test]
    fn analytic_gradient_ellipse_disjoint() {
        use crate::geometry::shapes::Ellipse;
        let ellipses = vec![
            Ellipse::new(Point::new(0.0, 0.0), 1.0, 0.6, 0.0),
            Ellipse::new(Point::new(5.0, 0.0), 1.2, 0.5, 0.5),
            Ellipse::new(Point::new(2.5, 5.0), 0.8, 0.4, -0.3),
        ];
        let spec = spec_from_ellipses(&ellipses);
        let mut params = flat_params_ellipse(&ellipses);
        params[2] += 0.1;
        params[3] -= 0.05;
        params[4] += 0.07;
        assert_analytic_matches_fd_shape::<Ellipse, _>(
            &spec,
            &params,
            crate::loss::LossType::SumSquared,
            1e-6,
            5e-4,
            || "ellipse disjoint".to_string(),
        );
    }

    #[test]
    fn analytic_gradient_ellipse_nested() {
        use crate::geometry::shapes::Ellipse;
        let ellipses = vec![
            Ellipse::new(Point::new(0.0, 0.0), 3.0, 2.5, 0.1),
            Ellipse::new(Point::new(0.5, 0.2), 0.8, 0.5, 0.3),
        ];
        let spec = spec_from_ellipses(&ellipses);
        let mut params = flat_params_ellipse(&ellipses);
        // Perturb the inner ellipse only — outer stays around inner.
        params[5] += 0.1;
        params[7] += 0.05;
        params[9] += 0.05;
        assert_analytic_matches_fd_shape::<Ellipse, _>(
            &spec,
            &params,
            crate::loss::LossType::SumSquared,
            1e-6,
            5e-4,
            || "ellipse nested".to_string(),
        );
    }

    #[test]
    fn analytic_gradient_ellipse_random_layouts() {
        use crate::geometry::shapes::Ellipse;
        use rand::Rng;
        use rand::SeedableRng;
        let configs: &[(usize, u64)] = &[(2, 7), (3, 13), (3, 21), (5, 100)];
        for &(n, seed) in configs {
            let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
            let ellipses: Vec<Ellipse> = (0..n)
                .map(|_| {
                    Ellipse::new(
                        Point::new(rng.random_range(-2.5..2.5), rng.random_range(-2.5..2.5)),
                        rng.random_range(0.7..1.6),
                        rng.random_range(0.5..1.0),
                        rng.random_range(-0.5..0.5),
                    )
                })
                .collect();
            let spec = spec_from_ellipses(&ellipses);
            let mut params = flat_params_ellipse(&ellipses);
            for v in &mut params {
                *v += rng.random_range(-0.04..0.04);
            }
            assert_analytic_matches_fd_shape::<Ellipse, _>(
                &spec,
                &params,
                crate::loss::LossType::SumSquared,
                1e-6,
                1e-2,
                || format!("ellipse random n={}, seed={}", n, seed),
            );
        }
    }

    /// Eyeball benchmark: time L-BFGS fits with the analytical gradient
    /// against the finite-difference fallback. This is an `#[ignore]`-gated
    /// observation, not an asserted regression — wall time varies across
    /// machines, but on a typical workstation the analytical path should be
    /// noticeably faster on 5- and 8-set fits and produce equivalent loss.
    #[test]
    #[ignore]
    fn benchmark_lbfgs_analytic_vs_fd() {
        use crate::loss::LossType;
        use std::time::Instant;
        // Disable analytical gradient for the FD baseline by pretending the
        // shape doesn't support it — we do this by routing through a wrapper
        // cost that always falls through to FD. The simplest implementation:
        // run the optimiser with a loss type that doesn't expose an analytic
        // gradient (e.g. SumAbsoute), and compare against the same fit using
        // SumSquared (which DOES). This isn't apples-to-apples on
        // loss surface but does exercise the gradient path. For a more direct
        // test, see comments below.
        let configs: &[(usize, u64)] = &[(5, 33), (5, 44), (8, 55)];
        for &(n, seed) in configs {
            use rand::Rng;
            use rand::SeedableRng;
            let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
            let circles: Vec<Circle> = (0..n)
                .map(|_| {
                    Circle::new(
                        Point::new(rng.random_range(-3.0..3.0), rng.random_range(-3.0..3.0)),
                        rng.random_range(0.6..1.8),
                    )
                })
                .collect();
            let spec = spec_from_circles(&circles);
            let positions: Vec<f64> = circles
                .iter()
                .flat_map(|c| [c.center().x(), c.center().y()])
                .collect();
            let radii: Vec<f64> = circles.iter().map(|c| c.radius()).collect();

            // Analytical-gradient path: SumSquared has compute_with_gradient.
            let cfg_an = FinalLayoutConfig {
                optimizer: Optimizer::Lbfgs,
                loss_type: LossType::SumSquared,
                max_iterations: 200,
                tolerance: 1e-6,
                seed: 0,
                n_restarts: 1,
                cmaes_fallback_threshold: 1e-3,
                xtol: None,
                ftol: None,
                gtol: None,
            };
            let t0 = Instant::now();
            let (_p_an, loss_an) =
                optimize_layout::<Circle>(&spec, &positions, &radii, cfg_an).expect("analytic fit");
            let dt_an = t0.elapsed();

            // FD path: SumAbsoute has no compute_with_gradient → falls back to FD.
            // (Same optimiser, different loss; serves as a relative timing reference.)
            let cfg_fd = FinalLayoutConfig {
                optimizer: Optimizer::Lbfgs,
                loss_type: LossType::SumAbsoute,
                max_iterations: 200,
                tolerance: 1e-6,
                seed: 0,
                n_restarts: 1,
                cmaes_fallback_threshold: 1e-3,
                xtol: None,
                ftol: None,
                gtol: None,
            };
            let t0 = Instant::now();
            let (_p_fd, loss_fd) =
                optimize_layout::<Circle>(&spec, &positions, &radii, cfg_fd).expect("fd fit");
            let dt_fd = t0.elapsed();

            eprintln!(
                "n={} seed={}: analytic={:>6.1}ms loss={:.4e} | fd-loss={:>6.1}ms loss={:.4e}",
                n,
                seed,
                dt_an.as_secs_f64() * 1000.0,
                loss_an,
                dt_fd.as_secs_f64() * 1000.0,
                loss_fd
            );
        }
    }

    #[test]
    fn analytic_gradient_random_layouts() {
        use rand::Rng;
        use rand::SeedableRng;
        // Several seeds × sizes, spanning generic 3-, 5-, 8-set fits.
        let configs: &[(usize, u64)] = &[(3, 11), (3, 22), (5, 33), (5, 44), (8, 55)];
        for &(n, seed) in configs {
            let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
            let circles: Vec<Circle> = (0..n)
                .map(|_| {
                    Circle::new(
                        Point::new(rng.random_range(-3.0..3.0), rng.random_range(-3.0..3.0)),
                        rng.random_range(0.6..1.8),
                    )
                })
                .collect();
            let spec = spec_from_circles(&circles);
            // Perturb params to leave the (target = layout) optimum.
            let mut params = flat_params(&circles);
            for v in &mut params {
                *v += rng.random_range(-0.05..0.05);
            }
            assert_analytic_matches_fd(
                &spec,
                &params,
                crate::loss::LossType::SumSquared,
                1e-6,
                5e-3,
                || format!("random n={}, seed={}", n, seed),
            );
        }
    }

    #[test]
    fn lm_jacobian_consistent_with_diagram_gradient() {
        // For sum-of-squares losses, ∇L(θ) = 2·Jᵀ·r where J is the
        // residual Jacobian and r the residual vector. Verifies the LM
        // adapter constructs J consistently with `DiagramCost::gradient`.
        use rand::Rng;
        use rand::SeedableRng;
        let configs: &[(usize, u64, crate::loss::LossType)] = &[
            (3, 11, crate::loss::LossType::SumSquared),
            (5, 33, crate::loss::LossType::SumSquared),
        ];
        for &(n, seed, loss_type) in configs {
            let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
            let circles: Vec<Circle> = (0..n)
                .map(|_| {
                    Circle::new(
                        Point::new(rng.random_range(-3.0..3.0), rng.random_range(-3.0..3.0)),
                        rng.random_range(0.6..1.8),
                    )
                })
                .collect();
            let spec = spec_from_circles(&circles);
            let mut params = flat_params(&circles);
            for v in &mut params {
                *v += rng.random_range(-0.05..0.05);
            }
            let param_dvec = DVector::from_vec(params.clone());

            let cost = DiagramCost::<Circle> {
                spec: &spec,
                loss_type,
                params_per_shape: Circle::n_params(),
                _shape: std::marker::PhantomData,
            };
            let analytic_grad = cost.gradient(&param_dvec).unwrap();

            let mut problem =
                LmDiagramProblem::<Circle>::new(&spec, Circle::n_params(), loss_type, &param_dvec)
                    .unwrap();
            problem.set_params(&param_dvec);
            let r = problem.residuals().unwrap();
            let j = problem.jacobian().unwrap();
            let lm_grad = j.transpose() * &r * 2.0;

            let max_abs_diff = analytic_grad
                .iter()
                .zip(lm_grad.iter())
                .map(|(a, b)| (a - b).abs())
                .fold(0.0_f64, f64::max);
            let max_abs = analytic_grad
                .iter()
                .map(|x: &f64| x.abs())
                .fold(0.0_f64, f64::max);
            assert!(
                max_abs_diff < 1e-9 + 1e-8 * max_abs,
                "n={} seed={} loss={:?}: |2·Jᵀ·r − ∇L| = {:.3e} (max |∇L| = {:.3e})",
                n,
                seed,
                loss_type,
                max_abs_diff,
                max_abs
            );
        }
    }

    #[test]
    fn lm_rejects_non_least_squares_loss() {
        let circles = vec![
            Circle::new(Point::new(0.0, 0.0), 1.0),
            Circle::new(Point::new(1.5, 0.0), 1.0),
        ];
        let spec = spec_from_circles(&circles);
        let params = DVector::from_vec(flat_params(&circles));
        let result = LmDiagramProblem::<Circle>::new(
            &spec,
            Circle::n_params(),
            crate::loss::LossType::Stress,
            &params,
        );
        assert!(result.is_err(), "Stress loss should be rejected by LM");
    }
}