// basin 0.2.0 - Docs.rs

use crate::core::math::{
    ComponentMulAssign, MatTransposeVec, MatVec, MatrixIdentity, NormSquared, RankOneUpdate,
    SampleStandardNormal, ScaleInPlace, ScaledAdd, SymmetricEigen, VectorLen,
};
use crate::core::problem::CostFunction;
use crate::core::rng::{ChaCha8Rng, SeedableRng};
use crate::core::solver::Solver;
use crate::core::state::BasicPopulationState;
use crate::core::termination::TerminationReason;

/// `(µ/µ_W, λ)`-CMA Evolution Strategy with negative weights (aCMA-ES)
/// from Hansen 2016 (*The CMA Evolution Strategy: A Tutorial*).
///
/// Stochastic, derivative-free, population-based — the standard
/// black-box optimizer for ill-conditioned, non-separable, non-convex
/// continuous problems. Uses a multivariate normal `N(m, σ²C)` to
/// sample candidates, then adapts `m`, `σ`, and the covariance `C` from
/// the selected best `µ` candidates plus their conjugate evolution
/// path. Hansen 2016 Figure 6 / eqs (38)–(47) is the algorithm-summary
/// fixture; section A is the parameter table.
///
/// # Algorithm
///
/// At [`init`](Solver::init): set `m = initial_mean`,
/// `σ = initial_sigma`, `p_σ = p_c = 0`, `C = I`, and sample the first
/// generation `x_k = m + σ z_k` (since `B D = I`) with `z_k ~ N(0, I)`.
///
/// Each [`next_iter`](Solver::next_iter) processes the previous
/// generation's evaluations and samples a fresh generation:
///
/// ```text
/// generation ← generation + 1
///
/// # use sorted x_{i:λ} from previous generation (state.candidates)
/// y_{i:λ} = (x_{i:λ} − m) / σ
/// ⟨y⟩_w = Σ_{i=1..µ} w_i y_{i:λ}                          # eq. 41
/// m ← m + c_m σ ⟨y⟩_w  (with c_m = 1)                     # eq. 42
///
/// # step-size: conjugate path + log-update
/// C^{−1/2} ⟨y⟩_w = B (D^{−1} ⊙ Bᵀ ⟨y⟩_w)
/// p_σ ← (1−c_σ) p_σ + √(c_σ(2−c_σ) µ_eff) · C^{−1/2} ⟨y⟩_w  # eq. 43
/// σ ← σ · exp((c_σ/d_σ) (‖p_σ‖ / E‖N(0,I)‖ − 1))           # eq. 44
///
/// # rank-1 + rank-µ update (with negative-weight rescaling)
/// h_σ = 1 iff ‖p_σ‖ / √(1−(1−c_σ)^(2(g+1))) < (1.4+2/(n+1))·E‖N(0,I)‖
/// p_c ← (1−c_c) p_c + h_σ √(c_c(2−c_c) µ_eff) ⟨y⟩_w        # eq. 45
/// w_i° = w_i if w_i ≥ 0 else w_i · n / ‖C^{−1/2} y_{i:λ}‖²  # eq. 46
/// δ_h = (1−h_σ) c_c (2−c_c)
/// C ← (1 + c_1 δ_h − c_1 − c_µ Σ w_j) C
///     + c_1 p_c p_cᵀ + c_µ Σ_i w_i° y_{i:λ} y_{i:λ}ᵀ        # eq. 47
///
/// # refresh eigendecomposition of new C → (B, d²)
/// d_i ← max(d²_i, 1e−30)^(1/2);  d_i^{−1} ← 1 / d_i   # floor keeps d_i^{−1} finite
///
/// # sample new generation
/// for k = 1..λ:  z_k ~ N(0, I);  x_k = m + σ B (d ⊙ z_k)
/// ```
///
/// The eigendecomposition is refreshed every iteration. Hansen's
/// suggested optimization (eigendecompose every `max(1, ⌊1/(10n(c_1+c_µ))⌋)`
/// generations, Appendix B.2) is deferred — at small to moderate `n`
/// the cost is dominated by `f` evaluations anyway, and the refresh
/// frequency would change the per-iteration cost calculus.
///
/// # Default parameters
///
/// All defaults follow Hansen 2016 Table 1 (the 2016 negative-weights
/// setting); see [`new`](Self::new) and the per-field doc comments
/// below for the exact formulas. The user supplies only `n` (via the
/// initial mean's length), the initial mean, the initial step-size,
/// and the seed.
///
/// # Reproducibility
///
/// The solver carries a [`ChaCha8Rng`] seeded from the `seed: u64`
/// passed to [`new`](Self::new) — same seed → same iterate trajectory
/// on every platform basin builds for (including
/// `wasm32-unknown-unknown`).
///
/// # Contract
///
/// - **Caller must:** implement [`CostFunction<Param = V, Output = f64>`]
///   on the problem. CMA-ES is derivative-free; no [`Gradient`] /
///   [`Jacobian`] required.
/// - **Caller must:** hand in a [`BasicPopulationState::with_size(λ)`]
///   matching the solver's λ. The default
///   λ = `4 + ⌊3 ln n⌋` is exposed via [`default_lambda`](Self::default_lambda).
/// - **Caller must:** ensure `initial_sigma > 0`.
/// - **Implementor (this solver) must:** maintain the
///   [`PopulationState`](crate::core::state::PopulationState)
///   sorted-by-cost invariant on `state.candidates` / `state.costs`
///   at the start and end of every iteration, and seed `state.cost()`
///   with the best of the first sampled generation.
///
/// # Termination
///
/// Solver-internal: `σ · max d_i < tol_x` → [`TerminationReason::SolverConverged`]
/// (CMA-ES TolX, Hansen 2016 Appendix B.3). Defaults to
/// `1e−12 · initial_sigma` per Hansen's recommendation. Pair with the
/// framework's [`MaxIter`](crate::core::termination::MaxIter) /
/// [`MaxCostEvals`](crate::core::termination::MaxCostEvals) for budget
/// control; both work on
/// [`BasicPopulationState`](crate::core::state::BasicPopulationState)
/// without modification. Other CMA-ES termination heuristics
/// (NoEffectAxis, NoEffectCoord, ConditionCov, EqualFunValues,
/// Stagnation, TolXUp, TolFun) are out of scope for S8 vanilla and
/// will land alongside the bounded variant in S9 / restart machinery
/// in S11.
///
/// # Backends
///
/// LA-heavy: requires symmetric eigendecomposition, scalar-and-rank-1
/// matrix updates, and matrix-vector / transposed matrix-vector
/// products. Wired and tested for `nalgebra::DVector<f64>` /
/// `nalgebra::DMatrix<f64>` (feature `nalgebra`) and `faer::Col<f64>` /
/// `faer::Mat<f64>` (feature `faer`). `Vec<f64>` and `ndarray` produce
/// a compile-time error per tenet 5 (no honest matrix type or no
/// pure-Rust eigendecomposition). Sparse covariance is not meaningful
/// for CMA-ES — the rank-µ update densifies any starting pattern.
pub struct CmaEs<V, M> {
    /// User-supplied starting mean. Its length fixes the problem
    /// dimension `n`, and it is cloned into the working state when
    /// the solver is initialised.
    initial_mean: V,
    /// User-supplied starting step-size `σ`; validated as `> 0` in
    /// [`new`](Self::new). Also scales the default TolX
    /// (`1e−12 · initial_sigma`).
    initial_sigma: f64,
    /// Population size requested through
    /// [`with_lambda`](Self::with_lambda); `None` means "use
    /// [`default_lambda`](Self::default_lambda)".
    lambda_override: Option<usize>,
    /// Seed handed to the solver's `ChaCha8Rng` when the working state
    /// is built.
    seed: u64,
    /// TolX requested through [`with_tol_x`](Self::with_tol_x); `None`
    /// means "use `1e−12 · initial_sigma`".
    tol_x_override: Option<f64>,

    /// Solver-internal working state. `None` until `Solver::init` has
    /// run; `init` leaves an existing value untouched.
    state: Option<Working<V, M>>,
}

/// Solver-internal mutable state, populated in [`Solver::init`] and
/// updated each [`Solver::next_iter`].
///
/// `pub(crate)` (not public) so sibling solvers in `crate::solver` can
/// read the post-update `m`, `σ`, `B`, `D^{-1}` they need for
/// injection-style composition (`CmaInject` uses these to clip injected
/// `y_i` in Mahalanobis distance per Hansen 2011 eq. 4). Not a stable
/// public surface.
///
/// The first group of fields is computed once when the working state is
/// built and never written again; the second group is the evolving
/// search distribution and its bookkeeping.
pub(crate) struct Working<V, M> {
    // --- constants (computed once at init) ---
    /// Problem dimension: the length of the initial mean.
    pub(crate) n: usize,
    /// Population size λ (the override if one was given, otherwise
    /// the default for `n`). This many candidates are sampled and
    /// evaluated per generation.
    lambda: usize,
    /// Number of selected parents, `µ = ⌊λ/2⌋`. Only the best `µ`
    /// candidates enter the weighted mean step `⟨y⟩_w`.
    mu: usize,
    /// All λ recombination weights (sum of positives = 1; negatives
    /// scaled per Hansen Table 1 rows (50)–(53)).
    weights: Vec<f64>,
    /// `µ_eff = (Σ_{i=1..µ} w_i)² / Σ_{i=1..µ} w_i² = 1 / Σ w_i²`
    /// because the positive weights sum to 1.
    mu_eff: f64,
    /// `Σ_{i=1..λ} w_i`. Negative when negative weights are in use
    /// (default setting); the C-update scalar `1 − c_µ · sum_w`
    /// inflates rather than decays C as a result. With Hansen's
    /// `α_µ_minus = 1 + c_1/c_µ` choice, `c_1 + c_µ · sum_w ≈ 0`,
    /// so the C scalar is approximately 1 (eq. 47).
    sum_w: f64,
    /// Learning rate of the conjugate path `p_σ`:
    /// `c_σ = (µ_eff + 2) / (n + µ_eff + 5)`.
    c_sigma: f64,
    /// Damping of the step-size update:
    /// `d_σ = 1 + 2·max(0, √((µ_eff−1)/(n+1)) − 1) + c_σ`.
    d_sigma: f64,
    /// Learning rate of the covariance path `p_c`:
    /// `c_c = (4 + µ_eff/n) / (n + 4 + 2µ_eff/n)`.
    c_c: f64,
    /// Learning rate of the rank-one term `p_c p_cᵀ` in the C update.
    c_1: f64,
    /// Learning rate of the rank-µ term in the C update; capped at
    /// `1 − c_1` when built.
    c_mu: f64,
    /// Approximation of `E‖N(0, I_n)‖` used to normalise `‖p_σ‖` in
    /// the step-size update.
    expected_norm: f64,
    /// `(1.4 + 2/(n+1)) · E‖N(0,I)‖` — RHS of the h_σ test (eq. 47
    /// callout footnote / Hansen 2016 p. 31).
    h_sigma_threshold: f64,
    /// TolX threshold: the solver reports convergence once
    /// `σ · max_i d_i` drops below this value.
    tol_x: f64,

    // --- mutable iterate ---
    /// Current distribution mean `m`.
    pub(crate) m: V,
    /// Current global step-size `σ`.
    pub(crate) sigma: f64,
    /// Conjugate evolution path `p_σ` (drives the `σ` update).
    p_sigma: V,
    /// Evolution path `p_c` (drives the rank-one part of the C update).
    p_c: V,
    /// Covariance matrix `C`; starts as the identity.
    c: M,
    /// Eigenvectors of `c` from the most recent eigendecomposition.
    pub(crate) b: M,
    /// Square roots of eigenvalues (the diagonal `D` in Hansen's
    /// `B D Bᵀ` factorization). Eigenvalues are floored at `1e-30`
    /// before the square root is taken.
    d: V,
    /// Reciprocals of `d`, used for `C^{−1/2} = B D^{−1} Bᵀ`.
    pub(crate) d_inv: V,

    /// Seeded generator from which every `z_k ~ N(0, I)` is drawn.
    rng: ChaCha8Rng,
    /// Generation counter for the h_σ formula (Hansen 2016 p. 31:
    /// uses `(1−c_σ)^{2(g+1)}` in the bound). Incremented at the top
    /// of every [`Solver::next_iter`].
    generation: u64,
}

impl<V, M> CmaEs<V, M> {
    /// Create a CMA-ES solver from a starting mean, a starting
    /// step-size and an RNG seed.
    ///
    /// No overrides are set, so the population size falls back to
    /// `λ = 4 + ⌊3 ln n⌋` (Hansen 2016 eq. 48) and TolX falls back to
    /// `1e−12 · initial_sigma`. The working state is not built here;
    /// that happens in [`Solver::init`].
    ///
    /// # Panics
    ///
    /// Panics if `initial_sigma ≤ 0` (a NaN step-size is rejected too,
    /// because it fails the `> 0` comparison).
    pub fn new(initial_mean: V, initial_sigma: f64, seed: u64) -> Self {
        assert!(
            initial_sigma > 0.0,
            "CmaEs requires initial_sigma > 0, got {}",
            initial_sigma
        );
        Self {
            state: None,
            lambda_override: None,
            tol_x_override: None,
            initial_mean,
            initial_sigma,
            seed,
        }
    }

    /// Use `lambda` candidates per generation instead of the default
    /// `4 + ⌊3 ln n⌋`. The default is Hansen's recommendation for
    /// general black-box use; a larger `λ` trades per-iteration
    /// convergence rate for more robust global search (Hansen 2016
    /// Section A *Default Parameters*).
    ///
    /// # Panics
    ///
    /// Panics if `lambda < 4`. Smaller populations are explicitly
    /// not recommended (Hansen 2016 footnote 30: "Decreasing λ is not
    /// recommended").
    pub fn with_lambda(self, lambda: usize) -> Self {
        assert!(
            lambda >= 4,
            "CmaEs requires lambda >= 4, got {} (Hansen 2016 footnote 30: \
             smaller populations have strong adverse effects on performance)",
            lambda
        );
        Self {
            lambda_override: Some(lambda),
            ..self
        }
    }

    /// Replace the default TolX (`1e−12 · initial_sigma`, Hansen 2016
    /// Appendix B.3) with `tol_x`.
    ///
    /// The solver reports convergence when `σ · max_i d_i < tol_x`,
    /// where the `d_i` are the square roots of `C`'s eigenvalues —
    /// that is, when even the widest axis of the search distribution
    /// has a standard deviation below the tolerance. The value is
    /// stored as given; it is not validated.
    pub fn with_tol_x(self, tol_x: f64) -> Self {
        Self {
            tol_x_override: Some(tol_x),
            ..self
        }
    }

    /// Population size used when no override is set:
    /// `4 + ⌊3 ln n⌋` (Hansen 2016 eq. 48).
    ///
    /// Public so that a caller sizing a
    /// [`BasicPopulationState::with_size`] can obtain the same number
    /// the solver will use, without duplicating the formula.
    pub fn default_lambda(n: usize) -> usize {
        let scaled_log = (n as f64).ln() * 3.0;
        4 + scaled_log.floor() as usize
    }

    /// Borrow the solver's working state (`m`, `σ`, `B`, `D^{-1}`,
    /// `n`, …) as it stands after the most recent update.
    ///
    /// Intended for sibling solvers that compose with CMA-ES —
    /// currently only `CmaInject`, which needs `C^{-1/2} =
    /// B D^{-1} Bᵀ` to clip injected `y_i` per Hansen 2011 eq. 4.
    /// Returns `None` until [`Solver::init`] has run.
    pub(crate) fn working(&self) -> Option<&Working<V, M>> {
        self.state.as_ref()
    }
}

/// Closed-form approximation of
/// `E‖N(0, I_n)‖ = √2 Γ((n+1)/2) / Γ(n/2)`:
/// `√n · (1 − 1/(4n) + 1/(21n²))`.
///
/// Used instead of the exact expression so that no `lgamma` is needed
/// (it is not in stable `std`). The approximation is coarse in low
/// dimension — at `n = 1` it gives `≈ 0.79762` against the exact
/// `√(2/π) ≈ 0.79788` — and tightens as `n` grows. The result is NaN
/// for `n = 0`.
pub(crate) fn expected_norm_n01(n: usize) -> f64 {
    let dim = n as f64;
    let first_order = 1.0 / (4.0 * dim);
    let second_order = 1.0 / (21.0 * dim * dim);
    dim.sqrt() * (1.0 - first_order + second_order)
}

/// Build the λ recombination weights of Hansen 2016 Table 1 rows
/// (49)–(53) and the constants derived from them.
///
/// * `n` — problem dimension (enters the positive-definiteness bound).
/// * `lambda` — population size; the first `µ = ⌊λ/2⌋` weights are the
///   positive ones.
/// * `c_1`, `c_mu` — rank-one and rank-µ learning rates (enter two of
///   the three bounds on the negative-weight scale).
///
/// Returns `(weights, mu_eff, sum_w)`: all λ weights in rank order,
/// the variance-effective selection mass of the positive weights, and
/// the sum of all weights. `µ_eff_neg` is computed internally but not
/// returned.
pub(crate) fn compute_weights(
    n: usize,
    lambda: usize,
    c_1: f64,
    c_mu: f64,
) -> (Vec<f64>, f64, f64) {
    let mu = lambda / 2;

    // Preliminary weights w_i' = ln((λ+1)/2) − ln i (eq. 49), split
    // into the selected (first µ) and the remaining ranks.
    let log_half = ((lambda as f64 + 1.0) / 2.0).ln();
    let prelim: Vec<f64> = (1..=lambda)
        .map(|rank| log_half - (rank as f64).ln())
        .collect();
    let (selected, rejected) = prelim.split_at(mu);

    // µ_eff only involves the selected weights and does not change
    // when they are rescaled by a positive factor, so the unnormalised
    // values can be used directly (eq. 8 / Table 1 caption).
    let pos_total: f64 = selected.iter().sum();
    let pos_sq_total: f64 = selected.iter().map(|w| w * w).sum();
    let mu_eff = pos_total.powi(2) / pos_sq_total;

    // Same quantity for the remaining ranks (Table 1 caption); zero
    // when there is nothing to sum.
    let neg_total: f64 = rejected.iter().sum();
    let neg_sq_total: f64 = rejected.iter().map(|w| w * w).sum();
    let mu_eff_neg = if neg_sq_total > 0.0 {
        neg_total.powi(2) / neg_sq_total
    } else {
        0.0
    };

    // The negative weights are scaled to the tightest of three bounds
    // (eqs. 50–52).
    let bound_mu = 1.0 + c_1 / c_mu;
    let bound_mu_eff = 1.0 + 2.0 * mu_eff_neg / (mu_eff + 2.0);
    let bound_pos_def = (1.0 - c_1 - c_mu) / (n as f64 * c_mu);
    let alpha_neg = bound_mu.min(bound_mu_eff).min(bound_pos_def);

    // Final weights (eq. 53): the selected ones are normalised to sum
    // to 1; the rest are normalised by their absolute sum and scaled
    // by alpha_neg (or zeroed if that absolute sum is not positive).
    let neg_abs_total: f64 = rejected.iter().map(|w| -w).sum();
    let positive_part = selected.iter().map(|&w| w / pos_total);
    let negative_part = rejected.iter().map(|&w| {
        if neg_abs_total > 0.0 {
            alpha_neg * w / neg_abs_total
        } else {
            0.0
        }
    });
    let weights: Vec<f64> = positive_part.chain(negative_part).collect();

    let sum_w: f64 = weights.iter().sum();
    (weights, mu_eff, sum_w)
}

impl<V, M> CmaEs<V, M>
where
    V: VectorLen + Clone,
    M: MatrixIdentity,
{
    /// Derive every strategy constant from the user's settings and
    /// assemble a fresh [`Working`]. Called once from [`Solver::init`].
    ///
    /// The vector-valued fields (`p_σ`, `p_c`, `d`, `d_inv`) are
    /// created as clones of the initial mean purely to get the right
    /// length; `init` overwrites their contents.
    ///
    /// # Panics
    ///
    /// Panics if the initial mean is empty.
    fn build_working(&self) -> Working<V, M> {
        let n = self.initial_mean.vec_len();
        assert!(n >= 1, "CmaEs requires the initial mean to be non-empty");
        let dim = n as f64;

        let lambda = match self.lambda_override {
            Some(requested) => requested,
            None => Self::default_lambda(n),
        };
        let mu = lambda / 2;

        // c_1 and c_µ need µ_eff, while the negative weights need c_1
        // and c_µ. µ_eff depends only on the selected (first µ)
        // preliminary weights and is unchanged by rescaling them, so
        // it is evaluated here first; `compute_weights` then produces
        // the final weights against the resulting c_1 / c_µ.
        let log_half = ((lambda as f64 + 1.0) / 2.0).ln();
        let selected_raw: Vec<f64> = (1..=mu)
            .map(|rank| log_half - (rank as f64).ln())
            .collect();
        let raw_total: f64 = selected_raw.iter().sum();
        let raw_sq_total: f64 = selected_raw.iter().map(|w| w * w).sum();
        let mu_eff_provisional = raw_total.powi(2) / raw_sq_total;

        // Covariance learning rates (Hansen Table 1), with c_µ capped
        // at 1 − c_1.
        let alpha_cov = 2.0;
        let c_1 = alpha_cov / ((dim + 1.3).powi(2) + mu_eff_provisional);
        let c_mu_uncapped = alpha_cov * (mu_eff_provisional - 2.0 + 1.0 / mu_eff_provisional)
            / ((dim + 2.0).powi(2) + alpha_cov * mu_eff_provisional / 2.0);
        let c_mu = (1.0 - c_1).min(c_mu_uncapped);

        let (weights, mu_eff, sum_w) = compute_weights(n, lambda, c_1, c_mu);

        // Step-size control:
        //   c_σ = (µ_eff + 2) / (n + µ_eff + 5)
        //   d_σ = 1 + 2 · max(0, √((µ_eff−1)/(n+1)) − 1) + c_σ
        let c_sigma = (mu_eff + 2.0) / (dim + mu_eff + 5.0);
        let damping_excess = ((mu_eff - 1.0) / (dim + 1.0)).sqrt() - 1.0;
        let d_sigma = 1.0 + 2.0 * damping_excess.max(0.0) + c_sigma;

        // Learning rate of the covariance path p_c.
        let c_c = (4.0 + mu_eff / dim) / (dim + 4.0 + 2.0 * mu_eff / dim);

        let expected_norm = expected_norm_n01(n);
        let h_sigma_threshold = (1.4 + 2.0 / (dim + 1.0)) * expected_norm;
        let tol_x = match self.tol_x_override {
            Some(requested) => requested,
            None => 1e-12 * self.initial_sigma,
        };

        // Length-n vector whose contents are placeholders only.
        let sized_like_mean = || self.initial_mean.clone();

        Working {
            n,
            lambda,
            mu,
            weights,
            mu_eff,
            sum_w,
            c_sigma,
            d_sigma,
            c_c,
            c_1,
            c_mu,
            expected_norm,
            h_sigma_threshold,
            tol_x,
            m: self.initial_mean.clone(),
            sigma: self.initial_sigma,
            p_sigma: sized_like_mean(),
            p_c: sized_like_mean(),
            c: M::identity(n),
            b: M::identity(n),
            d: sized_like_mean(),
            d_inv: sized_like_mean(),
            rng: ChaCha8Rng::seed_from_u64(self.seed),
            generation: 0,
        }
    }
}

/// Sort `candidates` and `costs` jointly by ascending cost. NaN costs
/// sort last (mirrors `nelder_mead::sort_simplex` /
/// `random_search::sort_population_ascending`).
///
/// The sort is stable: entries with equal cost, and NaN entries among
/// themselves, keep their relative input order. Both slices are
/// permuted in place with the same permutation, so `candidates[i]`
/// still belongs to `costs[i]` afterwards.
///
/// The two slices are expected to have the same length (checked with
/// a debug assertion).
pub(crate) fn sort_population_ascending<V>(candidates: &mut [V], costs: &mut [f64]) {
    use std::cmp::Ordering;

    let n = candidates.len();
    debug_assert_eq!(n, costs.len());
    let mut idx: Vec<usize> = (0..n).collect();
    // NaN is ordered explicitly after every number. Mapping an
    // incomparable pair to `Equal` instead would not be a total order:
    // NaNs would stay wherever they happened to be, and `sort_by` is
    // allowed to panic on an inconsistent comparator.
    idx.sort_by(|&i, &j| match (costs[i].is_nan(), costs[j].is_nan()) {
        (true, true) => Ordering::Equal,
        (true, false) => Ordering::Greater,
        (false, true) => Ordering::Less,
        (false, false) => costs[i]
            .partial_cmp(&costs[j])
            .unwrap_or(Ordering::Equal),
    });
    apply_permutation(candidates, &idx);
    apply_permutation(costs, &idx);
}

/// Rearrange `slice` in place so that its new element `i` is its old
/// element `idx[i]`. `idx` must be a permutation of `0..slice.len()`.
///
/// Works cycle by cycle: each cycle of the permutation is resolved
/// with a chain of swaps, and `visited` prevents a cycle from being
/// processed a second time from one of its later members.
fn apply_permutation<T>(slice: &mut [T], idx: &[usize]) {
    let mut visited = vec![false; slice.len()];
    for start in 0..slice.len() {
        // Already placed by an earlier cycle, or a fixed point.
        if visited[start] || idx[start] == start {
            visited[start] = true;
            continue;
        }
        let mut current = start;
        loop {
            let next = idx[current];
            visited[current] = true;
            if next == start {
                // Cycle closed: the last swap already put the element
                // that started at `start` into its final slot.
                break;
            }
            slice.swap(current, next);
            current = next;
        }
    }
}

impl<P, V, M> Solver<P, BasicPopulationState<V>> for CmaEs<V, M>
where
    P: CostFunction<Param = V, Output = f64>,
    V: VectorLen
        + Clone
        + ScaledAdd<f64>
        + ScaleInPlace
        + ComponentMulAssign
        + NormSquared
        + SampleStandardNormal
        + std::ops::Index<usize, Output = f64>
        + std::ops::IndexMut<usize, Output = f64>,
    M: MatrixIdentity
        + MatVec<V>
        + MatTransposeVec<V>
        + ScaleInPlace
        + RankOneUpdate<V>
        + SymmetricEigen<V>
        + Clone,
{
    /// Build the working state, then sample, evaluate and sort the
    /// first generation into `state`.
    ///
    /// `state.candidates` / `state.costs` are cleared and refilled with
    /// λ entries sorted ascending by cost, and `state.cost_evals` grows
    /// by λ. If the solver already holds a working state, nothing is
    /// done and `state` is returned as received.
    fn init(&mut self, problem: &P, mut state: BasicPopulationState<V>) -> BasicPopulationState<V> {
        // Idempotent: if a previous init already seeded the internal
        // state, return the caller-provided state untouched. This lets
        // chain-style outer solvers (e.g. MaLsChCma) call `run_loop`
        // repeatedly on a paused CmaEs without clobbering its evolution
        // state on every entry. For non-resumption use this is a no-op:
        // a freshly constructed CmaEs has `self.state == None` and
        // proceeds through the full setup below.
        if self.state.is_some() {
            return state;
        }
        let mut w = self.build_working();
        // Zero the path vectors and seed (d, d_inv) = (1, 1, …, 1) since C = I.
        w.p_sigma.scale_in_place(0.0);
        w.p_c.scale_in_place(0.0);
        for i in 0..w.n {
            w.d[i] = 1.0;
            w.d_inv[i] = 1.0;
        }

        // First generation. C = I so y_k = z_k; x_k = m + σ z_k.
        state.candidates.clear();
        state.costs.clear();
        for _k in 0..w.lambda {
            let z_k = V::sample_standard_normal(&w.m, &mut w.rng);
            let mut x_k = w.m.clone();
            x_k.scaled_add(w.sigma, &z_k);
            let cost = problem.cost(&x_k);
            state.candidates.push(x_k);
            state.costs.push(cost);
        }
        state.cost_evals += w.lambda as u64;
        sort_population_ascending(&mut state.candidates, &mut state.costs);

        self.state = Some(w);
        state
    }

    /// Run one CMA-ES generation: consume the sorted candidates in
    /// `state` to update `m`, `σ`, the two evolution paths and `C`,
    /// refresh the eigendecomposition, then sample, evaluate and sort
    /// a new generation into `state`.
    ///
    /// Returns `Some(TerminationReason::SolverFailed)` — with `state`
    /// still holding the previous generation — if the
    /// eigendecomposition fails; otherwise returns `None`.
    ///
    /// # Panics
    ///
    /// Panics if [`init`](Solver::init) has not run.
    fn next_iter(
        &mut self,
        problem: &P,
        mut state: BasicPopulationState<V>,
    ) -> (BasicPopulationState<V>, Option<TerminationReason>) {
        let w = self
            .state
            .as_mut()
            .expect("CmaEs::init must run before next_iter");

        w.generation += 1;

        // Rebuild y_{i:λ} = (x_{i:λ} − m) / σ for the *previous* m, σ.
        // (state.candidates carries the most recent generation's x's,
        // sorted ascending by cost.)
        let y_sorted: Vec<V> = state
            .candidates
            .iter()
            .map(|x| {
                let mut y = x.clone();
                y.scaled_add(-1.0, &w.m);
                y.scale_in_place(1.0 / w.sigma);
                y
            })
            .collect();

        // ⟨y⟩_w = Σ_{i=1..µ} w_i y_{i:λ}.
        let mut y_w = w.m.clone();
        y_w.scale_in_place(0.0);
        for (i, y_i) in y_sorted.iter().enumerate().take(w.mu) {
            y_w.scaled_add(w.weights[i], y_i);
        }

        // m ← m + σ ⟨y⟩_w (c_m = 1 by default).
        w.m.scaled_add(w.sigma, &y_w);

        // C^{−1/2} ⟨y⟩_w = B (D^{−1} ⊙ Bᵀ ⟨y⟩_w).
        let mut bt_y_w = w.b.mat_transpose_vec(&y_w);
        bt_y_w.component_mul_assign(&w.d_inv);
        let c_invsqrt_y_w = w.b.matvec(&bt_y_w);

        // p_σ ← (1 − c_σ) p_σ + √(c_σ(2 − c_σ) µ_eff) C^{−1/2} ⟨y⟩_w.
        w.p_sigma.scale_in_place(1.0 - w.c_sigma);
        let coef_sigma = (w.c_sigma * (2.0 - w.c_sigma) * w.mu_eff).sqrt();
        w.p_sigma.scaled_add(coef_sigma, &c_invsqrt_y_w);

        // σ ← σ exp((c_σ / d_σ) (‖p_σ‖ / E‖N(0,I)‖ − 1)).
        let p_sigma_norm = w.p_sigma.norm_squared().sqrt();
        let log_factor = (w.c_sigma / w.d_sigma) * (p_sigma_norm / w.expected_norm - 1.0);
        w.sigma *= log_factor.exp();

        // h_σ test (Hansen 2016 p. 31, denominator uses 2(g+1)).
        // The exponent is formed in saturating u64 arithmetic and
        // clamped to i32::MAX before the narrowing cast, so an extreme
        // generation count can neither overflow nor wrap to a negative
        // exponent (which would make the denominator NaN). At that
        // size (1 − c_σ)^exponent has long since underflowed to 0.
        // NOTE(review): `generation` was already incremented above, so
        // the first update uses exponent 4; if Hansen's g counts from
        // 0 the first exponent would be 2 — confirm this is intended.
        let exponent = w
            .generation
            .saturating_add(1)
            .saturating_mul(2)
            .min(i32::MAX as u64) as i32;
        let denom = (1.0 - (1.0 - w.c_sigma).powi(exponent)).sqrt();
        let h_sigma = if p_sigma_norm / denom < w.h_sigma_threshold {
            1.0
        } else {
            0.0
        };

        // p_c ← (1 − c_c) p_c + h_σ √(c_c(2 − c_c) µ_eff) ⟨y⟩_w.
        w.p_c.scale_in_place(1.0 - w.c_c);
        let coef_c = h_sigma * (w.c_c * (2.0 - w.c_c) * w.mu_eff).sqrt();
        w.p_c.scaled_add(coef_c, &y_w);

        // C update (eq. 47):
        //   C ← (1 + c_1 δ_h − c_1 − c_µ Σ w_j) C
        //       + c_1 p_c p_cᵀ
        //       + c_µ Σ_i w_i° y_{i:λ} y_{i:λ}ᵀ
        // with w_i° = w_i for w_i ≥ 0, else w_i · n / ‖C^{−1/2} y_{i:λ}‖².
        let delta_h = (1.0 - h_sigma) * w.c_c * (2.0 - w.c_c);
        let c_scale = 1.0 + w.c_1 * delta_h - w.c_1 - w.c_mu * w.sum_w;
        w.c.scale_in_place(c_scale);
        w.c.rank_one_update(w.c_1, &w.p_c);
        // Negative-weight path rescales by n / ‖C^{−1/2} y_i‖²;
        // positive-weight path uses w_i directly (eq. 46).
        for (i, y_i) in y_sorted.iter().enumerate() {
            let w_i = w.weights[i];
            let w_i_o = if w_i >= 0.0 {
                w_i
            } else {
                // ‖C^{−1/2} y_i‖² = ‖D^{−1} ⊙ Bᵀ y_i‖² (orthogonal B).
                let mut bt_y = w.b.mat_transpose_vec(y_i);
                bt_y.component_mul_assign(&w.d_inv);
                let cinv_norm_sq = bt_y.norm_squared();
                if cinv_norm_sq > 0.0 {
                    w_i * (w.n as f64) / cinv_norm_sq
                } else {
                    // Pathological zero-direction; drop this contribution.
                    0.0
                }
            };
            if w_i_o != 0.0 {
                w.c.rank_one_update(w.c_mu * w_i_o, y_i);
            }
        }
        // Release the λ step vectors before the eigendecomposition.
        drop(y_sorted);

        // Refresh eigendecomposition of the new C.
        let (b_new, eigs) = match w.c.try_eigh() {
            Ok(pair) => pair,
            Err(_) => return (state, Some(TerminationReason::SolverFailed)),
        };
        w.b = b_new;
        // d_i = √max(λ_i, 1e-30); d_inv_i = 1/d_i. Floating-point can
        // produce tiny negative (or zero) eigenvalues even when the
        // algorithm preserves positive definiteness; flooring at a
        // small positive value keeps both the square root and its
        // reciprocal finite.
        for i in 0..w.n {
            let lam = eigs[i].max(1e-30);
            let s = lam.sqrt();
            w.d[i] = s;
            w.d_inv[i] = 1.0 / s;
        }

        // Sample new generation: x_k = m + σ B (D ⊙ z_k).
        state.candidates.clear();
        state.costs.clear();
        for _k in 0..w.lambda {
            let z_k = V::sample_standard_normal(&w.m, &mut w.rng);
            let mut bd_z = z_k;
            bd_z.component_mul_assign(&w.d);
            let bd_z = w.b.matvec(&bd_z);
            let mut x_k = w.m.clone();
            x_k.scaled_add(w.sigma, &bd_z);
            let cost = problem.cost(&x_k);
            state.candidates.push(x_k);
            state.costs.push(cost);
        }
        state.cost_evals += w.lambda as u64;
        sort_population_ascending(&mut state.candidates, &mut state.costs);

        (state, None)
    }

    /// Solver-internal stopping test (TolX). Returns
    /// `Some(TerminationReason::SolverConverged)` once
    /// `σ · max_i d_i < tol_x`, and `None` otherwise — including when
    /// [`init`](Solver::init) has not run yet.
    fn terminate(&self, _state: &BasicPopulationState<V>) -> Option<TerminationReason> {
        let w = self.state.as_ref()?;
        // TolX (Hansen 2016 Appendix B.3): stop when the largest
        // standard deviation of any axis of the search distribution
        // drops below `tol_x`. `σ · max_i d_i` is the largest
        // single-axis standard deviation.
        let max_d = w.d_iter_max();
        if w.sigma * max_d < w.tol_x {
            return Some(TerminationReason::SolverConverged);
        }
        None
    }
}

impl<V, M> Working<V, M>
where
    V: std::ops::Index<usize, Output = f64> + VectorLen,
{
    /// Largest of the first `n` entries of `d`, with a floor of `0.0`:
    /// the result is `0.0` when `n == 0` or when no entry is positive.
    /// NaN entries never win the comparison and are skipped.
    fn d_iter_max(&self) -> f64 {
        (0..self.n)
            .map(|axis| self.d[axis])
            .fold(0.0_f64, |largest, value| {
                if value > largest {
                    value
                } else {
                    largest
                }
            })
    }
}