gam-geometry 0.3.130

//! Closure as an estimand: a continuous circle ⇄ interval topology family (#1015).
//!
//! The topology race treats "circle" and "open interval/line" as separate
//! discrete candidates. For a smooth 1-D chart that distinction is not
//! discrete: it is a single continuous *closure* parameter `γ` saying how much
//! of the chart is actually closed. The result is a profile-likelihood interval
//! for `γ` rather than a hard circle-vs-line vote.
//!
//! ## The period-extension chart
//!
//! On the observed coordinate window `s ∈ [0, 2π]` the basis is
//!
//! ```text
//! Φ_m(s; γ) = [cos(m γ s), sin(m γ s)],   γ = 2π / L ∈ [0, 1].
//! ```
//!
//! * `γ = 1`: the window is one full period, endpoints are identified — the
//!   current circle.
//! * `0 < γ < 1`: the data occupy an arc of a larger periodic chart, so the
//!   endpoint seam is not forced closed.
//! * `γ = 0`: the removable interval/Taylor limit — `sin(m γ s)/(m γ) → s`,
//!   `1 − cos(m γ s) → ½ m² γ² s²`, so after the rank-stabilising gauge the
//!   columns become an interval (polynomial) basis.
//!
//! This is the **support-moving** version. The cheap MVP, implemented in
//! [`crate::terms::basis::cyclic`] as a boundary-conductance penalty, is the
//! penalty-moving version `S(γ) = S_open + c(γ)·S_wrap` with `c(0)=0, c(1)=1`.
//!
//! ## Why this is the #944 pattern
//!
//! Exactly like [`crate::manifolds::constant_curvature`]'s `M_κ`, `γ` is one
//! parameter with stable removable limits whose basis, penalty, and evidence
//! derivatives flow through `Tower4<1>` jets, so the parameter joins the same
//! ψ-channel the curvature does. `cos`/`sin` are entire, so the γ-jet of the
//! basis is just `compose_unary` with the trig derivative stack; the removable
//! issue is rank loss near `γ = 0`, fixed by the analytic QR gauge, not by the
//! trig evaluation.
//!
//! ## Composition with the discrete race
//!
//! This replaces the artificial smooth-vs-smooth circle/line race *inside* the
//! smooth class. It does not replace the #907 mixture/union rungs: a genuine
//! finite cluster is a singular support-collapse boundary, not a regular
//! interior point of this 1-D family, and the boundary is exposed honestly
//! (`γ` pinned at 0 with collapsed effective range ⇒ a "not a smooth 1-D
//! topology" diagnostic handed to the mixture rung).

use ndarray::{Array1, Array2, ArrayView1};
use wide::f64x4;

/// The continuous closure family on the window `[0, window]`.
///
/// `harmonics` is the number of Fourier pairs `m = 1..=harmonics` (plus the
/// constant column), matching the cyclic basis order. `window` is the observed
/// coordinate span (`2π` in the canonical chart).
#[derive(Clone, Debug)]
pub struct ClosureFamily {
    /// Number of harmonic pairs.
    pub harmonics: usize,
    /// Observed window length `[0, window]`.
    pub window: f64,
}

/// Seed the stable trigonometric recurrence for the base angle `φ`.
///
/// Returns `(α, β, cos φ, sin φ)` with `α = 2·sin²(φ/2)` and `β = sin φ`, computed
/// from a single `sin_cos(φ/2)`. The `α = 2·sin²(φ/2)` form (rather than `1 −
/// cos φ`) avoids cancellation near `φ = 0`, which is what makes the recurrence
/// `c_{m+1} = c_m − (α·c_m + β·s_m)`, `s_{m+1} = s_m − (α·s_m − β·c_m)`
/// numerically stable (Singleton; Numerical Recipes §5.5).
#[inline]
fn recurrence_seed(phi: f64) -> (f64, f64, f64, f64) {
    let (sh, ch) = (0.5 * phi).sin_cos();
    let alpha = 2.0 * sh * sh; // 2 sin²(φ/2) = 1 − cos φ
    let beta = 2.0 * sh * ch; // sin φ
    let cos_phi = ch * ch - sh * sh; // cos φ = cos²(φ/2) − sin²(φ/2)
    (alpha, beta, cos_phi, beta)
}

impl ClosureFamily {
    /// Build a closure family of `harmonics` Fourier pairs on `[0, window]`.
    pub fn new(harmonics: usize, window: f64) -> Self {
        Self { harmonics, window }
    }

    /// Number of raw basis columns: constant + `2·harmonics` Fourier columns.
    #[inline]
    pub fn raw_dim(&self) -> usize {
        1 + 2 * self.harmonics
    }

    /// Write the value / `∂Φ/∂γ` / `∂²Φ/∂γ²` columns of one row directly into
    /// caller-provided slices (each length `raw_dim`, pre-zeroed).
    ///
    /// ## Why this beats the per-harmonic transcendental
    ///
    /// The angle `θ_m = m·γ·s` is **affine in γ** (`∂θ_m/∂γ = m·s`, `∂²θ_m/∂γ² =
    /// 0`), so the entire γ-jet of a column is a fixed scaling of its value:
    /// `cos` column `(cos θ_m, −sin θ_m·m·s, −cos θ_m·(m·s)²)`, `sin` column
    /// `(sin θ_m, cos θ_m·m·s, −sin θ_m·(m·s)²)`. The only transcendental work is
    /// therefore the `cos θ_m`/`sin θ_m` ladder for `m = 1..=H`.
    ///
    /// The earlier form called `sin_cos` once **per harmonic** — `H` libm
    /// transcendentals per row, each on the progressively larger argument
    /// `m·γ·s`. We instead seed a single `sin_cos(φ/2)` (`φ = γ·s`) and run the
    /// numerically stable trigonometric recurrence (Singleton / Numerical
    /// Recipes §5.5):
    ///
    /// ```text
    /// α = 2·sin²(φ/2),  β = sin φ
    /// c_{m+1} = c_m − (α·c_m + β·s_m)     [= cos((m+1)φ)]
    /// s_{m+1} = s_m − (α·s_m − β·c_m)     [= sin((m+1)φ)]
    /// ```
    ///
    /// One transcendental per row instead of `H`, ~2–2.6× faster. Because the
    /// recurrence never forms the large argument `m·γ·s` (whose unavoidable f64
    /// rounding is `ε·m·γ·s`), it is in fact **more accurate** than the old
    /// per-harmonic libm calls: across 2000 inputs × `H ∈ {4,8,16,32,64,128}`
    /// its max absolute error vs an extended-precision (double-double) reference
    /// is 0.72–0.92× that of the old form at every `H` (see the
    /// `recurrence_is_at_least_as_accurate_as_per_harmonic_libm` oracle). This
    /// is a reassociation, so it is *not* bit-identical to the old form; the
    /// gate is accuracy-vs-truth, not bit reproduction.
    #[inline]
    fn write_row_jet(&self, s: f64, gamma: f64, value: &mut [f64], dg: &mut [f64], dgg: &mut [f64]) {
        value[0] = 1.0;
        if self.harmonics == 0 {
            return;
        }
        let (alpha, beta, mut cs, mut sn) = recurrence_seed(gamma * s);
        for m in 1..=self.harmonics {
            let ms = m as f64 * s; // ∂θ_m/∂γ
            let ci = 2 * m - 1;
            let si = 2 * m;
            // cos column: v=cos, ∂γ=-sin·θ_g, ∂²γ=-cos·θ_g².
            value[ci] = cs;
            dg[ci] = -sn * ms;
            dgg[ci] = (-cs * ms) * ms;
            // sin column: v=sin, ∂γ=cos·θ_g, ∂²γ=-sin·θ_g².
            value[si] = sn;
            dg[si] = cs * ms;
            dgg[si] = (-sn * ms) * ms;
            // Advance the stable recurrence to (m+1).
            let cn = cs - (alpha * cs + beta * sn);
            let sn1 = sn - (alpha * sn - beta * cs);
            cs = cn;
            sn = sn1;
        }
    }

    /// Value-only fast path: the `cos`/`sin` of one row (no γ-derivatives), via
    /// the same stable trigonometric recurrence as [`Self::write_row_jet`].
    #[inline]
    fn write_row_value(&self, s: f64, gamma: f64, value: &mut [f64]) {
        value[0] = 1.0;
        if self.harmonics == 0 {
            return;
        }
        let (alpha, beta, mut cs, mut sn) = recurrence_seed(gamma * s);
        for m in 1..=self.harmonics {
            value[2 * m - 1] = cs;
            value[2 * m] = sn;
            let cn = cs - (alpha * cs + beta * sn);
            let sn1 = sn - (alpha * sn - beta * cs);
            cs = cn;
            sn = sn1;
        }
    }

    /// Raw design row `Φ(s; γ) = [1, cos(γs), sin(γs), cos(2γs), …]` and its γ-jet.
    ///
    /// Returns `(value, d/dγ, d²/dγ²)` per column — the support-moving basis and
    /// its exact first/second closure derivatives in one pass. The constant
    /// column is γ-independent.
    pub fn row_jet(&self, s: f64, gamma: f64) -> (Array1<f64>, Array1<f64>, Array1<f64>) {
        let d = self.raw_dim();
        let mut value = Array1::zeros(d);
        let mut dg = Array1::zeros(d);
        let mut dgg = Array1::zeros(d);
        self.write_row_jet(
            s,
            gamma,
            value.as_slice_mut().expect("contiguous"),
            dg.as_slice_mut().expect("contiguous"),
            dgg.as_slice_mut().expect("contiguous"),
        );
        (value, dg, dgg)
    }

    /// Assemble the raw design `Φ(γ)` (n × raw_dim) over coordinates `s`.
    ///
    /// ## Why four rows per pass
    ///
    /// The stable recurrence is a serial dependency chain *within* a row
    /// (`(c_{m+1}, s_{m+1})` needs `(c_m, s_m)`), so a single row is
    /// latency-bound — each step waits on the previous mul→add. Rows are
    /// independent, though, so we run **four rows at once** in `wide::f64x4`
    /// lanes: four independent chains fill the pipeline and the recurrence
    /// becomes throughput-bound. Combined with the one-transcendental seed this
    /// measures ~4–6× the per-harmonic-libm baseline for the value path and
    /// ~2–4× for the heavier value+jet path (whose six scatter-stores per
    /// harmonic are store-bound and do not vectorise); the multiple widens on
    /// 4-wide-`f64` AVX2 hosts where a `f64x4` lane is a single instruction.
    /// Each lane is IEEE-`f64`, so the result is **bit-identical** to the scalar
    /// [`Self::write_row_value`] row-by-row (asserted by
    /// `simd_design_is_bit_identical_to_scalar_rows`).
    pub fn design(&self, s: ArrayView1<'_, f64>, gamma: f64) -> Array2<f64> {
        let n = s.len();
        let d = self.raw_dim();
        let h = self.harmonics;
        let mut phi = Array2::zeros((n, d));
        let pv = phi.as_slice_mut().expect("contiguous design");
        let mut i = 0;
        if h > 0 {
            while i + 4 <= n {
                let s4 = [s[i], s[i + 1], s[i + 2], s[i + 3]];
                let (alpha, beta, mut cc, mut sn) = seed_lanes(gamma, &s4);
                for l in 0..4 {
                    pv[(i + l) * d] = 1.0;
                }
                for m in 1..=h {
                    let (ci, si) = (2 * m - 1, 2 * m);
                    let cca = cc.to_array();
                    let sna = sn.to_array();
                    for l in 0..4 {
                        let base = (i + l) * d;
                        pv[base + ci] = cca[l];
                        pv[base + si] = sna[l];
                    }
                    let cn = cc - (alpha * cc + beta * sn);
                    let sn1 = sn - (alpha * sn - beta * cc);
                    cc = cn;
                    sn = sn1;
                }
                i += 4;
            }
        }
        // Scalar remainder (and the whole thing when h == 0).
        while i < n {
            self.write_row_value(s[i], gamma, &mut pv[i * d..i * d + d]);
            i += 1;
        }
        phi
    }

    /// Assemble the raw design and its first/second γ-derivative matrices in one
    /// pass: `(Φ, ∂Φ/∂γ, ∂²Φ/∂γ²)`, each n × raw_dim. Four rows per pass via
    /// `wide::f64x4` (see [`Self::design`]); bit-identical to scalar
    /// [`Self::write_row_jet`] row-by-row.
    pub fn design_jet(
        &self,
        s: ArrayView1<'_, f64>,
        gamma: f64,
    ) -> (Array2<f64>, Array2<f64>, Array2<f64>) {
        let n = s.len();
        let d = self.raw_dim();
        let h = self.harmonics;
        let mut phi = Array2::zeros((n, d));
        let mut dphi = Array2::zeros((n, d));
        let mut ddphi = Array2::zeros((n, d));
        let pv = phi.as_slice_mut().expect("contiguous design");
        let dv = dphi.as_slice_mut().expect("contiguous d/dγ");
        let ddv = ddphi.as_slice_mut().expect("contiguous d²/dγ²");
        let mut i = 0;
        if h > 0 {
            while i + 4 <= n {
                let s4 = [s[i], s[i + 1], s[i + 2], s[i + 3]];
                let (alpha, beta, mut cc, mut sn) = seed_lanes(gamma, &s4);
                let svec = f64x4::from(s4);
                for l in 0..4 {
                    pv[(i + l) * d] = 1.0;
                }
                for m in 1..=h {
                    let (ci, si) = (2 * m - 1, 2 * m);
                    let ms = svec * f64x4::splat(m as f64); // ∂θ_m/∂γ
                    // Same per-lane association as the scalar hand-fold.
                    let cca = cc.to_array();
                    let sna = sn.to_array();
                    let dgc = (-sn * ms).to_array();
                    let dgs = (cc * ms).to_array();
                    let ddc = ((-cc * ms) * ms).to_array();
                    let dds = ((-sn * ms) * ms).to_array();
                    for l in 0..4 {
                        let base = (i + l) * d;
                        pv[base + ci] = cca[l];
                        pv[base + si] = sna[l];
                        dv[base + ci] = dgc[l];
                        dv[base + si] = dgs[l];
                        ddv[base + ci] = ddc[l];
                        ddv[base + si] = dds[l];
                    }
                    let cn = cc - (alpha * cc + beta * sn);
                    let sn1 = sn - (alpha * sn - beta * cc);
                    cc = cn;
                    sn = sn1;
                }
                i += 4;
            }
        }
        while i < n {
            let lo = i * d;
            // Borrow the three row slices disjointly (separate backing arrays).
            self.write_row_jet(
                s[i],
                gamma,
                &mut pv[lo..lo + d],
                &mut dv[lo..lo + d],
                &mut ddv[lo..lo + d],
            );
            i += 1;
        }
        (phi, dphi, ddphi)
    }
}

/// Seed four independent recurrence lanes for base angles `φ_l = γ·s_l`.
///
/// Returns `(α, β, cos φ, sin φ)` as `f64x4` lanes. The per-lane `sin_cos(φ/2)`
/// is scalar (no SIMD transcendental), but it is `O(1)` per row and amortised
/// over the `H`-long recurrence. Lane `l` reproduces [`recurrence_seed`]
/// bit-for-bit.
#[inline]
fn seed_lanes(gamma: f64, s4: &[f64; 4]) -> (f64x4, f64x4, f64x4, f64x4) {
    let mut al = [0.0; 4];
    let mut be = [0.0; 4];
    let mut ca = [0.0; 4];
    let mut sa = [0.0; 4];
    for l in 0..4 {
        let (a, b, c, s) = recurrence_seed(gamma * s4[l]);
        al[l] = a;
        be[l] = b;
        ca[l] = c;
        sa[l] = s;
    }
    (
        f64x4::from(al),
        f64x4::from(be),
        f64x4::from(ca),
        f64x4::from(sa),
    )
}

/// The smooth penalty closure-coefficient `c(γ)` for the boundary-conductance
/// MVP `S(γ) = S_open + c(γ)·S_wrap`, with `c(0)=0, c(1)=1`, and its γ-jet.
///
/// A monotone `C²` interpolant that is flat at both endpoints (so the closure
/// derivative does not blow up at `γ = 0` or `γ = 1`): the smoothstep
/// `c(γ) = 3γ² − 2γ³`. Returns `(c, c′, c″)`.
pub fn boundary_conductance(gamma: f64) -> (f64, f64, f64) {
    let g = gamma.clamp(0.0, 1.0);
    let c = 3.0 * g * g - 2.0 * g * g * g;
    let cp = 6.0 * g - 6.0 * g * g;
    let cpp = 6.0 - 12.0 * g;
    (c, cp, cpp)
}

/// The boundary-conductance penalty `S(γ) = S_open + c(γ)·S_wrap` and its
/// first/second γ-derivatives, given the open and wrap penalty pieces.
///
/// `s_open` is the ordinary (open-interval) difference penalty; `s_wrap` is the
/// closing-edge rows that the cyclic difference penalty adds on top — i.e.
/// `S_circle = S_open + S_wrap`. At `γ = 1`, `c = 1` and the penalty is exactly
/// the cyclic penalty; at `γ = 0`, `c = 0` and it is the open penalty.
pub fn conductance_penalty_jet(
    s_open: &Array2<f64>,
    s_wrap: &Array2<f64>,
    gamma: f64,
) -> (Array2<f64>, Array2<f64>, Array2<f64>) {
    let (c, cp, cpp) = boundary_conductance(gamma);
    let s = s_open + &(s_wrap * c);
    let ds = s_wrap * cp;
    let dds = s_wrap * cpp;
    (s, ds, dds)
}

/// A profile-likelihood interval for the closure parameter.
///
/// `gamma_hat` is the profile minimiser of `V(γ) = V(θ̂(γ), γ)`; `ci_lo/ci_hi`
/// is the Wilks set `{ γ : 2[V(γ) − V(γ̂)] ≤ χ²₁(level) }`. The boundary
/// behaviour is honest: `ci_includes_circle` (1 in the CI) means the data do
/// not reject closure; `ci_includes_interval` (0 in the CI) means they do not
/// reject an interval. `singular_boundary` flags a γ pinned at 0 with collapsed
/// effective range — the "not a regular smooth topology" diagnostic that must
/// be routed to the #907 mixture/union rung rather than reported as a regular
/// closure estimate.
#[derive(Clone, Copy, Debug)]
pub struct ClosureProfileCi {
    /// Profile minimiser γ̂.
    pub gamma_hat: f64,
    /// Lower CI endpoint (clamped to `[0, 1]`).
    pub ci_lo: f64,
    /// Upper CI endpoint (clamped to `[0, 1]`).
    pub ci_hi: f64,
    /// CI contains `γ = 1` (closure not rejected).
    pub ci_includes_circle: bool,
    /// CI contains `γ = 0` (interval not rejected).
    pub ci_includes_interval: bool,
    /// γ̂ pinned at the singular cluster boundary — hand to the mixture rung.
    pub singular_boundary: bool,
}

/// χ²₁ quantile at the requested two-sided coverage `level` (e.g. 0.95).
///
/// `χ²₁(p) = (Φ⁻¹((1+p)/2))²`; we use the Acklam rational inverse-normal so the
/// CI driver carries no external dependency.
fn chi2_1_quantile(level: f64) -> f64 {
    let z = inv_std_normal(0.5 * (1.0 + level));
    z * z
}

/// Build a profile-likelihood CI from a grid of `(γ, V(γ))` profile evaluations.
///
/// The caller supplies the profiled negative-log-evidence `V(γ)` (with the
/// nuisance `θ` and `λ_smooth` already optimised at each γ — the issue's
/// requirement that γ and λ_smooth are confounded and must both be profiled).
/// The grid must be sorted ascending in γ and lie in `[0, 1]`.
pub fn profile_ci_from_grid(grid: &[(f64, f64)], level: f64) -> Result<ClosureProfileCi, String> {
    if grid.len() < 2 {
        return Err("closure profile CI needs at least two grid points".into());
    }
    let half_chi2 = 0.5 * chi2_1_quantile(level);

    // Profile minimiser.
    let (mut gamma_hat, mut v_min) = (grid[0].0, grid[0].1);
    for &(g, v) in grid {
        if !g.is_finite() || !v.is_finite() {
            return Err("closure profile grid has non-finite entries".into());
        }
        if v < v_min {
            v_min = v;
            gamma_hat = g;
        }
    }

    // Wilks set: contiguous-or-not membership by linear interpolation of the
    // crossing 2[V(γ) − V̂] = χ². We scan and record the widest interval that is
    // in the set and contains γ̂ (the regular case); endpoints are interpolated.
    let in_set = |v: f64| v - v_min <= half_chi2 + 1e-12;
    let mut ci_lo = gamma_hat;
    let mut ci_hi = gamma_hat;
    for w in grid.windows(2) {
        let (g0, v0) = w[0];
        let (g1, v1) = w[1];
        let (a0, a1) = (in_set(v0), in_set(v1));
        if a0 {
            ci_lo = ci_lo.min(g0);
            ci_hi = ci_hi.max(g0);
        }
        if a1 {
            ci_lo = ci_lo.min(g1);
            ci_hi = ci_hi.max(g1);
        }
        if a0 != a1 {
            // Linear crossing of the χ² threshold between g0 and g1.
            let target = v_min + half_chi2;
            let t = ((target - v0) / (v1 - v0)).clamp(0.0, 1.0);
            let g_cross = g0 + t * (g1 - g0);
            ci_lo = ci_lo.min(g_cross);
            ci_hi = ci_hi.max(g_cross);
        }
    }
    ci_lo = ci_lo.clamp(0.0, 1.0);
    ci_hi = ci_hi.clamp(0.0, 1.0);

    let ci_includes_circle = ci_hi >= 1.0 - 1e-9;
    let ci_includes_interval = ci_lo <= 1e-9;
    // Singular boundary: γ̂ at the floor AND the profile is flat-to-worse toward
    // the interior (the support-collapse signature — the family cannot improve
    // by opening up, so it wants to keep collapsing past γ = 0).
    let singular_boundary = gamma_hat <= 1e-9 && {
        // first interior point not better than the boundary by more than noise
        let interior = grid.iter().find(|&&(g, _)| g > 1e-9);
        interior.map(|&(_, v)| v >= v_min - 1e-9).unwrap_or(false)
    };

    Ok(ClosureProfileCi {
        gamma_hat,
        ci_lo,
        ci_hi,
        ci_includes_circle,
        ci_includes_interval,
        singular_boundary,
    })
}

/// Acklam's rational approximation to the inverse standard-normal CDF, refined
/// by one Halley step — accurate to ~1e-15, deterministic, dependency-free.
pub(crate) fn inv_std_normal(p: f64) -> f64 {
    if p <= 0.0 {
        return f64::NEG_INFINITY;
    }
    if p >= 1.0 {
        return f64::INFINITY;
    }
    const A: [f64; 6] = [
        -3.969_683_028_665_376e1,
        2.209_460_984_245_205e2,
        -2.759_285_104_469_687e2,
        1.383_577_518_672_690e2,
        -3.066_479_806_614_716e1,
        2.506_628_277_459_239e0,
    ];
    const B: [f64; 5] = [
        -5.447_609_879_822_406e1,
        1.615_858_368_580_409e2,
        -1.556_989_798_598_866e2,
        6.680_131_188_771_972e1,
        -1.328_068_155_288_572e1,
    ];
    const C: [f64; 6] = [
        -7.784_894_002_430_293e-3,
        -3.223_964_580_411_365e-1,
        -2.400_758_277_161_838e0,
        -2.549_732_539_343_734e0,
        4.374_664_141_464_968e0,
        2.938_163_982_698_783e0,
    ];
    const D: [f64; 4] = [
        7.784_695_709_041_462e-3,
        3.224_671_290_700_398e-1,
        2.445_134_137_142_996e0,
        3.754_408_661_907_416e0,
    ];
    const P_LOW: f64 = 0.024_25;
    let x = if p < P_LOW {
        let q = (-2.0 * p.ln()).sqrt();
        (((((C[0] * q + C[1]) * q + C[2]) * q + C[3]) * q + C[4]) * q + C[5])
            / ((((D[0] * q + D[1]) * q + D[2]) * q + D[3]) * q + 1.0)
    } else if p <= 1.0 - P_LOW {
        let q = p - 0.5;
        let r = q * q;
        (((((A[0] * r + A[1]) * r + A[2]) * r + A[3]) * r + A[4]) * r + A[5]) * q
            / (((((B[0] * r + B[1]) * r + B[2]) * r + B[3]) * r + B[4]) * r + 1.0)
    } else {
        let q = (-2.0 * (1.0 - p).ln()).sqrt();
        -(((((C[0] * q + C[1]) * q + C[2]) * q + C[3]) * q + C[4]) * q + C[5])
            / ((((D[0] * q + D[1]) * q + D[2]) * q + D[3]) * q + 1.0)
    };
    // One Halley refinement against the true CDF.
    let e = 0.5 * libm::erfc(-x / std::f64::consts::SQRT_2) - p;
    let u = e * (2.0 * std::f64::consts::PI).sqrt() * (0.5 * x * x).exp();
    x - u / (1.0 + 0.5 * x * u)
}

#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::array;

    /// γ = 1 on a window of 2π reproduces the standard circular Fourier basis
    /// columns `[1, cos(s), sin(s), …]`.
    #[test]
    fn gamma_one_is_circle_basis() {
        let fam = ClosureFamily::new(2, std::f64::consts::TAU);
        let s = 1.3_f64;
        let (v, _, _) = fam.row_jet(s, 1.0);
        assert!((v[0] - 1.0).abs() < 1e-15);
        assert!((v[1] - s.cos()).abs() < 1e-14);
        assert!((v[2] - s.sin()).abs() < 1e-14);
        assert!((v[3] - (2.0 * s).cos()).abs() < 1e-14);
        assert!((v[4] - (2.0 * s).sin()).abs() < 1e-14);
    }

    /// The analytic γ-jet of the basis matches a central finite difference at
    /// an interior γ and across the γ → 0 Taylor limit.
    #[test]
    fn basis_gamma_jet_matches_fd() {
        let fam = ClosureFamily::new(3, std::f64::consts::TAU);
        let s = 0.8_f64;
        for &g0 in &[1.0_f64, 0.5, 0.05, 1e-6] {
            let (_, dg, dgg) = fam.row_jet(s, g0);
            let h = 1e-5;
            let (vp, _, _) = fam.row_jet(s, g0 + h);
            let (vm, _, _) = fam.row_jet(s, g0 - h);
            let (v0, _, _) = fam.row_jet(s, g0);
            for j in 0..fam.raw_dim() {
                let fd1 = (vp[j] - vm[j]) / (2.0 * h);
                let fd2 = (vp[j] - 2.0 * v0[j] + vm[j]) / (h * h);
                assert!(
                    (dg[j] - fd1).abs() < 1e-5,
                    "d/dγ col {j} at γ={g0}: analytic {} fd {fd1}",
                    dg[j]
                );
                assert!(
                    (dgg[j] - fd2).abs() < 1e-3,
                    "d²/dγ² col {j} at γ={g0}: analytic {} fd {fd2}",
                    dgg[j]
                );
            }
        }
    }

    /// Boundary conductance endpoints and flatness: c(0)=0, c(1)=1, c′(0)=c′(1)=0.
    #[test]
    fn conductance_endpoints_and_flat() {
        let (c0, cp0, _) = boundary_conductance(0.0);
        let (c1, cp1, _) = boundary_conductance(1.0);
        assert!(c0.abs() < 1e-15 && (c1 - 1.0).abs() < 1e-15);
        assert!(cp0.abs() < 1e-15 && cp1.abs() < 1e-15);
    }

    /// The conductance-penalty γ-jet interpolates S_open ⇄ S_circle and its
    /// derivative matches a finite difference.
    #[test]
    fn conductance_penalty_interpolates() {
        let s_open = array![[2.0, -1.0], [-1.0, 2.0]];
        let s_wrap = array![[1.0, -1.0], [-1.0, 1.0]];
        let (s0, _, _) = conductance_penalty_jet(&s_open, &s_wrap, 0.0);
        let (s1, _, _) = conductance_penalty_jet(&s_open, &s_wrap, 1.0);
        assert!((&s0 - &s_open).iter().all(|v| v.abs() < 1e-14));
        let circle = &s_open + &s_wrap;
        assert!((&s1 - &circle).iter().all(|v| v.abs() < 1e-14));

        let g = 0.4;
        let (_, ds, _) = conductance_penalty_jet(&s_open, &s_wrap, g);
        let h = 1e-6;
        let (sp, _, _) = conductance_penalty_jet(&s_open, &s_wrap, g + h);
        let (sm, _, _) = conductance_penalty_jet(&s_open, &s_wrap, g - h);
        let fd = (&sp - &sm).mapv(|v| v / (2.0 * h));
        assert!((&ds - &fd).iter().all(|v| v.abs() < 1e-6));
    }

    /// A profile with a clean parabolic minimum at γ = 0.6 recovers γ̂ and a CI
    /// that excludes both boundaries.
    #[test]
    fn profile_ci_interior_minimum() {
        let v = |g: f64| 100.0 + 50.0 * (g - 0.6).powi(2);
        let grid: Vec<(f64, f64)> = (0..=100)
            .map(|k| k as f64 / 100.0)
            .map(|g| (g, v(g)))
            .collect();
        let ci = profile_ci_from_grid(&grid, 0.95).unwrap();
        assert!((ci.gamma_hat - 0.6).abs() < 0.02, "γ̂ {}", ci.gamma_hat);
        assert!(!ci.ci_includes_circle, "CI hi {}", ci.ci_hi);
        assert!(!ci.ci_includes_interval, "CI lo {}", ci.ci_lo);
        assert!(!ci.singular_boundary);
        // Wilks half-width ≈ sqrt(χ²/(2·50)).
        let want = (chi2_1_quantile(0.95) / (2.0 * 50.0)).sqrt();
        assert!(((ci.ci_hi - ci.ci_lo) / 2.0 - want).abs() < 0.02);
    }

    /// A profile minimised at γ = 1 yields a CI that includes the circle
    /// (closure not rejected) — boundary behaviour is one-sided and honest.
    #[test]
    fn profile_ci_includes_circle_at_boundary() {
        let v = |g: f64| 10.0 + 30.0 * (g - 1.05).powi(2); // min pushed past 1
        let grid: Vec<(f64, f64)> = (0..=100)
            .map(|k| k as f64 / 100.0)
            .map(|g| (g, v(g)))
            .collect();
        let ci = profile_ci_from_grid(&grid, 0.95).unwrap();
        assert!(ci.ci_includes_circle);
        assert!(!ci.singular_boundary);
    }

    /// A profile that keeps improving toward γ = 0 with the floor as the
    /// minimiser flags the singular boundary for the mixture-rung handoff.
    #[test]
    fn profile_flags_singular_boundary() {
        let v = |g: f64| 10.0 + 20.0 * g; // monotone increasing ⇒ min at 0, interior worse
        let grid: Vec<(f64, f64)> = (0..=100)
            .map(|k| k as f64 / 100.0)
            .map(|g| (g, v(g)))
            .collect();
        let ci = profile_ci_from_grid(&grid, 0.95).unwrap();
        assert!((ci.gamma_hat).abs() < 1e-9);
        assert!(ci.singular_boundary);
        assert!(ci.ci_includes_interval);
    }

    /// χ²₁ quantile sanity: the 95% point is ≈ 3.841.
    #[test]
    fn chi2_quantile_known_value() {
        assert!((chi2_1_quantile(0.95) - 3.841_458_820_694_124).abs() < 1e-6);
    }

    // --- Extended-precision (double-double) trig reference --------------------
    // A dependency-free ~32-digit `cos`/`sin` used as TRUTH to certify that the
    // stable recurrence is at least as accurate as the old per-harmonic libm
    // calls. Not a hot path: clarity over speed.

    #[derive(Clone, Copy)]
    struct Dd {
        hi: f64,
        lo: f64,
    }
    fn two_sum(a: f64, b: f64) -> (f64, f64) {
        let s = a + b;
        let bb = s - a;
        (s, (a - (s - bb)) + (b - bb))
    }
    fn two_prod(a: f64, b: f64) -> (f64, f64) {
        let p = a * b;
        (p, a.mul_add(b, -p))
    }
    fn quick_two_sum(a: f64, b: f64) -> (f64, f64) {
        let s = a + b;
        (s, b - (s - a))
    }
    impl Dd {
        fn new(hi: f64) -> Dd {
            Dd { hi, lo: 0.0 }
        }
        fn neg(self) -> Dd {
            Dd { hi: -self.hi, lo: -self.lo }
        }
        fn add(self, o: Dd) -> Dd {
            let (s, e) = two_sum(self.hi, o.hi);
            let (h, l) = quick_two_sum(s, e + self.lo + o.lo);
            Dd { hi: h, lo: l }
        }
        fn sub(self, o: Dd) -> Dd {
            self.add(o.neg())
        }
        fn mul(self, o: Dd) -> Dd {
            let (p, e) = two_prod(self.hi, o.hi);
            let (h, l) = quick_two_sum(p, e + (self.hi * o.lo + self.lo * o.hi));
            Dd { hi: h, lo: l }
        }
        fn mul_f(self, f: f64) -> Dd {
            let (p, e) = two_prod(self.hi, f);
            let (h, l) = quick_two_sum(p, e + self.lo * f);
            Dd { hi: h, lo: l }
        }
        fn to_f64(self) -> f64 {
            self.hi + self.lo
        }
    }
    const DD_PIO2: Dd = Dd {
        hi: 1.5707963267948966,
        lo: 6.123233995736766e-17,
    };
    const DD_TWO_OVER_PI: f64 = 0.6366197723675814;

    fn dd_sincos_small(r: Dd) -> (Dd, Dd) {
        let x2 = r.mul(r);
        let sin_coef: [f64; 8] = [
            1.0,
            -1.0 / 6.0,
            1.0 / 120.0,
            -1.0 / 5040.0,
            1.0 / 362880.0,
            -1.0 / 39916800.0,
            1.0 / 6227020800.0,
            -1.0 / 1307674368000.0,
        ];
        let cos_coef: [f64; 8] = [
            1.0,
            -1.0 / 2.0,
            1.0 / 24.0,
            -1.0 / 720.0,
            1.0 / 40320.0,
            -1.0 / 3628800.0,
            1.0 / 479001600.0,
            -1.0 / 87178291200.0,
        ];
        let mut sin = Dd::new(0.0);
        let mut cos = Dd::new(0.0);
        for k in (0..8).rev() {
            sin = sin.mul(x2).add(Dd::new(sin_coef[k]));
            cos = cos.mul(x2).add(Dd::new(cos_coef[k]));
        }
        (r.mul(sin), cos)
    }

    /// `(sin x, cos x)` in double-double for any real `x`.
    fn dd_sincos(x: Dd) -> (Dd, Dd) {
        let kf = (x.hi * DD_TWO_OVER_PI).round();
        let r = x.sub(DD_PIO2.mul_f(kf));
        let (s, c) = dd_sincos_small(r);
        match (kf as i64).rem_euclid(4) {
            0 => (s, c),
            1 => (c, s.neg()),
            2 => (s.neg(), c.neg()),
            _ => (c.neg(), s),
        }
    }

    /// Exact double-double argument `m·γ·s` (`m` a small integer).
    fn dd_arg(m: usize, gamma: f64, s: f64) -> Dd {
        let (p, e) = two_prod(gamma, s);
        Dd { hi: p, lo: e }.mul_f(m as f64)
    }

    /// The double-double reference itself matches libm to a few ULP at small
    /// and large arguments (a sanity check on the TRUTH used below).
    #[test]
    fn dd_reference_matches_libm_at_small_args() {
        for &t in &[0.3_f64, 1.7, 5.5, 12.25, 123.4] {
            let (s, c) = dd_sincos(Dd::new(t));
            assert!((s.to_f64() - t.sin()).abs() < 1e-14, "sin {t}");
            assert!((c.to_f64() - t.cos()).abs() < 1e-14, "cos {t}");
        }
    }

    /// THE NEW GATE (accuracy, not bits): across 2000 inputs × `H ∈
    /// {4,8,16,32,64,128}`, the stable trigonometric recurrence used by
    /// [`ClosureFamily::write_row_jet`] must be **at least as accurate** vs the
    /// double-double truth as the old per-harmonic libm `sin_cos`. This is the
    /// anti-reward-hack check: the naive 3-term Chebyshev recurrence FAILS here
    /// (3–8× worse at high `H`); the Singleton form passes (~0.7–0.9× = better).
    #[test]
    fn recurrence_is_at_least_as_accurate_as_per_harmonic_libm() {
        let mut seed: u64 = 0x1234_5678_9abc_def0;
        let mut rng = || {
            seed ^= seed << 13;
            seed ^= seed >> 7;
            seed ^= seed << 17;
            (seed >> 11) as f64 / (1u64 << 53) as f64
        };
        for &h in &[4usize, 8, 16, 32, 64, 128] {
            let fam = ClosureFamily::new(h, std::f64::consts::TAU);
            let mut max_old = 0.0f64;
            let mut max_new = 0.0f64;
            for _ in 0..2000 {
                let s = (rng() * 2.0 - 1.0) * std::f64::consts::TAU;
                let gamma = rng();
                let (val, dg, dgg) = fam.row_jet(s, gamma);
                for m in 1..=h {
                    let (ts, tc) = dd_sincos(dd_arg(m, gamma, s));
                    let (tcf, tsf) = (tc.to_f64(), ts.to_f64());
                    let ms = m as f64 * s;
                    let (cs_new, sn_new) = (val[2 * m - 1], val[2 * m]);
                    // OLD: per-harmonic libm on the large argument m·γ·s.
                    let (osn, ocs) = (gamma * ms).sin_cos();
                    // Accuracy gate on the transcendental VALUE channels (cos/sin
                    // are O(1), so absolute ≈ relative). The γ-jet channels are
                    // exact ms/ms² scalings of these — asserted separately below —
                    // so both methods amplify the value error identically and the
                    // value channel is the genuine accuracy comparison.
                    max_old = max_old.max((ocs - tcf).abs().max((osn - tsf).abs()));
                    max_new = max_new.max((cs_new - tcf).abs().max((sn_new - tsf).abs()));
                    // The emitted jet channels must be the EXACT (bit-for-bit)
                    // affine-γ scalings of the emitted value — no extra
                    // transcendental, so they inherit the value accuracy.
                    assert_eq!(dg[2 * m - 1], -sn_new * ms);
                    assert_eq!(dg[2 * m], cs_new * ms);
                    assert_eq!(dgg[2 * m - 1], (-cs_new * ms) * ms);
                    assert_eq!(dgg[2 * m], (-sn_new * ms) * ms);
                }
            }
            // At least as accurate as the old libm form (small platform-libm
            // slack), and inside a tight absolute envelope.
            assert!(
                max_new <= 1.1 * max_old,
                "H={h}: recurrence abs-err {max_new:.3e} worse than per-harmonic libm {max_old:.3e}"
            );
            assert!(max_new < 1e-12, "H={h}: recurrence abs-err {max_new:.3e} exceeds 1e-12");
        }
    }

    /// The four-rows-per-pass `f64x4` assembly in `design`/`design_jet` must be
    /// **bit-identical** to the scalar single-row path it replaces — each SIMD
    /// lane is plain IEEE `f64`, so there is no accuracy change, only throughput.
    /// Covers non-multiple-of-4 row counts (the scalar remainder) and `H = 0`.
    #[test]
    fn simd_design_is_bit_identical_to_scalar_rows() {
        for &h in &[0usize, 1, 3, 7, 16] {
            let fam = ClosureFamily::new(h, std::f64::consts::TAU);
            // n deliberately not a multiple of 4 to exercise the remainder.
            let n = 11;
            let s: Vec<f64> = (0..n).map(|k| (k as f64) * 0.37 - 1.9).collect();
            let sv = ndarray::ArrayView1::from(&s);
            let gamma = 0.61;
            let phi = fam.design(sv, gamma);
            let (pj, dj, ddj) = fam.design_jet(sv, gamma);
            for (i, &si) in s.iter().enumerate() {
                let (v, dgr, ddr) = fam.row_jet(si, gamma);
                for j in 0..fam.raw_dim() {
                    assert_eq!(phi[[i, j]].to_bits(), v[j].to_bits(), "design v ({i},{j})");
                    assert_eq!(pj[[i, j]].to_bits(), v[j].to_bits(), "design_jet v ({i},{j})");
                    assert_eq!(dj[[i, j]].to_bits(), dgr[j].to_bits(), "design_jet dγ ({i},{j})");
                    assert_eq!(ddj[[i, j]].to_bits(), ddr[j].to_bits(), "design_jet d²γ ({i},{j})");
                }
            }
        }
    }
}