integral-core 0.1.1

//! Two-electron repulsion integrals (ERIs) by the Obara–Saika / Head-Gordon–Pople
//! (OS/HGP) recurrence — the second ERI engine (see `ARCHITECTURE.md`, L1).
//!
//! A **vertical recurrence (VRR)** builds the
//! intermediate `[e0|f0]^(m)` classes from `[00|00]^(m)` per primitive quartet,
//! the primitives are **contracted** into AO space, and a **horizontal recurrence
//! (HRR)** then shifts angular momentum `A→B` and `C→D` *in the contracted space*.
//! Doing the HRR after contraction is the HGP "early contraction" trick: the
//! geometry-only HRR runs **once per shell quartet** instead of once per
//! primitive quartet, which is the win at high contraction degree.
//!
//! It is the engineering counterpart of the [`crate::rys`] engine: same Coulomb
//! kernel, same row-major `(a,b,c,d)` block layout, validated element-for-element
//! against it (and against an independent McMurchie–Davidson path).
//!
//! ## Method
//!
//! For a primitive quartet with exponents `α,β` on the bra centres `A,B` and
//! `γ,δ` on the ket centres `C,D`, with `p=α+β`, `q=γ+δ`, `P,Q` the Gaussian
//! product centres, `W=(pP+qQ)/(p+q)`, `ρ=pq/(p+q)`, and `T=ρ|P−Q|²`:
//!
//! ```text
//!   [00|00]^(m) = 2π^{5/2}/(p q √(p+q)) · K_ab · K_cd · F_m(T)
//! ```
//!
//! The VRR raises angular momentum on `A` (the bra, index `e`) and `C` (the ket,
//! index `f`) using the standard OS ERI relations (Obara–Saika 1986; HGP 1988):
//!
//! ```text
//!   [e+1_i,0|f0]^(m) = (P−A)_i[e0|f0]^(m) + (W−P)_i[e0|f0]^(m+1)
//!       + e_i/2p ( [e−1_i,0|f0]^(m) − (q/(p+q))[e−1_i,0|f0]^(m+1) )
//!       + f_i/2(p+q) [e0|f−1_i,0]^(m+1)
//!   [e0|f+1_i,0]^(m) = (Q−C)_i[e0|f0]^(m) + (W−Q)_i[e0|f0]^(m+1)
//!       + f_i/2q ( [e0|f−1_i,0]^(m) − (p/(p+q))[e0|f−1_i,0]^(m+1) )
//!       + e_i/2(p+q) [e−1_i,0|f0]^(m+1)
//! ```
//!
//! After contracting `[e0|f0]^(0)` over the primitive quartet, the HRR builds the
//! Cartesian shell block:
//!
//! ```text
//!   (a,b+1_i | f0) = (a+1_i,b | f0) + (A−B)_i (a,b | f0)   [bra, A→B]
//!   (ab | c,d+1_i) = (ab | c+1_i,d) + (C−D)_i (ab | c,d)   [ket, C→D]
//! ```
//!
//! ## Buffers (see `DESIGN_NOTES.md` D3, D12)
//!
//! The VRR table `[e0|f0]^(m)` is **3D and `W`-coupled** (the recurrence mixes
//! Cartesian axes), so unlike the axis-separable 1D tables of the one-electron and
//! Rys engines it cannot be a small fixed stack array — a `MAX_L` stack buffer
//! would be tens of MB, and even a heap copy of the *full* table is ~41 MB at
//! `(ii|ii)`. The current engine (D12) removes that:
//!
//! - **m-marching VRR** (`vrr_primitive`): the VRR never materialises the full
//!   `e×f×m` table. It marches over the ket `f`-degree keeping only a rolling window
//!   of **3 consecutive `f`-degree levels**, each **triangle-packed** in the Boys
//!   index `m`. Resident VRR footprint is `3·max_k[n_cart(k)·slab_k]` (≈ 4.3 MB at
//!   `(ii|ii)`, vs the old 41 MB single table), plus the `n_e·n_f` contracted table.
//! - **flat-array HRR** (`hrr_and_scatter`): contiguous arrays indexed by `addr`
//!   / [`cart_index`] with two rolling degree layers, replacing the former HashMap
//!   memoisation.
//! - **reusable arena** ([`EriScratch`]): all buffers are allocated once and reused
//!   across quartets (thread-local by default), not re-allocated per quartet.
//!
//! All of this stays in safe Rust (`#![forbid(unsafe_code)]`) and reproduces the
//! former full-table engine's values, cross-checked against the Rys engine and an
//! independent McMurchie–Davidson path (`tests/eri_cross_algorithm.rs`).

use std::cell::RefCell;

use integral_math::am::{cart_components, cart_index, n_cart};
use integral_math::boys::boys_array;

use crate::os::{Vec3, MAX_L};

/// A contracted Cartesian shell as seen by the HGP engine: its centre, angular
/// momentum, and primitive `(exponent, effective-coefficient)` data. The
/// coefficients are the driver's *effective* coefficients `d_i · N(α_i, l)` — the
/// engine itself works on un-normalised monomials and only multiplies the four
/// coefficients into the contracted accumulator.
///
/// `exps` and `coeffs` are consumed pairwise (`build_pairs` zips them), so they
/// are expected to have the same length; a longer slice's surplus entries are
/// silently ignored by the zip rather than reported.
#[derive(Debug, Clone, Copy)]
pub struct ShellRef<'a> {
    /// Shell centre (bohr).
    pub center: Vec3,
    /// Angular momentum.
    pub l: usize,
    /// Primitive exponents.
    pub exps: &'a [f64],
    /// Effective contraction coefficients, aligned with `exps`.
    pub coeffs: &'a [f64],
}

/// Size of the cumulative Cartesian address space: how many triples
/// `[lx, ly, lz]` have total degree in `0..=lmax`. This is the tetrahedral
/// number `(L+1)(L+2)(L+3)/6`.
#[inline]
fn n_addr(lmax: usize) -> usize {
    let (first, second, third) = (lmax + 1, lmax + 2, lmax + 3);
    first * second * third / 6
}

/// Count of Cartesian triples whose degree is **strictly below** `d`, i.e.
/// `d(d+1)(d+2)/6`. It is the starting offset of degree `d` in the cumulative
/// ordering (`addr(t) = tri_below(deg) + cart_index(t)`), and
/// `n_addr(L) = tri_below(L+1)`.
#[inline]
fn tri_below(d: usize) -> usize {
    let [lo, mid, hi] = [d, d + 1, d + 2];
    lo * mid * hi / 6
}

/// Flat address of the Cartesian triple `t = [lx, ly, lz]` in the cumulative
/// ordering over all degrees, consistent with [`integral_math::am`]'s per-degree
/// canonical order: every triple of lower degree comes first, followed by the
/// triples of the same degree. Restricted to triples of degree `≤ L`, the map is
/// a bijection onto `0..n_addr(L)`.
#[inline]
fn addr(t: [usize; 3]) -> usize {
    let [lx, ly, lz] = t;
    let deg = lx + ly + lz;
    // Everything of strictly lower degree precedes this shell: deg(deg+1)(deg+2)/6.
    let below = deg * (deg + 1) * (deg + 2) / 6;
    // Position inside the degree shell: (deg−lx)(deg−lx+1)/2 + lz.
    let rest = deg - lx;
    let within = rest * (rest + 1) / 2 + lz;
    below + within
}

/// Reusable scratch arena for the OS/HGP ERI engine.
///
/// Holds the engine's working buffers so they are allocated **once and reused**
/// across shell quartets, instead of a fresh heap allocation per quartet inside the
/// O(n⁴) shell loop. All buffers are plain `Vec<f64>` grown on demand in safe Rust
/// (`#![forbid(unsafe_code)]` holds); there is no shared mutable state, so each
/// thread uses its own instance — the thread-local default behind
/// [`coulomb_shell_into`], or one passed explicitly to
/// [`coulomb_shell_into_scratch`] (the `&mut` makes cross-thread sharing a compile
/// error).
///
/// **Memory-correctness.** `c_ef` is an accumulator and is **zeroed per quartet**.
/// The VRR `levels` and HRR `bra`/layer buffers are fully overwritten in the region
/// they are later read, so they need no functional zeroing — but in debug builds
/// `levels` and `bra` are NaN-filled per quartet, so any accidental out-of-range
/// read (e.g. outside the VRR `m`-triangle) poisons the output and trips the tests
/// rather than silently reading a stale value from a previous quartet. Results are
/// therefore independent of evaluation order and of arena reuse (asserted by
/// `tests/arena.rs`).
///
/// Note that the `levels` NaN fill sits on the general m-marching VRR path of
/// [`coulomb_shell_into_scratch`]; the monomorphized small-class kernels keep
/// their working tables on the stack and do not touch `levels`.
///
/// The size accessors (`resident_f64`, `largest_buffer_f64`) count only the
/// seven `Vec<f64>` buffers; the pair lists, `tri_all`, `eoff` and `slab` are
/// not `f64` storage and are excluded.
#[derive(Debug, Default, Clone)]
pub struct EriScratch {
    /// VRR rolling window: 3 triangle-packed f-degree levels (slot = `k % 3`).
    levels: Vec<f64>,
    /// Contracted `[e0|f0]^(0)` accumulator, shape `[n_e·n_f]`.
    c_ef: Vec<f64>,
    /// HRR bra intermediate `(a b | f 0)`, shape `[na·nb·nf_range]`.
    bra: Vec<f64>,
    /// HRR bra rolling `b`-degree layers.
    bra_prev: Vec<f64>,
    /// Second of the two HRR bra rolling layers (paired with `bra_prev`).
    bra_cur: Vec<f64>,
    /// HRR ket rolling `d`-degree layers.
    ket_prev: Vec<f64>,
    /// Second of the two HRR ket rolling layers (paired with `ket_prev`).
    ket_cur: Vec<f64>,
    /// Precomputed bra/ket primitive-pair data (combined exponent, product
    /// centre, `K` prefactor, coeff product) — built once per shell quartet so
    /// the per-pair `exp`/centre work is not repeated inside the deep primitive
    /// loop. Reused across quartets like the other buffers.
    bra_pairs: Vec<PrimPair>,
    /// Ket-side counterpart of `bra_pairs`, rebuilt per quartet by `build_pairs`.
    ket_pairs: Vec<PrimPair>,
    /// Shared Cartesian-triple table `tri_all[d] = cart_components(d)` for every
    /// degree the engine needs (`0..=2·MAX_L`). Built once and sliced for the VRR
    /// `e`/`f` lists and the HRR triples, instead of re-`collect`ing per quartet.
    tri_all: Vec<Vec<[usize; 3]>>,
    /// Flat VRR offset table: `eoff[k·n_e + ae]` is `e`'s packed offset in level
    /// `k`. Rebuilt per quartet into this reused buffer (no per-quartet `Vec<Vec>`
    /// allocation); `slab[k]` is one `f`-component's packed size for level `k`.
    eoff: Vec<usize>,
    /// Per-level packed size: `slab[k]` is the length of one `f`-component's
    /// block in level `k` (the running total of the `eoff` offsets for that `k`).
    slab: Vec<usize>,
}

/// One primitive pair's reusable geometry/prefactor, shared by every primitive
/// quartet that uses it: the combined exponent `zeta = α+β`, the Gaussian product
/// centre, the overlap prefactor `K = exp(−αβ/ζ·|A−B|²)`, and the two contraction
/// coefficients (`c1` outer, `c2` inner).
///
/// The coefficients are kept **un-multiplied** so the contracted `scale` can be
/// formed in exactly the former evaluation order `((c_a·c_b)·c_c)·c_d` — making the
/// engine's output bit-identical to the pre-precomputation code (the per-element
/// and 8-fold-symmetry tests are floorless-relative and sensitive to the last ULP
/// on near-cancellation elements).
#[derive(Debug, Clone, Copy, Default)]
struct PrimPair {
    /// Combined exponent `ζ = e1 + e2` of the two primitives.
    zeta: f64,
    /// Gaussian product centre of the pair, as returned by `combine`.
    center: Vec3,
    /// Overlap prefactor `exp(−(e1·e2/ζ)·|s1 − s2|²)`.
    kappa: f64,
    /// Contraction coefficient of the outer-shell (`s1`) primitive.
    c1: f64,
    /// Contraction coefficient of the inner-shell (`s2`) primitive.
    c2: f64,
    /// `1/(2ζ)` — the VRR `inv_2p`(bra)/`inv_2q`(ket) coefficient. Pure per-pair.
    inv_2zeta: f64,
    /// `centre − s1.centre` — the VRR `P−A`(bra)/`Q−C`(ket) shift. Pure per-pair.
    r1: Vec3,
}

/// Populate `out` with one [`PrimPair`] per primitive combination of the shells
/// `s1` (outer loop) and `s2` (inner loop), discarding whatever `out` held before.
///
/// Entries are pushed outer-major, so the pair for `(i, j)` lands at index
/// `i·n2 + j`. Each entry stores the combined exponent, the product centre, the
/// `K` prefactor, the two coefficients (left un-multiplied), and the pair-only
/// VRR coefficients `1/2ζ` and `centre − s1.centre` (the `P−A` / `Q−C` shift).
/// The squared inter-centre distance does not depend on the primitives and is
/// evaluated once, ahead of the loops.
fn build_pairs(out: &mut Vec<PrimPair>, s1: ShellRef<'_>, s2: ShellRef<'_>) {
    out.clear();
    out.reserve(s1.exps.len() * s2.exps.len());
    let sep2 = dist2(s1.center, s2.center);
    for (&alpha, &coef_outer) in s1.exps.iter().zip(s1.coeffs) {
        for (&beta, &coef_inner) in s2.exps.iter().zip(s2.coeffs) {
            let zeta = alpha + beta;
            let center = combine(alpha, s1.center, beta, s2.center, zeta);
            let kappa = (-(alpha * beta / zeta) * sep2).exp();
            let inv_2zeta = 0.5 / zeta;
            let r1 = sub(center, s1.center);
            out.push(PrimPair {
                zeta,
                center,
                kappa,
                c1: coef_outer,
                c2: coef_inner,
                inv_2zeta,
                r1,
            });
        }
    }
}

impl EriScratch {
    /// A fresh, empty arena; it grows to fit on first use.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Total `f64` elements currently held across all buffers — the resident
    /// working set, for memory reporting/tests.
    #[must_use]
    pub fn resident_f64(&self) -> usize {
        self.levels.len()
            + self.c_ef.len()
            + self.bra.len()
            + self.bra_prev.len()
            + self.bra_cur.len()
            + self.ket_prev.len()
            + self.ket_cur.len()
    }

    /// Largest single buffer (`f64` elements). Demonstrates the former ~41 MB
    /// monolithic VRR `[e0|f0]^(m)` table is no longer resident: the m-marching
    /// window keeps only `3·max_k[n_cart(k)·slab_k]`.
    #[must_use]
    pub fn largest_buffer_f64(&self) -> usize {
        [
            self.levels.len(),
            self.c_ef.len(),
            self.bra.len(),
            self.bra_prev.len(),
            self.bra_cur.len(),
            self.ket_prev.len(),
            self.ket_cur.len(),
        ]
        .into_iter()
        .max()
        .unwrap_or(0)
    }
}

thread_local! {
    /// Per-thread default arena backing [`coulomb_shell_into`].
    ///
    /// Held in a `RefCell` because `coulomb_shell_into` takes a mutable borrow of
    /// the arena for the duration of one quartet evaluation; a nested call on the
    /// same thread while that borrow is live would panic in `borrow_mut`.
    static ERI_SCRATCH: RefCell<EriScratch> = RefCell::new(EriScratch::new());
}

/// Make sure `v` holds at least `n` elements, zero-filling any newly added tail
/// (existing capacity is reused). A buffer that is already long enough is left
/// untouched — this never shrinks.
#[inline]
fn ensure_len(v: &mut Vec<f64>, n: usize) {
    if v.len() >= n {
        return;
    }
    v.resize(n, 0.0);
}

/// Counterpart of `ensure_len` for the `usize` offset/slab buffers: extend `v`
/// with zeros up to `n` elements if it is shorter, otherwise leave it alone.
#[inline]
fn ensure_usize(v: &mut Vec<usize>, n: usize) {
    if v.len() >= n {
        return;
    }
    v.resize(n, 0);
}

/// Add the contracted Coulomb block `(ab|cd)` of four shells into `out`, a
/// row-major block of shape `[n_cart(la)·n_cart(lb)·n_cart(lc)·n_cart(ld)]` in
/// the same `(a,b,c,d)` layout as [`crate::rys::coulomb_into`].
///
/// The engine owns the primitive-quartet loop (the HGP contraction sits between
/// VRR and HRR), so — unlike the Rys engine — the driver calls this **once per
/// shell quartet**, passing all primitives. Working storage comes from a
/// **thread-local** [`EriScratch`]; call [`coulomb_shell_into_scratch`] instead
/// when you want to manage the arena yourself (e.g. one per worker thread).
///
/// # Panics
/// Panics (in debug builds) if any `l > MAX_L` or `out` is too short.
pub fn coulomb_shell_into(
    a: ShellRef<'_>,
    b: ShellRef<'_>,
    c: ShellRef<'_>,
    d: ShellRef<'_>,
    out: &mut [f64],
) {
    ERI_SCRATCH.with(|cell| {
        let mut arena = cell.borrow_mut();
        coulomb_shell_into_scratch(&mut arena, a, b, c, d, out);
    });
}

/// Like [`coulomb_shell_into`] but evaluates into the caller-provided
/// [`EriScratch`], reused across quartets to avoid per-quartet heap allocation. Use
/// **one instance per thread** (sharing a `&mut EriScratch` across threads is a
/// compile error); the result is bit-identical regardless of which arena is used or
/// what it last held.
///
/// # Arguments
/// - `scratch`: working arena; its buffers are grown (never shrunk) as needed.
/// - `a`, `b`: the bra shells; `c`, `d`: the ket shells.
/// - `out`: destination block. The `(ss|ss)` fast path adds its single value
///   into `out[0]`; every other class hands `out` to `hrr_and_scatter`.
///
/// # Structure
/// 1. `(ss|ss)` fast path: one contracted `[00|00]^(0)`, then return.
/// 2. Zero the contracted `c_ef` table and build the primitive-pair lists.
/// 3. VRR + contraction into `c_ef`: monomorphized kernels for `ne, nf ≤ 2`,
///    the general m-marching `vrr_primitive` otherwise.
/// 4. HRR in contracted space and scatter into `out`.
///
/// # Panics
/// Panics (in debug builds) if any `l > MAX_L` or `out` is too short.
pub fn coulomb_shell_into_scratch(
    scratch: &mut EriScratch,
    a: ShellRef<'_>,
    b: ShellRef<'_>,
    c: ShellRef<'_>,
    d: ShellRef<'_>,
    out: &mut [f64],
) {
    let (la, lb, lc, ld) = (a.l, b.l, c.l, d.l);
    debug_assert!(
        la <= MAX_L && lb <= MAX_L && lc <= MAX_L && ld <= MAX_L,
        "angular momentum exceeds MAX_L"
    );
    let (na, nb, nc, nd) = (n_cart(la), n_cart(lb), n_cart(lc), n_cart(ld));
    debug_assert!(out.len() >= na * nb * nc * nd, "ERI output block too short");

    let ne = la + lb; // max bra (A-side) degree built by VRR
    let nf = lc + ld; // max ket (C-side) degree built by VRR
    let l_total = ne + nf;
    let n_e = n_addr(ne);
    let n_f = n_addr(nf);

    // Split the arena into its individual buffers so they can be borrowed
    // independently (e.g. `levels` mutably while `eoff`/`slab` are read).
    let EriScratch {
        levels,
        c_ef,
        bra,
        bra_prev,
        bra_cur,
        ket_prev,
        ket_cur,
        bra_pairs,
        ket_pairs,
        tri_all,
        eoff,
        slab,
    } = scratch;

    // Fast path: (ss|ss). With no angular momentum to raise, the whole quartet is a
    // single contracted [00|00]^(0). Skip the VRR/HRR machinery (the dead raise
    // coefficients, the `levels` buffer round-trip, the per-primitive call) and sum
    // straight into a register. Bit-identical to the general path: same `pref`/`t`
    // expressions, same `((c_a·c_b)·c_c)·c_d` scale, same `scale·(pref·F_0)` term,
    // same bra-outer/ket-inner order — just without the scaffolding.
    if l_total == 0 {
        build_pairs(bra_pairs, a, b);
        build_pairs(ket_pairs, c, d);
        // 2·π^{5/2}, written as 2·π·π·√π.
        let two_pi_2_5 =
            2.0 * std::f64::consts::PI * std::f64::consts::PI * std::f64::consts::PI.sqrt();
        let mut acc = 0.0;
        let mut f0 = [0.0f64; 1];
        for bra in bra_pairs.iter() {
            let bc = bra.c1 * bra.c2;
            for ket in ket_pairs.iter() {
                let (p, q) = (bra.zeta, ket.zeta);
                let pq = p + q;
                let t = (p * q / pq) * dist2(bra.center, ket.center);
                let pref = two_pi_2_5 / (p * q * pq.sqrt()) * bra.kappa * ket.kappa;
                boys_array(0, t, &mut f0);
                acc += (bc * ket.c1) * ket.c2 * (pref * f0[0]);
            }
        }
        out[0] += acc;
        return;
    }

    // Contracted [e0|f0]^(0): zeroed per quartet (it is a `+=` accumulator).
    ensure_len(c_ef, n_e * n_f);
    c_ef[..n_e * n_f].fill(0.0);

    // Shared Cartesian-triple table, built once (degrees `0..=2·MAX_L` cover every
    // `ne`/`nf`/`max(ne,nf)` the engine reaches) and sliced below — no per-quartet
    // `cart_components` allocation.
    if tri_all.len() <= 2 * MAX_L {
        *tri_all = (0..=2 * MAX_L).map(cart_components).collect();
    }

    // Precompute the bra/ket primitive-pair data once (the `exp` prefactor and
    // product centre depend only on the pair, not the quartet), then loop over the
    // pair lists. The previous code recomputed the ket pair's `exp`/centre on every
    // bra primitive — `na·nb` times too many. Bra-outer / ket-inner preserves the
    // exact accumulation order into `c_ef`.
    // NOTE(review): `na·nb` in the sentence above presumably counts bra
    // primitives, not the Cartesian `na`/`nb` locals of this function — confirm.
    build_pairs(bra_pairs, a, b);
    build_pairs(ket_pairs, c, d);

    // VRR. The small classes (`ne, nf ≤ 2` — l_total ≤ 4, which carries ~87% of a
    // cc-pVDZ-style build) dispatch to the monomorphized kernels, where the whole
    // structural walk (triples, addresses, branches, m-ranges) is resolved at
    // compile time; everything else runs the general m-marching `vrr_primitive`.
    // `(0,0)` already returned through the (ss|ss) fast path above.
    match (ne, nf) {
        (0, 1) => contract_class::<0, 1>(bra_pairs, ket_pairs, c_ef),
        (1, 0) => contract_class::<1, 0>(bra_pairs, ket_pairs, c_ef),
        (1, 1) => contract_class::<1, 1>(bra_pairs, ket_pairs, c_ef),
        (0, 2) => contract_class::<0, 2>(bra_pairs, ket_pairs, c_ef),
        (2, 0) => contract_class::<2, 0>(bra_pairs, ket_pairs, c_ef),
        (1, 2) => contract_class::<1, 2>(bra_pairs, ket_pairs, c_ef),
        (2, 1) => contract_class::<2, 1>(bra_pairs, ket_pairs, c_ef),
        (2, 2) => contract_class::<2, 2>(bra_pairs, ket_pairs, c_ef),
        _ => {
            // m-marching VRR scratch. Instead of the full `n_e·n_f·nm`
            // table (~41 MB at `(ii|ii)`), keep only a rolling window of **3 consecutive
            // f-degree levels**, each **triangle-packed** in the Boys index `m`: for an
            // `f`-degree `k`, an `e` of degree `de` stores only `m ∈ 0..=(l_total−de−k)`.
            // Level `k` lives in slot `k % 3` of `levels`; `slab[k]` is one `f`-component's
            // packed size and `eoff[k·n_e + ae]` is `e`'s offset within it (flat, reused).
            ensure_usize(slab, nf + 1);
            ensure_usize(eoff, (nf + 1) * n_e);
            // `k` indexes both `slab` and the flat `eoff` block base `k·n_e`, so a range
            // loop is the clear form here.
            #[allow(clippy::needless_range_loop)]
            for k in 0..=nf {
                let base = k * n_e;
                // `run` is the running packed offset; `ae` walks the e-components in
                // cumulative address order (degree by degree).
                let mut run = 0usize;
                let mut ae = 0usize;
                for de in 0..=ne {
                    let mlen = l_total - de - k + 1; // ≥ 1: de + k ≤ ne + nf = l_total
                    for _ in 0..n_cart(de) {
                        eoff[base + ae] = run;
                        ae += 1;
                        run += mlen;
                    }
                }
                slab[k] = run;
            }
            // One slot must hold the largest level: n_cart(k) f-components of slab[k] each.
            let maxlevel = (0..=nf).map(|k| n_cart(k) * slab[k]).max().unwrap_or(1);
            ensure_len(levels, 3 * maxlevel);
            // Debug guard: poison the reused VRR window so any out-of-triangle / stale read
            // surfaces as a NaN in the output (caught by the golden + cross-engine tests),
            // instead of silently using a previous quartet's value.
            #[cfg(debug_assertions)]
            levels[..3 * maxlevel].fill(f64::NAN);

            // Reborrow the offset/triple buffers as shared slices for the read-only kernel.
            let (eoff_s, slab_s, tri_s) = (&eoff[..], &slab[..], &tri_all[..]);
            for bra in bra_pairs.iter() {
                let bra_coef = bra.c1 * bra.c2; // = c_a·c_b, hoisted out of the ket loop
                for ket in ket_pairs.iter() {
                    // ((c_a·c_b)·c_c)·c_d — the former left-to-right product, bit for bit.
                    let scale = (bra_coef * ket.c1) * ket.c2;
                    vrr_primitive(
                        bra.zeta,
                        ket.zeta,
                        bra.center,
                        ket.center,
                        bra.kappa,
                        ket.kappa,
                        bra.r1,
                        ket.r1,
                        bra.inv_2zeta,
                        ket.inv_2zeta,
                        ne,
                        nf,
                        l_total,
                        n_e,
                        n_f,
                        maxlevel,
                        eoff_s,
                        slab_s,
                        tri_s,
                        levels,
                        scale,
                        c_ef,
                    );
                }
            }
        }
    }

    // HRR in contracted space, then scatter the block.
    let ab = sub(a.center, b.center); // A − B
    let cd = sub(c.center, d.center); // C − D
    hrr_and_scatter(
        la,
        lb,
        lc,
        ld,
        n_f,
        c_ef,
        ab,
        cd,
        out,
        bra,
        bra_prev,
        bra_cur,
        ket_prev,
        ket_cur,
        &tri_all[..],
    );
}

/// Cartesian triples of degree 1 and 2 in [`cart_components`] order, as `const`
/// tables so the monomorphized small-class VRR kernels fully unroll over them
/// (the structural walk — axes, addresses, `has2`/cross branches — then resolves
/// at compile time instead of being re-derived on every primitive quartet).
///
/// `TRI1` holds the three degree-1 triples (unit steps along x, y, z).
const TRI1: [[usize; 3]; 3] = [[1, 0, 0], [0, 1, 0], [0, 0, 1]];
/// The six degree-2 triples, in the same order as `E_COMP[4..10]` below (so
/// entry `iw` here has cumulative address `4 + iw`).
const TRI2: [[usize; 3]; 6] = [
    [2, 0, 0],
    [1, 1, 0],
    [1, 0, 1],
    [0, 2, 0],
    [0, 1, 1],
    [0, 0, 2],
];
/// The e-components `0..n_addr(2)` in [`addr`] order with their degree —
/// the bra index set of the small-class kernels (`NE ≤ 2`).
///
/// Each entry is `(triple, degree)`, and its position in the array equals
/// `addr(triple)`: index 0 is the degree-0 triple, 1..=3 are degree 1, and
/// 4..=9 are degree 2.
const E_COMP: [([usize; 3], usize); 10] = [
    ([0, 0, 0], 0),
    ([1, 0, 0], 1),
    ([0, 1, 0], 1),
    ([0, 0, 1], 1),
    ([2, 0, 0], 2),
    ([1, 1, 0], 2),
    ([1, 0, 1], 2),
    ([0, 2, 0], 2),
    ([0, 1, 1], 2),
    ([0, 0, 2], 2),
];

/// Run the monomorphized small-class kernel [`vrr_small`] over every primitive
/// quartet of a shell quartet with VRR shape `(NE, NF)` (`NE, NF ≤ 2`),
/// accumulating into `c_ef`.
///
/// The primitive loop is bra-outer / ket-inner and the scale is formed
/// left-to-right as `((c_a·c_b)·c_c)·c_d`, exactly as on the general path, so
/// the accumulation into `c_ef` is bit-identical. The three VRR working tables
/// are declared here, outside the loops: each primitive quartet overwrites the
/// region it later reads (like the general path's `levels` arena), so they are
/// zero-initialised once per shell quartet rather than once per primitive.
fn contract_class<const NE: usize, const NF: usize>(
    bra_pairs: &[PrimPair],
    ket_pairs: &[PrimPair],
    c_ef: &mut [f64],
) {
    // Sized for the worst case (NE, NF) = (2, 2):
    //   e-table          10 components      × 5 m-values (m ∈ 0..=4) =  50
    //   f-degree-1 level 10 × 3 components  × 4 m-values (m ∈ 0..=3) = 120
    //   f-degree-2 level 10 × 6 components  × 3 m-values (m ∈ 0..=2) = 180
    let mut tab_e = [0.0f64; 50];
    let mut tab_f1 = [0.0f64; 120];
    let mut tab_f2 = [0.0f64; 180];
    for bra_pair in bra_pairs.iter() {
        let ab_coef = bra_pair.c1 * bra_pair.c2;
        for ket_pair in ket_pairs.iter() {
            let abc_coef = ab_coef * ket_pair.c1;
            let scale = abc_coef * ket_pair.c2;
            vrr_small::<NE, NF>(
                bra_pair,
                ket_pair,
                scale,
                &mut tab_e,
                &mut tab_f1,
                &mut tab_f2,
                c_ef,
            );
        }
    }
}

/// One primitive quartet of the OS VRR for the small classes `NE, NF ≤ 2`,
/// monomorphized per `(NE, NF)` — the per-L-class "generated kernel" idea of
/// libcint/libint expressed through const generics: every loop bound, Cartesian
/// triple, address, and `has2`/cross branch below is a compile-time constant in
/// each instantiation, so the emitted code is a straight unrolled FMA chain.
///
/// **Bit-identical to [`vrr_primitive`] by construction**: the same recurrence
/// expressions with the same term order and association — base
/// `pref·F_m`, raise `pa_i·s1[m] + wp_i·s1[m+1]`, then `+= coef2·(s2[m] −
/// (q/pq)·s2[m+1])`, then `+= cross·s3[m+1]` — the same m-ranges
/// (`m ≤ l_total − de − k`), and the same `c_ef[ae·n_f + af] += scale·v`
/// extraction. Only the storage differs (fixed per-degree tables instead of the
/// triangle-packed rolling window), which does not touch the arithmetic.
/// Guarded by the full-tensor XOR fingerprint and the golden/cross-engine tests.
///
/// # Working tables
/// - `ea[ae·5 + m]` — `[e0|00]^(m)`; `ae` is the e-component's cumulative
///   address (its index in `E_COMP`), with 5 `m` slots per component.
/// - `eb1[(ae·3 + fw)·4 + m]` — the f-degree-1 level; `fw` indexes `TRI1`,
///   with 4 `m` slots per entry.
/// - `eb2[(ae·6 + fw)·3 + m]` — the f-degree-2 level; `fw` indexes `TRI2`,
///   with 3 `m` slots per entry.
///
/// Only the entries inside the current `(NE, NF)` m-triangle are written and
/// read; the rest of each table keeps whatever it held before.
///
/// # Output
/// `c_ef[ae·n_f + af]` (with `n_f = n_addr(NF)`) receives `scale ×` the `m = 0`
/// entry of every class built: `af = 0` for f-degree 0, `1 + fw` for degree 1
/// and `4 + fw` for degree 2.
#[inline(always)]
#[allow(clippy::too_many_arguments)]
fn vrr_small<const NE: usize, const NF: usize>(
    bra: &PrimPair,
    ket: &PrimPair,
    scale: f64,
    ea: &mut [f64; 50],
    eb1: &mut [f64; 120],
    eb2: &mut [f64; 180],
    c_ef: &mut [f64],
) {
    let lt = NE + NF;
    let n_e = n_addr(NE);
    let n_f = n_addr(NF);
    let (p, q) = (bra.zeta, ket.zeta);
    let (pc, qc) = (bra.center, ket.center);
    let pa = bra.r1; // P − A
    let qcen = ket.r1; // Q − C
    let (inv_2p, inv_2q) = (bra.inv_2zeta, ket.inv_2zeta);

    // Scalar header — identical expressions to `vrr_primitive`.
    let pq = p + q;
    let rho = p * q / pq;
    let pq_vec = sub(pc, qc); // P − Q
    let t = rho * norm2(pq_vec);
    let w = [
        (p * pc[0] + q * qc[0]) / pq,
        (p * pc[1] + q * qc[1]) / pq,
        (p * pc[2] + q * qc[2]) / pq,
    ];
    let wp = sub(w, pc); // W − P
    let wq = sub(w, qc); // W − Q
    use std::f64::consts::PI;
    let pref = 2.0 * PI * PI * PI.sqrt() / (p * q * pq.sqrt()) * bra.kappa * ket.kappa;
    // Boys values F_0..F_lt; lt = NE + NF ≤ 4, so five slots suffice.
    let mut fm = [0.0f64; 5];
    boys_array(lt, t, &mut fm[..=lt]);
    // Base class [00|00]^(m) occupies e-component 0 of `ea`.
    for m in 0..=lt {
        ea[m] = pref * fm[m];
    }

    let inv_2pq = 0.5 / pq;
    let q_over_pq = q / pq;
    let p_over_pq = p / pq;

    // Phase A — bra ladder: [e0|00]^(m), e-degrees 1..=NE, m ≤ lt − de.
    // NOTE(review): `lower_axis` and `dec` are defined outside this view;
    // presumably `lower_axis` picks the axis to step down along and `dec`
    // decrements that component — confirm against their definitions.
    if NE >= 1 {
        for (iw, te) in TRI1.iter().enumerate() {
            let i = lower_axis(*te);
            // Source is [0,0,0] at addr 0; no has2 term at degree 1.
            for m in 0..=(lt - 1) {
                ea[(1 + iw) * 5 + m] = pa[i] * ea[m] + wp[i] * ea[m + 1];
            }
        }
    }
    if NE >= 2 {
        for (iw, te) in TRI2.iter().enumerate() {
            let i = lower_axis(*te);
            // s1: the degree-1 source (te lowered once along i).
            let s1 = addr(dec(*te, i));
            // s2: the degree-0 source, only present when te can be lowered twice along i.
            let has2 = te[i] >= 2;
            let coef2 = ((te[i] - 1) as f64) * inv_2p;
            let s2 = if has2 { addr(dec(dec(*te, i), i)) } else { 0 };
            for m in 0..=(lt - 2) {
                let mut v = pa[i] * ea[s1 * 5 + m] + wp[i] * ea[s1 * 5 + m + 1];
                if has2 {
                    v += coef2 * (ea[s2 * 5 + m] - q_over_pq * ea[s2 * 5 + m + 1]);
                }
                // Degree-2 components start at cumulative address 4.
                ea[(4 + iw) * 5 + m] = v;
            }
        }
    }
    // Extract f-degree 0: contract m = 0 of every e-component.
    for ae in 0..n_e {
        c_ef[ae * n_f] += scale * ea[ae * 5];
    }

    // Phase B — ket raise, f-degree 1: m ≤ lt − de − 1.
    if NF >= 1 {
        for (fw, tf) in TRI1.iter().enumerate() {
            let j = lower_axis(*tf);
            // Source f-component is [0,0,0] (level 0 = the phase-A table).
            for ae in 0..n_e {
                let (te, de) = E_COMP[ae];
                // Cross term couples to the e-component lowered along j, when it exists.
                let has_cross = te[j] >= 1;
                let cross_coef = te[j] as f64 * inv_2pq;
                let cs = if has_cross { addr(dec(te, j)) } else { 0 };
                for m in 0..=(lt - de - 1) {
                    let mut v = qcen[j] * ea[ae * 5 + m] + wq[j] * ea[ae * 5 + m + 1];
                    if has_cross {
                        v += cross_coef * ea[cs * 5 + m + 1];
                    }
                    eb1[(ae * 3 + fw) * 4 + m] = v;
                }
            }
        }
        // Extract f-degree 1: f-addresses 1..=3 of each c_ef row.
        for ae in 0..n_e {
            for fw in 0..3 {
                c_ef[ae * n_f + 1 + fw] += scale * eb1[(ae * 3 + fw) * 4];
            }
        }
    }

    // Phase B — ket raise, f-degree 2: m ≤ lt − de − 2.
    if NF >= 2 {
        for (fw, tf) in TRI2.iter().enumerate() {
            let j = lower_axis(*tf);
            let f1 = dec(*tf, j);
            let lf1 = cart_index(f1); // f-component index in the degree-1 level
            let has2 = tf[j] >= 2; // its k−2 source is then [0,0,0] (phase A)
            let coef2 = ((tf[j] - 1) as f64) * inv_2q;
            for ae in 0..n_e {
                let (te, de) = E_COMP[ae];
                let has_cross = te[j] >= 1;
                let cross_coef = te[j] as f64 * inv_2pq;
                let cs = if has_cross { addr(dec(te, j)) } else { 0 };
                for m in 0..=(lt - de - 2) {
                    let mut v = qcen[j] * eb1[(ae * 3 + lf1) * 4 + m]
                        + wq[j] * eb1[(ae * 3 + lf1) * 4 + m + 1];
                    if has2 {
                        v += coef2 * (ea[ae * 5 + m] - p_over_pq * ea[ae * 5 + m + 1]);
                    }
                    if has_cross {
                        v += cross_coef * eb1[(cs * 3 + lf1) * 4 + m + 1];
                    }
                    eb2[(ae * 6 + fw) * 3 + m] = v;
                }
            }
        }
        // Extract f-degree 2: f-addresses 4..=9 of each c_ef row.
        for ae in 0..n_e {
            for fw in 0..6 {
                c_ef[ae * n_f + 4 + fw] += scale * eb2[(ae * 6 + fw) * 3];
            }
        }
    }
}

/// Build `[e0|f0]^(m)` for one primitive quartet by **m-marching** over the `f`
/// degree and accumulate `scale · [e0|f0]^(0)` into the contracted `c_ef` table.
///
/// Algebraically identical to the former full-table VRR (same OS recurrences, same
/// term order) — only the storage changes, so it is bit-identical
/// (B0 golden snapshot).
///
/// # Storage
///
/// `levels` holds 3 rolling `f`-degree levels in slots `k % 3`, each `maxlevel`
/// long; within slot `k`, element `[e0|f0]^(m)` for the `f`-component with local
/// index `lf` (= `cart_index(f)`) lives at
/// `(k%3)·maxlevel + lf·slab[k] + eoff[k·n_e + addr(e)] + m` (`eoff` is a flat
/// table with row stride `n_e`). The `m`-rows are triangle-packed
/// (`m ∈ 0..=(l_total−de−k)`), so the full `[e0|f0]^(m)` table is never resident.
/// Each level's `m=0` slice is extracted into `c_ef` before its buffer is recycled.
///
/// # Arguments
///
/// * `p`, `q` — combined with `pc`/`qc` to form `ρ = pq/(p+q)`, `W` and
///   `T = ρ|P−Q|²`.
/// * `pc`, `qc` — the centres `P` and `Q` (their difference is `P − Q`).
/// * `kab`, `kcd` — scalar factors multiplied into the `[00|00]^(m)` prefactor.
/// * `pa`, `qcen`, `inv_2p`, `inv_2q` — `P − A`, `Q − C`, `1/(2p)`, `1/(2q)`,
///   supplied by the caller rather than recomputed here.
/// * `ne`, `nf` — highest `e`-degree raised in phase A and highest `f`-degree
///   marched in phase B.
/// * `l_total` — highest Boys order `m` evaluated for the base row.
/// * `n_e` — number of `e` addresses per level (row stride of `eoff`, and the
///   number of `c_ef` rows written).
/// * `n_f` — row stride of `c_ef` (`c_ef[addr(e)·n_f + addr(f)]`).
/// * `maxlevel`, `eoff`, `slab` — layout of the rolling buffer, see *Storage*.
/// * `tri` — `tri[n]` is iterated as the list of Cartesian triples of degree `n`.
/// * `levels` — scratch buffer; every entry read here is written earlier in the
///   same call.
/// * `scale` — weight applied to each `m = 0` value as it is added to `c_ef`.
/// * `c_ef` — accumulated into with `+=`; it is never cleared here.
///
/// # Preconditions
///
/// The `usize` subtractions `l_total − na` and `l_total − nadeg − k` require
/// `ne + nf ≤ l_total` — presumably `l_total = ne + nf`; confirm at the caller.
/// The index tables and buffers must be large enough for the offsets above;
/// undersized inputs panic on slice indexing.
#[allow(clippy::too_many_arguments)]
fn vrr_primitive(
    p: f64,
    q: f64,
    pc: Vec3,
    qc: Vec3,
    kab: f64,
    kcd: f64,
    pa: Vec3,    // P − A  (bra pair, precomputed)
    qcen: Vec3,  // Q − C  (ket pair, precomputed)
    inv_2p: f64, // 1/(2p) (bra pair, precomputed)
    inv_2q: f64, // 1/(2q) (ket pair, precomputed)
    ne: usize,
    nf: usize,
    l_total: usize,
    n_e: usize,
    n_f: usize,
    maxlevel: usize,
    eoff: &[usize],
    slab: &[usize],
    tri: &[Vec<[usize; 3]>],
    levels: &mut [f64],
    scale: f64,
    c_ef: &mut [f64],
) {
    let pq = p + q;
    let rho = p * q / pq;
    let pq_vec = sub(pc, qc); // P − Q
    let t = rho * norm2(pq_vec);
    // W = (p·P + q·Q)/(p+q), the exponent-weighted mean of the two centres.
    let w = [
        (p * pc[0] + q * qc[0]) / pq,
        (p * pc[1] + q * qc[1]) / pq,
        (p * pc[2] + q * qc[2]) / pq,
    ];
    // `pa` (P−A) and `qcen` (Q−C) are now passed in (pure per-pair).
    let wp = sub(w, pc); // W − P
    let wq = sub(w, qc); // W − Q

    // Index of `[e0|f0]^(m)` (f-component local index `lf`, e at `addr(e)=ae`) in
    // the rolling buffer for f-degree `k`. Reads/writes `levels` separately (f64 is
    // Copy, so indexing never holds a borrow), so the same `levels` slice serves as
    // source and destination across the distinct slots `k`, `k−1`, `k−2`.
    let off = |k: usize, lf: usize, ae: usize, m: usize| -> usize {
        (k % 3) * maxlevel + lf * slab[k] + eoff[k * n_e + ae] + m
    };

    // Base [00|00]^(m) = pref · F_m(T). The Boys index m reaches
    // l_total = la+lb+lc+ld ≤ 4·MAX_L (NOT 2·MAX_L — that is the per-electron
    // bound; an ERI couples both electrons).
    use std::f64::consts::PI;
    // 2·π·π·√π = 2π^{5/2}, the prefactor of the module-level [00|00]^(m) formula.
    let pref = 2.0 * PI * PI * PI.sqrt() / (p * q * pq.sqrt()) * kab * kcd;
    let mut fm = [0.0f64; 4 * MAX_L + 1];
    boys_array(l_total, t, &mut fm[..=l_total]);
    for m in 0..=l_total {
        levels[off(0, 0, 0, m)] = pref * fm[m];
    }

    // `inv_2p` / `inv_2q` are now passed in (pure per-pair).
    let inv_2pq = 0.5 / pq;
    let q_over_pq = q / pq;
    let p_over_pq = p / pq;

    // Phase A — bra ladder (f-degree 0, level 0): build [e0|00]^(m) by raising e.
    // Level 0 lives in slot 0, f-component 0, so the buffer offset of `[e0|00]^(m)`
    // is `eoff0[addr(e)] + m`; the `e`-offsets are hoisted out of the hot m-loop.
    let eoff0 = &eoff[..n_e];
    // `skip(1)`: degree 0 is the base row written above; degrees 1..=ne are raised.
    for (na, te_list) in tri.iter().enumerate().take(ne + 1).skip(1) {
        for &te in te_list {
            let i = lower_axis(te);
            let s1a = addr(dec(te, i));
            // The degree-(na−1) source row is one entry longer, so `m + 1` stays
            // inside it for every m in 0..=mmax.
            let mmax = l_total - na;
            let coef2 = ((te[i] - 1) as f64) * inv_2p; // (e_i of source)/2p
            let has2 = te[i] >= 2;
            let s1_0 = eoff0[s1a];
            // Placeholder 0 when the e−2 term is absent; only read under `has2`.
            let s2_0 = if has2 {
                eoff0[addr(dec(dec(te, i), i))]
            } else {
                0
            };
            let dst_0 = eoff0[addr(te)];
            for m in 0..=mmax {
                let mut v = pa[i] * levels[s1_0 + m] + wp[i] * levels[s1_0 + m + 1];
                if has2 {
                    v += coef2 * (levels[s2_0 + m] - q_over_pq * levels[s2_0 + m + 1]);
                }
                levels[dst_0 + m] = v;
            }
        }
    }
    // Extract level 0 (f = [0,0,0], local 0, addr 0): contract m = 0.
    for ae in 0..n_e {
        c_ef[ae * n_f] += scale * levels[off(0, 0, ae, 0)];
    }

    // Phase B — ket raise: march f-degree k = 1..=nf, keeping levels {k, k−1, k−2}.
    // Slot/slab/eoff for the three active levels are hoisted out of the hot loops; the
    // m-loop touches only `levels[base + m]` (base computed once per (tf, te)), matching
    // the former full-table inner-loop cost. The k−2 level is read only when the axis
    // power `tf[j] ≥ 2` (which forces k ≥ 2), so its `k−2` indices never underflow.
    for k in 1..=nf {
        let slot_k = (k % 3) * maxlevel;
        let slot_k1 = ((k - 1) % 3) * maxlevel;
        let (slab_k, slab_k1) = (slab[k], slab[k - 1]);
        let eoff_k = &eoff[k * n_e..];
        let eoff_k1 = &eoff[(k - 1) * n_e..];
        // For k = 1 there is no k−2 level: dummy values, never dereferenced because
        // `has2` is false for every degree-1 triple.
        let (slot_k2, slab_k2, eoff_k2): (usize, usize, &[usize]) = if k >= 2 {
            (
                ((k - 2) % 3) * maxlevel,
                slab[k - 2],
                &eoff[(k - 2) * n_e..],
            )
        } else {
            (0, 0, &[])
        };
        for &tf in &tri[k] {
            let j = lower_axis(tf);
            let f1 = dec(tf, j);
            let lf1 = cart_index(f1); // local index in level k−1
            let coef2 = ((tf[j] - 1) as f64) * inv_2q;
            let has2 = tf[j] >= 2;
            let lf2 = if has2 { cart_index(dec(f1, j)) } else { 0 }; // level k−2
            let lf = cart_index(tf); // local index in level k
            // Base offsets of this f-component's slab in levels k, k−1, k−2.
            let sk = slot_k + lf * slab_k;
            let sk1 = slot_k1 + lf1 * slab_k1;
            let sk2 = slot_k2 + lf2 * slab_k2;
            for (nadeg, te_list) in tri.iter().enumerate().take(ne + 1) {
                for &te in te_list {
                    let ea = addr(te);
                    // cross term e_j/2(p+q) · [e−1_j,0|f−1_j,0]^(m+1)
                    let has_cross = te[j] >= 1;
                    let (cross_coef, cross_0) = if has_cross {
                        (te[j] as f64 * inv_2pq, sk1 + eoff_k1[addr(dec(te, j))])
                    } else {
                        (0.0, 0)
                    };
                    // Triangle packing: level k keeps orders 0..=(l_total − deg(e) − k).
                    let mmax = l_total - nadeg - k;
                    let dst_0 = sk + eoff_k[ea];
                    let src1_0 = sk1 + eoff_k1[ea];
                    let src2_0 = if has2 { sk2 + eoff_k2[ea] } else { 0 };
                    for m in 0..=mmax {
                        let mut v = qcen[j] * levels[src1_0 + m] + wq[j] * levels[src1_0 + m + 1];
                        if has2 {
                            v += coef2 * (levels[src2_0 + m] - p_over_pq * levels[src2_0 + m + 1]);
                        }
                        if has_cross {
                            v += cross_coef * levels[cross_0 + m + 1];
                        }
                        levels[dst_0 + m] = v;
                    }
                }
            }
        }
        // Extract level k: contract each f-component's m = 0 slice into c_ef.
        for &tf in &tri[k] {
            let lf = cart_index(tf);
            let af = addr(tf);
            for ae in 0..n_e {
                c_ef[ae * n_f + af] += scale * levels[off(k, lf, ae, 0)];
            }
        }
    }
}

/// Horizontal recurrence in contracted space — bra `A→B` first, ket `C→D`
/// second — with the finished block added into the row-major `out`.
///
/// This is the flat-array form of the HGP HRR that replaced the earlier HashMap
/// memoisation. Axis choice (`lower_axis`), the `(A−B)` / `(C−D)` shifts and the
/// `[raised] + shift·[same]` term order are all kept, so results stay
/// **bit-identical** to the recursive version (guarded by the B0 golden
/// snapshot); only the storage differs: contiguous arrays addressed through
/// `addr` / [`cart_index`] rather than hashed `(triple,…)` keys.
///
/// * **Bra pass** — `(a,b|f) = (a+1_i,b−1_i|f) + (A−B)_i (a,b−1_i|f)` with
///   `i = lower_axis(b)`, built by ascending `b`-degree in two rolling layers
///   (`prev` / `cur`), once per ket index `f`, which is only a spectator. The
///   degree-`lb` layer is copied into `bra`.
/// * **Ket pass** — `(ab|c,d) = (ab|c+1_j,d−1_j) + (C−D)_j (ab|c,d−1_j)`, the
///   mirror-image pass over `c,d` in `kprev` / `kcur`, run once per `(a,b)` pair
///   and then added (`+=`) into `out`.
///
/// Resident scratch is `O(na·nb·nf_range)` for `bra` plus the small rolling
/// layers — never the dense `(a,b,f)` / `(a,b,c,d)` key space.
#[allow(clippy::too_many_arguments)]
fn hrr_and_scatter<'a>(
    la: usize,
    lb: usize,
    lc: usize,
    ld: usize,
    n_f: usize,
    c_ef: &[f64],
    ab: Vec3,
    cd: Vec3,
    out: &mut [f64],
    bra: &mut Vec<f64>,
    mut prev: &'a mut Vec<f64>,
    mut cur: &'a mut Vec<f64>,
    mut kprev: &'a mut Vec<f64>,
    mut kcur: &'a mut Vec<f64>,
    // Shared Cartesian-triple table (degrees `0..=2·MAX_L`); HRR indexes it up to
    // `max(ne, nf)`. Passed in so it is not re-`collect`ed per quartet.
    tri: &[Vec<[usize; 3]>],
) {
    // Highest A-side / C-side degrees stored in `c_ef`.
    let ne = la + lb;
    let nf = lc + ld;
    let n_e = n_addr(ne);

    // Cartesian component counts of the four target shells.
    let na = n_cart(la);
    let nb = n_cart(lb);
    let nc = n_cart(lc);
    let nd = n_cart(ld);

    // Only f-degrees lc..=nf are needed; ket-side tables are keyed by the global
    // `addr(f)` shifted down by `f_base`.
    let f_base = tri_below(lc);
    let nf_range = n_f - f_base;

    // ---- Bra pass: bra[(ia·nb + ib)·nf_range + (addr(f) − f_base)] = (a b | f 0).
    // The buffers are reused between calls. Every `bra` entry in range is written
    // below; in debug builds it is pre-filled with NaN so that a missed write
    // shows up in the result.
    let bra_len = na * nb * nf_range;
    ensure_len(bra, bra_len);
    #[cfg(debug_assertions)]
    bra[..bra_len].fill(f64::NAN);

    // Rolling b-degree layers, laid out as [cart_index(b)·n_e + addr(a)].
    let layer_len = nb * n_e;
    ensure_len(prev, layer_len);
    ensure_len(cur, layer_len);

    for (jf, f_global) in (f_base..n_f).enumerate() {
        // b-degree 0 has a single component (row 0): (a,0|f) = c_ef[a][f].
        for shell in &tri[la..=ne] {
            for &a in shell {
                let ae = addr(a);
                prev[ae] = c_ef[ae * n_f + f_global];
            }
        }

        for kb in 1..=lb {
            for (ibw, &b) in tri[kb].iter().enumerate() {
                let axis = lower_axis(b);
                let shift = ab[axis];
                let src_row = cart_index(dec(b, axis)) * n_e;
                let dst_row = ibw * n_e;
                // Only a-degrees la..=ne−kb can still end at (la, lb).
                for &a in tri[la..=(ne - kb)].iter().flatten() {
                    let ae = addr(a);
                    let raised = prev[src_row + addr(inc(a, axis))];
                    cur[dst_row + ae] = raised + shift * prev[src_row + ae];
                }
            }
            std::mem::swap(&mut prev, &mut cur);
        }

        // `prev` is now the degree-lb layer (or the base when lb = 0).
        for (ib, &b) in tri[lb].iter().enumerate() {
            // Same value as `ib`, since tri[lb] is in cart order.
            let src_row = cart_index(b) * n_e;
            for (ia, &a) in tri[la].iter().enumerate() {
                bra[(ia * nb + ib) * nf_range + jf] = prev[src_row + addr(a)];
            }
        }
    }

    // ---- Ket pass: for each (ia, ib) pair build (c, d) and add it into `out`.
    let klayer_len = nd * n_f;
    ensure_len(kprev, klayer_len);
    ensure_len(kcur, klayer_len);

    // `pair` enumerates ia·nb + ib with ia as the slow index.
    for pair in 0..(na * nb) {
        let bra_row = pair * nf_range;

        // d-degree 0: (ab|c,0) = bra[pair][c] for c-degrees lc..=nf.
        for &c in tri[lc..=nf].iter().flatten() {
            let ce = addr(c);
            kprev[ce] = bra[bra_row + (ce - f_base)];
        }

        for kd in 1..=ld {
            for (idw, &d) in tri[kd].iter().enumerate() {
                let axis = lower_axis(d);
                let shift = cd[axis];
                let src_row = cart_index(dec(d, axis)) * n_f;
                let dst_row = idw * n_f;
                for &c in tri[lc..=(nf - kd)].iter().flatten() {
                    let ce = addr(c);
                    let raised = kprev[src_row + addr(inc(c, axis))];
                    kcur[dst_row + ce] = raised + shift * kprev[src_row + ce];
                }
            }
            std::mem::swap(&mut kprev, &mut kcur);
        }

        // `kprev` is now the degree-ld layer (or the base when ld = 0).
        for (ic, &c) in tri[lc].iter().enumerate() {
            let ce = addr(c);
            let out_row = (pair * nc + ic) * nd;
            for id in 0..nd {
                out[out_row + id] += kprev[id * n_f + ce];
            }
        }
    }
}

// --- small helpers ---

/// Axis along which a triple is lowered: the first axis carrying a nonzero power.
///
/// Falls through to axis `2` when neither `x` nor `y` is populated, which also
/// covers the all-zero triple.
#[inline]
fn lower_axis(t: [usize; 3]) -> usize {
    match t {
        [x, _, _] if x > 0 => 0,
        [_, y, _] if y > 0 => 1,
        _ => 2,
    }
}

/// Copy of `t` with the power on axis `i` lowered by one.
///
/// The subtraction is on `usize`, so the caller must ensure `t[i] ≥ 1`.
#[inline]
fn dec(t: [usize; 3], i: usize) -> [usize; 3] {
    let mut lowered = t;
    lowered[i] -= 1;
    lowered
}

/// Copy of `t` with the power on axis `i` raised by one.
#[inline]
fn inc(t: [usize; 3], i: usize) -> [usize; 3] {
    let mut raised = t;
    raised[i] += 1;
    raised
}

/// Weighted combination `(a·ca + b·cb) / p`, evaluated independently on each
/// Cartesian axis.
#[inline]
fn combine(a: f64, ca: Vec3, b: f64, cb: Vec3, p: f64) -> Vec3 {
    let axis = |k: usize| (a * ca[k] + b * cb[k]) / p;
    [axis(0), axis(1), axis(2)]
}

/// Component-wise difference `u − v`.
#[inline]
fn sub(u: Vec3, v: Vec3) -> Vec3 {
    let [ux, uy, uz] = u;
    let [vx, vy, vz] = v;
    [ux - vx, uy - vy, uz - vz]
}

/// Squared Euclidean distance between `u` and `v`, i.e. `norm2(u − v)`.
#[inline]
fn dist2(u: Vec3, v: Vec3) -> f64 {
    let separation = sub(u, v);
    norm2(separation)
}

/// Squared Euclidean length `x² + y² + z²` of `u`.
#[inline]
fn norm2(u: Vec3) -> f64 {
    let [x, y, z] = u;
    x * x + y * y + z * z
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Fields for a one-primitive shell, returned as `(center, l, [exp], [1.0])`:
    /// the single exponent paired with a unit contraction coefficient.
    fn s(center: Vec3, l: usize, exp: f64) -> (Vec3, usize, [f64; 1], [f64; 1]) {
        let exps = [exp];
        let unit_coeff = [1.0];
        (center, l, exps, unit_coeff)
    }

    /// A single `(ss|ss)` quartet of unit-coefficient s primitives must reproduce
    /// the closed form `2π^{5/2}/(p q √(p+q)) · K_ab · K_cd · F_0(T)` to a relative
    /// `1e-14`, pinning the base case of the recurrences.
    #[test]
    fn ssss_matches_closed_form() {
        use std::f64::consts::PI;

        // Exponents of the four primitives: α, β on the bra and γ, δ on the ket.
        const ALPHA: f64 = 0.8;
        const BETA: f64 = 1.3;
        const GAMMA: f64 = 0.5;
        const DELTA: f64 = 1.1;

        let (a_ctr, a_l, a_exps, a_cf) = s([0.0, 0.0, 0.0], 0, ALPHA);
        let (b_ctr, b_l, b_exps, b_cf) = s([0.0, 0.0, 0.7], 0, BETA);
        let (c_ctr, c_l, c_exps, c_cf) = s([0.4, 0.0, 0.0], 0, GAMMA);
        let (d_ctr, d_l, d_exps, d_cf) = s([0.0, 0.6, 0.2], 0, DELTA);

        let shell_a = ShellRef {
            center: a_ctr,
            l: a_l,
            exps: &a_exps,
            coeffs: &a_cf,
        };
        let shell_b = ShellRef {
            center: b_ctr,
            l: b_l,
            exps: &b_exps,
            coeffs: &b_cf,
        };
        let shell_c = ShellRef {
            center: c_ctr,
            l: c_l,
            exps: &c_exps,
            coeffs: &c_cf,
        };
        let shell_d = ShellRef {
            center: d_ctr,
            l: d_l,
            exps: &d_exps,
            coeffs: &d_cf,
        };

        let mut out = [0.0; 1];
        coulomb_shell_into(shell_a, shell_b, shell_c, shell_d, &mut out);

        // Independent evaluation of the closed form.
        let p = ALPHA + BETA;
        let q = GAMMA + DELTA;
        let pcen = combine(ALPHA, a_ctr, BETA, b_ctr, p);
        let qcen = combine(GAMMA, c_ctr, DELTA, d_ctr, q);
        let kab = (-(ALPHA * BETA / p) * dist2(a_ctr, b_ctr)).exp();
        let kcd = (-(GAMMA * DELTA / q) * dist2(c_ctr, d_ctr)).exp();
        let rho = p * q / (p + q);
        let t = rho * dist2(pcen, qcen);
        let mut fm = [0.0; 1];
        boys_array(0, t, &mut fm);
        let expect = 2.0 * PI * PI * PI.sqrt() / (p * q * (p + q).sqrt()) * kab * kcd * fm[0];

        let err = (out[0] - expect).abs();
        let tol = 1e-14 * expect.abs();
        assert!(err < tol, "ssss {} vs {}", out[0], expect);
    }

    use crate::os::Prim;
    use crate::rys::coulomb_into;

    /// Evaluate the OS/HGP block of one primitive quartet, each shell given as
    /// `(l, exponent, centre)` with a unit contraction coefficient. The result is
    /// freshly allocated with `n_cart(la)·n_cart(lb)·n_cart(lc)·n_cart(ld)`
    /// entries, ready to be compared against the Rys engine.
    #[allow(clippy::too_many_arguments)]
    fn os_block(
        la: usize,
        ea: f64,
        ca: Vec3,
        lb: usize,
        eb: f64,
        cb: Vec3,
        lc: usize,
        recc: f64,
        ccc: Vec3,
        ld: usize,
        ed: f64,
        cdd: Vec3,
    ) -> Vec<f64> {
        // One-element exponent lists; all four shells share the unit coefficient.
        let unit = [1.0];
        let exps_a = [ea];
        let exps_b = [eb];
        let exps_c = [recc];
        let exps_d = [ed];

        let shell_a = ShellRef {
            center: ca,
            l: la,
            exps: &exps_a,
            coeffs: &unit,
        };
        let shell_b = ShellRef {
            center: cb,
            l: lb,
            exps: &exps_b,
            coeffs: &unit,
        };
        let shell_c = ShellRef {
            center: ccc,
            l: lc,
            exps: &exps_c,
            coeffs: &unit,
        };
        let shell_d = ShellRef {
            center: cdd,
            l: ld,
            exps: &exps_d,
            coeffs: &unit,
        };

        let block_len = n_cart(la) * n_cart(lb) * n_cart(lc) * n_cart(ld);
        let mut out = vec![0.0; block_len];
        coulomb_shell_into(shell_a, shell_b, shell_c, shell_d, &mut out);
        out
    }

    /// For one primitive quartet on four distinct centres, the OS/HGP block must
    /// agree with the Rys engine element by element over a sweep of angular
    /// momenta — including the bug-prone mixed-high-L and "four different L on
    /// four centres" combinations.
    #[test]
    fn matches_rys_engine_primitive_sweep() {
        let (ca, cb, cc, cd) = (
            [0.0, 0.0, 0.0],
            [0.5, -0.3, 0.2],
            [-0.4, 0.6, -0.1],
            [0.2, 0.4, 0.8],
        );
        let (ea, eb, ec, ed) = (0.9, 1.3, 0.7, 1.1);

        let quartets: &[(usize, usize, usize, usize)] = &[
            (0, 0, 0, 0),
            (1, 0, 0, 0),
            (0, 0, 1, 0),
            (1, 1, 0, 0),
            (1, 0, 1, 0),
            (1, 1, 1, 1),
            (2, 0, 0, 0),
            (2, 1, 0, 0),
            (2, 1, 2, 1),
            (0, 1, 2, 3), // four different L on four centers
            (2, 2, 3, 3), // (dd|ff) mixed high-L
            (3, 0, 0, 1),
            // Regression guards with l_total ≥ 13: the Boys aux index m reaches
            // la+lb+lc+ld, which exceeds 2·MAX_L. These used to panic on an
            // under-sized fm buffer (a real bug that a ≤(dd|ff) sweep missed).
            (4, 4, 4, 1), // l_total = 13
            (6, 6, 1, 0), // l_total = 13, two i-shells
            // Not part of the ≥ 13 group (l_total = 12): ffff, the
            // cancellation-heavy mixed case.
            (3, 3, 3, 3),
        ];

        for &(la, lb, lc, ld) in quartets {
            let os = os_block(la, ea, ca, lb, eb, cb, lc, ec, cc, ld, ed, cd);

            // Reference block from the Rys engine, accumulated with unit weight.
            let mut rys = vec![0.0; os.len()];
            coulomb_into(
                Prim::new(ea, ca, la),
                Prim::new(eb, cb, lb),
                Prim::new(ec, cc, lc),
                Prim::new(ed, cd, ld),
                1.0,
                &mut rys,
            );

            let tag = format!("({la}{lb}|{lc}{ld})");
            assert_cross_engine_close(&os, &rys, &tag);
        }
    }

    /// Assert two ERI blocks (here OS/HGP vs Rys, two independent f64 recurrences)
    /// agree under `|o − r| ≤ atol + rtol·|r|` with `atol = 1e-11`, `rtol = 1e-10`.
    /// The atol floor absorbs benign near-cancellation on structurally tiny
    /// components (where the *relative* OS/Rys difference reaches ~1e-8 at high L
    /// even though both engines are correct — their dominant elements agree to
    /// ~1e-12); the rtol catches any real divergence.
    ///
    /// `tag` prefixes every failure message so the offending quartet can be
    /// identified.
    ///
    /// # Panics
    ///
    /// Panics if the two blocks differ in length, or if any element pair violates
    /// the tolerance (a NaN on either side fails the comparison).
    fn assert_cross_engine_close(os: &[f64], rys: &[f64], tag: &str) {
        const ATOL: f64 = 1e-11;
        const RTOL: f64 = 1e-10;
        // `zip` stops at the shorter slice, so without this check a truncated
        // block would be compared on its common prefix only and pass silently.
        assert_eq!(
            os.len(),
            rys.len(),
            "{tag} OS vs Rys block length mismatch"
        );
        for (o, r) in os.iter().zip(rys.iter()) {
            let delta = (o - r).abs();
            assert!(
                delta <= ATOL + RTOL * r.abs(),
                "{tag} OS vs Rys mismatch: {o} vs {r} (Δ={:e})",
                delta
            );
        }
    }

    /// For a genuinely **contracted** quartet, HGP early contraction (VRR per
    /// primitive, one HRR in contracted space) must equal the Rys engine summed
    /// primitive by primitive.
    ///
    /// Note (`DESIGN_NOTES.md` D6): running the HRR after contraction is a
    /// *performance* choice, **not** a correctness fork. The HRR is linear and its
    /// geometric coefficients (`A−B`, `C−D`) do not depend on the exponents, so
    /// `HRR(Σ_p w_p·[e0|f0]_p) = Σ_p w_p·HRR([e0|f0]_p)` holds identically. This
    /// test is therefore an end-to-end check — a broken VRR cross-term, a
    /// mis-applied contraction coefficient or a wrong HRR sign would all fail
    /// it — but it cannot tell the two HRR *orderings* apart, because they are
    /// algebraically equal.
    #[test]
    fn contracted_quartet_matches_rys_sum() {
        // (pp|ds): two p shells and a d shell with two primitives each, plus a
        // single-primitive s shell, all on distinct centres so both HRR shift
        // vectors (A−B, C−D) are non-trivial.
        let (la, lb, lc, ld) = (1usize, 1usize, 2usize, 0usize);
        let ca = [0.0, 0.0, 0.0];
        let cb = [0.6, -0.2, 0.1];
        let cc = [-0.3, 0.5, -0.2];
        let cd = [0.2, 0.3, 0.7];

        // Exponents and contraction coefficients, shell by shell.
        let (ax, acf) = ([1.4, 0.45], [0.6, 0.5]);
        let (bx, bcf) = ([0.9, 0.3], [0.55, 0.5]);
        let (cx, ccf) = ([1.1, 0.4], [0.7, 0.4]);
        let (dx, dcf) = ([0.8], [1.0]);

        let shell_a = ShellRef {
            center: ca,
            l: la,
            exps: &ax,
            coeffs: &acf,
        };
        let shell_b = ShellRef {
            center: cb,
            l: lb,
            exps: &bx,
            coeffs: &bcf,
        };
        let shell_c = ShellRef {
            center: cc,
            l: lc,
            exps: &cx,
            coeffs: &ccf,
        };
        let shell_d = ShellRef {
            center: cd,
            l: ld,
            exps: &dx,
            coeffs: &dcf,
        };

        let block_len = n_cart(la) * n_cart(lb) * n_cart(lc) * n_cart(ld);
        let mut os = vec![0.0; block_len];
        coulomb_shell_into(shell_a, shell_b, shell_c, shell_d, &mut os);

        // Reference: the Rys primitive engine accumulated over every primitive
        // combination, each weighted by the product of its four coefficients.
        let mut rys = vec![0.0; os.len()];
        for (&ea, &wa) in ax.iter().zip(&acf) {
            for (&eb, &wb) in bx.iter().zip(&bcf) {
                for (&ec, &wc) in cx.iter().zip(&ccf) {
                    for (&ed, &wd) in dx.iter().zip(&dcf) {
                        let weight = wa * wb * wc * wd;
                        coulomb_into(
                            Prim::new(ea, ca, la),
                            Prim::new(eb, cb, lb),
                            Prim::new(ec, cc, lc),
                            Prim::new(ed, cd, ld),
                            weight,
                            &mut rys,
                        );
                    }
                }
            }
        }

        for (o, r) in os.iter().zip(rys.iter()) {
            // Relative 1e-10, with |r| floored at 1e-12.
            let tol = 1e-10 * r.abs().max(1e-12);
            assert!(
                (o - r).abs() <= tol,
                "contracted OS vs Rys mismatch: {o} vs {r}"
            );
        }
    }

    /// `addr` must map the triples of degrees `0..=L` one-to-one onto
    /// `0..n_addr(L)`: every address in range, none hit twice, none left out.
    #[test]
    fn addr_is_bijective() {
        for lmax in 0..=6 {
            let total = n_addr(lmax);
            let mut seen = vec![false; total];
            for t in (0..=lmax).flat_map(|n| cart_components(n)) {
                let a = addr(t);
                assert!(a < total, "addr {a} out of range for lmax {lmax}");
                assert!(!seen[a], "addr collision at {t:?}");
                seen[a] = true;
            }
            assert!(!seen.contains(&false), "addr not surjective at lmax {lmax}");
        }
    }
}