// integral-core 0.1.0

//! Two-electron repulsion integrals (ERIs) by the Obara–Saika / Head-Gordon–Pople
//! (OS/HGP) recurrence — the second ERI engine (see `ARCHITECTURE.md`, L1).
//!
//! A **vertical recurrence (VRR)** builds the
//! intermediate `[e0|f0]^(m)` classes from `[00|00]^(m)` per primitive quartet,
//! the primitives are **contracted** into AO space, and a **horizontal recurrence
//! (HRR)** then shifts angular momentum `A→B` and `C→D` *in the contracted space*.
//! Doing the HRR after contraction is the HGP "early contraction" trick: the
//! geometry-only HRR runs **once per shell quartet** instead of once per
//! primitive quartet, which is the win at high contraction degree.
//!
//! It is the engineering counterpart of the [`crate::rys`] engine: same Coulomb
//! kernel, same row-major `(a,b,c,d)` block layout, validated element-for-element
//! against it (and against an independent McMurchie–Davidson path).
//!
//! ## Method
//!
//! For a primitive quartet with exponents `α,β` on the bra centres `A,B` and
//! `γ,δ` on the ket centres `C,D`, with `p=α+β`, `q=γ+δ`, `P,Q` the Gaussian
//! product centres, `W=(pP+qQ)/(p+q)`, `ρ=pq/(p+q)`, and `T=ρ|P−Q|²`:
//!
//! ```text
//!   [00|00]^(m) = 2π^{5/2}/(p q √(p+q)) · K_ab · K_cd · F_m(T)
//! ```
//!
//! The VRR raises angular momentum on `A` (the bra, index `e`) and `C` (the ket,
//! index `f`) using the standard OS ERI relations (Obara–Saika 1986; HGP 1988):
//!
//! ```text
//!   [e+1_i,0|f0]^(m) = (P−A)_i[e0|f0]^(m) + (W−P)_i[e0|f0]^(m+1)
//!       + e_i/2p ( [e−1_i,0|f0]^(m) − (q/(p+q))[e−1_i,0|f0]^(m+1) )
//!       + f_i/2(p+q) [e0|f−1_i,0]^(m+1)
//!   [e0|f+1_i,0]^(m) = (Q−C)_i[e0|f0]^(m) + (W−Q)_i[e0|f0]^(m+1)
//!       + f_i/2q ( [e0|f−1_i,0]^(m) − (p/(p+q))[e0|f−1_i,0]^(m+1) )
//!       + e_i/2(p+q) [e−1_i,0|f0]^(m+1)
//! ```
//!
//! After contracting `[e0|f0]^(0)` over the primitive quartet, the HRR builds the
//! Cartesian shell block:
//!
//! ```text
//!   (a,b+1_i | f0) = (a+1_i,b | f0) + (A−B)_i (a,b | f0)   [bra, A→B]
//!   (ab | c,d+1_i) = (ab | c+1_i,d) + (C−D)_i (ab | c,d)   [ket, C→D]
//! ```
//!
//! ## Buffers (see `DESIGN_NOTES.md` D3, D12)
//!
//! The VRR table `[e0|f0]^(m)` is **3D and `W`-coupled** (the recurrence mixes
//! Cartesian axes), so unlike the axis-separable 1D tables of the one-electron and
//! Rys engines it cannot be a small fixed stack array — a `MAX_L` stack buffer
//! would be tens of MB, and even a heap copy of the *full* table is ~41 MB at
//! `(ii|ii)`. The current engine (D12) removes that:
//!
//! - **m-marching VRR** (`vrr_primitive`): the VRR never materialises the full
//!   `e×f×m` table. It marches over the ket `f`-degree keeping only a rolling window
//!   of **3 consecutive `f`-degree levels**, each **triangle-packed** in the Boys
//!   index `m`. Resident VRR footprint is `3·max_k[n_cart(k)·slab_k]` (≈ 4.3 MB at
//!   `(ii|ii)`, vs the old 41 MB single table), plus the `n_e·n_f` contracted table.
//! - **flat-array HRR** (`hrr_and_scatter`): contiguous arrays indexed by `addr`
//!   / [`cart_index`] with two rolling degree layers, replacing the former HashMap
//!   memoisation.
//! - **reusable arena** ([`EriScratch`]): all buffers are allocated once and reused
//!   across quartets (thread-local by default), not re-allocated per quartet.
//!
//! All of this stays in safe Rust (`#![forbid(unsafe_code)]`) and reproduces the
//! former full-table engine's values, cross-checked against the Rys engine and an
//! independent McMurchie–Davidson path (`tests/eri_cross_algorithm.rs`).

use std::cell::RefCell;

use integral_math::am::{cart_components, cart_index, n_cart};
use integral_math::boys::boys_array;

use crate::os::{Vec3, MAX_L};

/// A contracted Cartesian shell as seen by the HGP engine: its centre, angular
/// momentum, and primitive `(exponent, effective-coefficient)` data. The
/// coefficients are the driver's *effective* coefficients `d_i · N(α_i, l)` — the
/// engine itself works on un-normalised monomials and only multiplies the four
/// coefficients into the contracted accumulator.
///
/// The engine walks the primitives by pairing `exps[i]` with `coeffs[i]`, so the
/// two slices should have the same length. The struct only borrows its primitive
/// data (`'a`) and is `Copy`, so it is passed to the engine by value.
#[derive(Debug, Clone, Copy)]
pub struct ShellRef<'a> {
    /// Shell centre (bohr).
    pub center: Vec3,
    /// Angular momentum `l`; the shell contributes `n_cart(l)` Cartesian
    /// components to the output block.
    pub l: usize,
    /// Primitive exponents, one per primitive Gaussian.
    pub exps: &'a [f64],
    /// Effective contraction coefficients, aligned with `exps`
    /// (`coeffs[i]` belongs to `exps[i]`).
    pub coeffs: &'a [f64],
}

/// Size of the cumulative Cartesian index space covering every degree in
/// `0..=lmax`: the tetrahedral number `(L+1)(L+2)(L+3)/6` with `L = lmax`.
#[inline]
fn n_addr(lmax: usize) -> usize {
    let (first, second, third) = (lmax + 1, lmax + 2, lmax + 3);
    first * second * third / 6
}

/// Count of Cartesian triples whose degree is **strictly below** `d`, i.e.
/// `d(d+1)(d+2)/6`. It is the offset at which degree `d` starts in the cumulative
/// `addr` ordering (`addr(t) = tri_below(deg) + cart_index(t)`), and it satisfies
/// `n_addr(L) = tri_below(L+1)`.
#[inline]
fn tri_below(d: usize) -> usize {
    let product = d * (d + 1) * (d + 2);
    product / 6
}

/// Global address of the Cartesian triple `t = [lx, ly, lz]` in the cumulative
/// ordering that lists degree 0 first, then degree 1, and so on. Within one degree
/// the position follows [`integral_math::am`]'s canonical per-degree order, so for
/// any `L` the triples of degree `0..=L` map bijectively onto `0..n_addr(L)`.
#[inline]
fn addr(t: [usize; 3]) -> usize {
    let [lx, ly, lz] = t;
    let degree = lx + ly + lz;
    // All triples of lower degree come first: degree·(degree+1)·(degree+2)/6 of them.
    let below = degree * (degree + 1) * (degree + 2) / 6;
    // Inside the degree shell the position grows as `lx` shrinks, then with `lz`.
    let rest = degree - lx;
    below + rest * (rest + 1) / 2 + lz
}

/// Reusable scratch arena for the OS/HGP ERI engine.
///
/// Holds the engine's working buffers so they are allocated **once and reused**
/// across shell quartets, instead of a fresh heap allocation per quartet inside the
/// O(n⁴) shell loop. All buffers are plain `Vec<f64>` grown on demand in safe Rust
/// (`#![forbid(unsafe_code)]` holds); there is no shared mutable state, so each
/// thread uses its own instance — the thread-local default behind
/// [`coulomb_shell_into`], or one passed explicitly to
/// [`coulomb_shell_into_scratch`] (the `&mut` makes cross-thread sharing a compile
/// error).
///
/// Buffers are only ever grown (never shrunk), so after a run each one is as large
/// as the most demanding quartet evaluated with this arena required.
///
/// **Memory-correctness.** `c_ef` is an accumulator and is **zeroed per quartet**.
/// The VRR `levels` and HRR `bra`/layer buffers are fully overwritten in the region
/// they are later read, so they need no functional zeroing — but in debug builds
/// `levels` and `bra` are NaN-filled per quartet, so any accidental out-of-range
/// read (e.g. outside the VRR `m`-triangle) poisons the output and trips the tests
/// rather than silently reading a stale value from a previous quartet. Results are
/// therefore independent of evaluation order and of arena reuse (asserted by
/// `tests/arena.rs`).
#[derive(Debug, Default, Clone)]
pub struct EriScratch {
    /// VRR rolling window: 3 triangle-packed f-degree levels (slot = `k % 3`),
    /// laid out back to back, one `maxlevel`-long slot per level.
    levels: Vec<f64>,
    /// Contracted `[e0|f0]^(0)` accumulator, shape `[n_e·n_f]`, indexed
    /// `addr(e)·n_f + addr(f)`.
    c_ef: Vec<f64>,
    /// HRR bra intermediate `(a b | f 0)`, shape `[na·nb·nf_range]`.
    bra: Vec<f64>,
    /// HRR bra rolling `b`-degree layers, each at least `n_cart(lb)·n_e` long.
    /// On entry to the HRR this one is the "previous degree" (read) layer; the
    /// HRR swaps its two layer references after every `b`-degree step.
    bra_prev: Vec<f64>,
    /// Second bra rolling layer: the "current degree" (write) layer on entry.
    bra_cur: Vec<f64>,
    /// HRR ket rolling `d`-degree layers, each at least `n_cart(ld)·n_f` long.
    /// On entry this one is the "previous degree" (read) layer; roles swap after
    /// every `d`-degree step.
    ket_prev: Vec<f64>,
    /// Second ket rolling layer: the "current degree" (write) layer on entry.
    ket_cur: Vec<f64>,
}

impl EriScratch {
    /// Create an arena whose buffers are all empty; each buffer is sized by the
    /// first quartet that needs it.
    #[must_use]
    pub fn new() -> Self {
        Default::default()
    }

    /// Sum of the lengths (in `f64` elements) of every buffer in the arena — the
    /// resident working set, intended for memory reporting and tests.
    #[must_use]
    pub fn resident_f64(&self) -> usize {
        self.buffer_lens().iter().sum()
    }

    /// Length (in `f64` elements) of the biggest single buffer. Demonstrates that
    /// the former ~41 MB monolithic VRR `[e0|f0]^(m)` table is no longer resident:
    /// the m-marching window keeps only `3·max_k[n_cart(k)·slab_k]`.
    #[must_use]
    pub fn largest_buffer_f64(&self) -> usize {
        self.buffer_lens().iter().copied().max().unwrap_or(0)
    }

    /// Current length of every buffer, in field declaration order.
    fn buffer_lens(&self) -> [usize; 7] {
        [
            self.levels.len(),
            self.c_ef.len(),
            self.bra.len(),
            self.bra_prev.len(),
            self.bra_cur.len(),
            self.ket_prev.len(),
            self.ket_cur.len(),
        ]
    }
}

thread_local! {
    /// Per-thread default arena backing [`coulomb_shell_into`].
    static ERI_SCRATCH: RefCell<EriScratch> = RefCell::new(EriScratch::new());
}

/// Make sure `v` holds at least `n` elements, zero-filling any newly added tail.
/// A vector that is already long enough is left untouched (it is never shrunk).
#[inline]
fn ensure_len(v: &mut Vec<f64>, n: usize) {
    if v.len() >= n {
        return;
    }
    v.resize(n, 0.0);
}

/// Add the contracted Coulomb block `(ab|cd)` of four shells onto the row-major
/// `out` block (shape `[n_cart(la)·n_cart(lb)·n_cart(lc)·n_cart(ld)]`, the same
/// `(a,b,c,d)` layout as [`crate::rys::coulomb_into`]).
///
/// Because the HGP contraction sits between the VRR and the HRR, the engine runs
/// the primitive-quartet loop itself: unlike the Rys engine, the driver calls this
/// **once per shell quartet** and hands over all primitives. The working buffers
/// come from a **thread-local** [`EriScratch`]; call
/// [`coulomb_shell_into_scratch`] instead when the arena should be managed
/// explicitly (e.g. one arena per worker thread).
///
/// # Panics
/// Panics under the same conditions as [`coulomb_shell_into_scratch`] (debug-build
/// checks that every `l ≤ MAX_L` and that `out` is long enough).
pub fn coulomb_shell_into(
    a: ShellRef<'_>,
    b: ShellRef<'_>,
    c: ShellRef<'_>,
    d: ShellRef<'_>,
    out: &mut [f64],
) {
    ERI_SCRATCH.with(|cell| {
        let mut arena = cell.borrow_mut();
        coulomb_shell_into_scratch(&mut arena, a, b, c, d, out);
    });
}

/// Like [`coulomb_shell_into`] but evaluates into the caller-provided
/// [`EriScratch`], reused across quartets to avoid per-quartet heap allocation. Use
/// **one instance per thread** (sharing a `&mut EriScratch` across threads is a
/// compile error); the result is bit-identical regardless of which arena is used or
/// what it last held.
///
/// The evaluation runs in three stages:
/// 1. zero the contracted accumulator `c_ef` and build the triangle-packing tables
///    (`eoff`, `slab`) for the rolling VRR window;
/// 2. for every primitive quartet (loops nested `a`, `b`, `c`, `d`) run
///    `vrr_primitive`, which adds `ca·cb·cc·cd · [e0|f0]^(0)` into `c_ef`;
/// 3. run `hrr_and_scatter` once on the contracted table, which **adds** the
///    finished `(ab|cd)` block onto `out`.
///
/// # Panics
/// In debug builds, panics if any `l > MAX_L`, if `out` is shorter than
/// `n_cart(la)·n_cart(lb)·n_cart(lc)·n_cart(ld)`, or if any shell's `exps` and
/// `coeffs` slices differ in length.
pub fn coulomb_shell_into_scratch(
    scratch: &mut EriScratch,
    a: ShellRef<'_>,
    b: ShellRef<'_>,
    c: ShellRef<'_>,
    d: ShellRef<'_>,
    out: &mut [f64],
) {
    let (la, lb, lc, ld) = (a.l, b.l, c.l, d.l);
    debug_assert!(
        la <= MAX_L && lb <= MAX_L && lc <= MAX_L && ld <= MAX_L,
        "angular momentum exceeds MAX_L"
    );
    // The primitive loops below `zip` exponents with coefficients; `zip` stops at
    // the shorter slice, so a misaligned shell would silently lose primitives and
    // produce wrong integrals. Surface that caller bug in debug builds instead.
    debug_assert!(
        a.exps.len() == a.coeffs.len()
            && b.exps.len() == b.coeffs.len()
            && c.exps.len() == c.coeffs.len()
            && d.exps.len() == d.coeffs.len(),
        "shell exponent/coefficient slices differ in length"
    );
    let (na, nb, nc, nd) = (n_cart(la), n_cart(lb), n_cart(lc), n_cart(ld));
    debug_assert!(out.len() >= na * nb * nc * nd, "ERI output block too short");

    let ne = la + lb; // max bra (A-side) degree built by VRR
    let nf = lc + ld; // max ket (C-side) degree built by VRR
    let l_total = ne + nf; // highest Boys order needed
    let n_e = n_addr(ne);
    let n_f = n_addr(nf);

    // Split the arena into its buffers so they can be borrowed independently.
    let EriScratch {
        levels,
        c_ef,
        bra,
        bra_prev,
        bra_cur,
        ket_prev,
        ket_cur,
    } = scratch;

    // Contracted [e0|f0]^(0): zeroed per quartet (it is a `+=` accumulator).
    ensure_len(c_ef, n_e * n_f);
    c_ef[..n_e * n_f].fill(0.0);

    // Precompute the triples for each degree once (canonical order).
    let tri_e: Vec<Vec<[usize; 3]>> = (0..=ne).map(cart_components).collect();
    let tri_f: Vec<Vec<[usize; 3]>> = (0..=nf).map(cart_components).collect();

    // m-marching VRR scratch. Instead of the full `n_e·n_f·nm`
    // table (~41 MB at `(ii|ii)`), keep only a rolling window of **3 consecutive
    // f-degree levels**, each **triangle-packed** in the Boys index `m`: for an
    // `f`-degree `k`, an `e` of degree `de` stores only `m ∈ 0..=(l_total−de−k)`.
    // Level `k` lives in slot `k % 3` of `levels`; `slab[k]` is one `f`-component's
    // packed size and `eoff[k][ae]` is `e`'s offset within it (see `vrr_primitive`).
    let mut eoff: Vec<Vec<usize>> = Vec::with_capacity(nf + 1);
    let mut slab: Vec<usize> = Vec::with_capacity(nf + 1);
    for k in 0..=nf {
        let mut off = Vec::with_capacity(n_e);
        let mut run = 0usize;
        for de in 0..=ne {
            let mlen = l_total - de - k + 1; // ≥ 1: de + k ≤ ne + nf = l_total
            for _ in 0..n_cart(de) {
                off.push(run);
                run += mlen;
            }
        }
        eoff.push(off);
        slab.push(run);
    }
    // One slot must fit the largest level: all `n_cart(k)` f-components of it.
    let maxlevel = (0..=nf).map(|k| n_cart(k) * slab[k]).max().unwrap_or(1);
    ensure_len(levels, 3 * maxlevel);
    // Debug guard: poison the reused VRR window so any out-of-triangle / stale read
    // surfaces as a NaN in the output (caught by the golden + cross-engine tests),
    // instead of silently using a previous quartet's value.
    #[cfg(debug_assertions)]
    levels[..3 * maxlevel].fill(f64::NAN);

    // Primitive-quartet loop. The bra-pair quantities (p, P, K_ab) are computed once
    // per (a, b) primitive pair and reused for every ket pair.
    for (&ea, &ca) in a.exps.iter().zip(a.coeffs.iter()) {
        for (&eb, &cb) in b.exps.iter().zip(b.coeffs.iter()) {
            let p = ea + eb;
            let pc = combine(ea, a.center, eb, b.center, p);
            let kab = (-(ea * eb / p) * dist2(a.center, b.center)).exp();
            for (&ec, &cc) in c.exps.iter().zip(c.coeffs.iter()) {
                for (&ed, &cd) in d.exps.iter().zip(d.coeffs.iter()) {
                    let q = ec + ed;
                    let qc = combine(ec, c.center, ed, d.center, q);
                    let kcd = (-(ec * ed / q) * dist2(c.center, d.center)).exp();
                    let scale = ca * cb * cc * cd;
                    vrr_primitive(
                        p, q, pc, qc, kab, kcd, a.center, c.center, ne, nf, l_total, n_e, n_f,
                        maxlevel, &eoff, &slab, &tri_e, &tri_f, levels, scale, c_ef,
                    );
                }
            }
        }
    }

    // HRR in contracted space, then scatter the block.
    let ab = sub(a.center, b.center); // A − B
    let cd = sub(c.center, d.center); // C − D
    hrr_and_scatter(
        la, lb, lc, ld, n_f, c_ef, ab, cd, out, bra, bra_prev, bra_cur, ket_prev, ket_cur,
    );
}

/// Build `[e0|f0]^(m)` for one primitive quartet by **m-marching** over the `f`
/// degree and accumulate `scale · [e0|f0]^(0)` into the contracted `c_ef` table.
///
/// Algebraically identical to the former full-table VRR (same OS recurrences, same
/// term order) — only the storage changes, so it is bit-identical
/// (B0 golden snapshot). `levels` holds 3 rolling `f`-degree levels in slots
/// `k % 3`, each `maxlevel` long; within slot `k`, element `[e0|f0]^(m)` for the
/// `f`-component with local index `lf` (= `cart_index(f)`) lives at
/// `(k%3)·maxlevel + lf·slab[k] + eoff[k][addr(e)] + m`. The `m`-rows are
/// triangle-packed (`m ∈ 0..=(l_total−de−k)`), so the full `[e0|f0]^(m)` table is
/// never resident. Each level's `m=0` slice is extracted into `c_ef` before its
/// buffer is recycled.
///
/// # Arguments
/// - `p`, `q`: bra and ket exponent sums (`α+β`, `γ+δ`).
/// - `pc`, `qc`: Gaussian product centres `P` and `Q`.
/// - `kab`, `kcd`: bra and ket Gaussian-product prefactors `K_ab`, `K_cd`.
/// - `a_center`, `c_center`: centres `A` and `C`, on which the VRR raises.
/// - `ne`, `nf`: highest `e` and `f` degree to build; `l_total` is the highest
///   Boys order used (the caller passes `ne + nf`).
/// - `n_e`, `n_f`: row count and row stride of `c_ef` (`n_addr(ne)`, `n_addr(nf)`
///   at the call site).
/// - `maxlevel`, `eoff`, `slab`: slot length and packing tables described above.
/// - `tri_e`, `tri_f`: Cartesian triples per degree (`tri_e[d]` = all triples of
///   degree `d`).
/// - `levels`: rolling window, at least `3·maxlevel` long.
/// - `scale`: weight applied to this primitive quartet when accumulating.
/// - `c_ef`: accumulator, element `(e, f)` at `addr(e)·n_f + addr(f)`.
///
/// # Panics
/// Panics if `l_total > 4·MAX_L` (the Boys buffer `fm` is a fixed array of
/// `4·MAX_L + 1` entries), or on any out-of-bounds index if the packing tables
/// and buffer lengths are inconsistent.
#[allow(clippy::too_many_arguments)]
fn vrr_primitive(
    p: f64,
    q: f64,
    pc: Vec3,
    qc: Vec3,
    kab: f64,
    kcd: f64,
    a_center: Vec3,
    c_center: Vec3,
    ne: usize,
    nf: usize,
    l_total: usize,
    n_e: usize,
    n_f: usize,
    maxlevel: usize,
    eoff: &[Vec<usize>],
    slab: &[usize],
    tri_e: &[Vec<[usize; 3]>],
    tri_f: &[Vec<[usize; 3]>],
    levels: &mut [f64],
    scale: f64,
    c_ef: &mut [f64],
) {
    // Primitive-quartet geometry: ρ = pq/(p+q), T = ρ|P−Q|², W = (pP+qQ)/(p+q).
    let pq = p + q;
    let rho = p * q / pq;
    let pq_vec = sub(pc, qc); // P − Q
    let t = rho * norm2(pq_vec);
    let w = [
        (p * pc[0] + q * qc[0]) / pq,
        (p * pc[1] + q * qc[1]) / pq,
        (p * pc[2] + q * qc[2]) / pq,
    ];
    // Displacement vectors appearing as coefficients in the two OS recurrences.
    let pa = sub(pc, a_center); // P − A
    let wp = sub(w, pc); // W − P
    let qcen = sub(qc, c_center); // Q − C
    let wq = sub(w, qc); // W − Q

    // Index of `[e0|f0]^(m)` (f-component local index `lf`, e at `addr(e)=ae`) in
    // the rolling buffer for f-degree `k`. Reads/writes `levels` separately (f64 is
    // Copy, so indexing never holds a borrow), so the same `levels` slice serves as
    // source and destination across the distinct slots `k`, `k−1`, `k−2`.
    let off = |k: usize, lf: usize, ae: usize, m: usize| -> usize {
        (k % 3) * maxlevel + lf * slab[k] + eoff[k][ae] + m
    };

    // Base [00|00]^(m) = pref · F_m(T). The Boys index m reaches
    // l_total = la+lb+lc+ld ≤ 4·MAX_L (NOT 2·MAX_L — that is the per-electron
    // bound; an ERI couples both electrons).
    use std::f64::consts::PI;
    // pref = 2π^{5/2} / (p q √(p+q)) · K_ab · K_cd
    let pref = 2.0 * PI * PI * PI.sqrt() / (p * q * pq.sqrt()) * kab * kcd;
    let mut fm = [0.0f64; 4 * MAX_L + 1];
    boys_array(l_total, t, &mut fm[..=l_total]);
    for m in 0..=l_total {
        levels[off(0, 0, 0, m)] = pref * fm[m];
    }

    // Recurrence coefficients shared by every component of this primitive quartet.
    let inv_2p = 0.5 / p;
    let inv_2q = 0.5 / q;
    let inv_2pq = 0.5 / pq;
    let q_over_pq = q / pq;
    let p_over_pq = p / pq;

    // Phase A — bra ladder (f-degree 0, level 0): build [e0|00]^(m) by raising e.
    // Level 0 lives in slot 0, f-component 0, so the buffer offset of `[e0|00]^(m)`
    // is `eoff0[addr(e)] + m`; the `e`-offsets are hoisted out of the hot m-loop.
    let eoff0 = &eoff[0];
    // `skip(1)`: degree 0 is the base case written above; `na` is the e-degree.
    for (na, te_list) in tri_e.iter().enumerate().take(ne + 1).skip(1) {
        for &te in te_list {
            // Raise along the first non-zero axis of the target, so `te[i] ≥ 1` and
            // the `te[i] − 1` below cannot underflow.
            let i = lower_axis(te);
            let s1a = addr(dec(te, i));
            let mmax = l_total - na;
            let coef2 = ((te[i] - 1) as f64) * inv_2p; // (e_i of source)/2p
            let has2 = te[i] >= 2;
            let s1_0 = eoff0[s1a];
            // Offset of the twice-lowered source; the `0` placeholder is never read
            // because every use is guarded by `has2`.
            let s2_0 = if has2 {
                eoff0[addr(dec(dec(te, i), i))]
            } else {
                0
            };
            let dst_0 = eoff0[addr(te)];
            // The sources have lower e-degree, hence longer m-rows, so `m + 1` stays
            // inside their packed rows for every `m ≤ mmax`.
            for m in 0..=mmax {
                let mut v = pa[i] * levels[s1_0 + m] + wp[i] * levels[s1_0 + m + 1];
                if has2 {
                    v += coef2 * (levels[s2_0 + m] - q_over_pq * levels[s2_0 + m + 1]);
                }
                levels[dst_0 + m] = v;
            }
        }
    }
    // Extract level 0 (f = [0,0,0], local 0, addr 0): contract m = 0.
    for ae in 0..n_e {
        c_ef[ae * n_f] += scale * levels[off(0, 0, ae, 0)];
    }

    // Phase B — ket raise: march f-degree k = 1..=nf, keeping levels {k, k−1, k−2}.
    // Slot/slab/eoff for the three active levels are hoisted out of the hot loops; the
    // m-loop touches only `levels[base + m]` (base computed once per (tf, te)), matching
    // the former full-table inner-loop cost. The k−2 level is read only when the axis
    // power `tf[j] ≥ 2` (which forces k ≥ 2), so its `k−2` indices never underflow.
    for k in 1..=nf {
        let slot_k = (k % 3) * maxlevel;
        let slot_k1 = ((k - 1) % 3) * maxlevel;
        let (slab_k, slab_k1) = (slab[k], slab[k - 1]);
        let eoff_k = &eoff[k];
        let eoff_k1 = &eoff[k - 1];
        // For k = 1 there is no level k−2; the placeholders are never dereferenced
        // because `has2` is false for every degree-1 `tf`.
        let (slot_k2, slab_k2, eoff_k2): (usize, usize, &[usize]) = if k >= 2 {
            (((k - 2) % 3) * maxlevel, slab[k - 2], &eoff[k - 2])
        } else {
            (0, 0, &[])
        };
        for &tf in &tri_f[k] {
            // Raise along the first non-zero axis `j` of the target f-triple.
            let j = lower_axis(tf);
            let f1 = dec(tf, j);
            let lf1 = cart_index(f1); // local index in level k−1
            let coef2 = ((tf[j] - 1) as f64) * inv_2q; // (f_j of source)/2q
            let has2 = tf[j] >= 2;
            let lf2 = if has2 { cart_index(dec(f1, j)) } else { 0 }; // level k−2
            let lf = cart_index(tf); // local index in level k
            // Start of this f-component's slab in each of the three active levels.
            let sk = slot_k + lf * slab_k;
            let sk1 = slot_k1 + lf1 * slab_k1;
            let sk2 = slot_k2 + lf2 * slab_k2;
            for (nadeg, te_list) in tri_e.iter().enumerate().take(ne + 1) {
                for &te in te_list {
                    let ea = addr(te);
                    // cross term e_j/2(p+q) · [e−1_j,0|f−1_j,0]^(m+1)
                    let has_cross = te[j] >= 1;
                    let (cross_coef, cross_0) = if has_cross {
                        (te[j] as f64 * inv_2pq, sk1 + eoff_k1[addr(dec(te, j))])
                    } else {
                        (0.0, 0)
                    };
                    // Triangle packing: only m ∈ 0..=(l_total − deg(e) − k) is stored.
                    let mmax = l_total - nadeg - k;
                    let dst_0 = sk + eoff_k[ea];
                    let src1_0 = sk1 + eoff_k1[ea];
                    let src2_0 = if has2 { sk2 + eoff_k2[ea] } else { 0 };
                    for m in 0..=mmax {
                        let mut v = qcen[j] * levels[src1_0 + m] + wq[j] * levels[src1_0 + m + 1];
                        if has2 {
                            v += coef2 * (levels[src2_0 + m] - p_over_pq * levels[src2_0 + m + 1]);
                        }
                        if has_cross {
                            v += cross_coef * levels[cross_0 + m + 1];
                        }
                        levels[dst_0 + m] = v;
                    }
                }
            }
        }
        // Extract level k: contract each f-component's m = 0 slice into c_ef.
        // This must happen here, before a later level reuses slot `k % 3`.
        for &tf in &tri_f[k] {
            let lf = cart_index(tf);
            let af = addr(tf);
            for ae in 0..n_e {
                c_ef[ae * n_f + af] += scale * levels[off(k, lf, ae, 0)];
            }
        }
    }
}

/// HRR in contracted space (bra `A→B`, then ket `C→D`) followed by scatter into
/// the row-major output block.
///
/// Flat-array HGP horizontal recurrence, replacing the earlier
/// HashMap memoisation. The recurrence math is unchanged — same `lower_axis`
/// choice, same `(A−B)/(C−D)` geometric shifts, same `[raised] + shift·[same]`
/// term order — so it is **bit-identical** to the recursive version (guarded by
/// the B0 golden snapshot). Only the storage changes: contiguous arrays indexed
/// by `addr` / [`cart_index`] instead of hashed `(triple,…)` keys.
///
/// The bra recurrence `(a,b|f) = (a+1_i,b−1_i|f) + (A−B)_i (a,b−1_i|f)` (axis
/// `i = lower_axis(b)`) is built by ascending `b`-degree with two rolling layers,
/// independently per ket index `f` (a spectator), into the `bra` intermediate.
/// The ket recurrence `(ab|c,d) = (ab|c+1_j,d−1_j) + (C−D)_j (ab|c,d−1_j)` is the
/// symmetric pass over `c,d`, run per `(a,b)` output pair and scattered into `out`.
/// Resident scratch is `O(na·nb·nf_range)` for the bra intermediate plus two small
/// rolling layers — never the dense `(a,b,f)`/`(a,b,c,d)` key space.
///
/// # Arguments
/// - `la`, `lb`, `lc`, `ld`: angular momenta of the four shells.
/// - `n_f`: row stride of `c_ef` (`n_addr(lc + ld)` at the call site).
/// - `c_ef`: contracted `[e0|f0]^(0)` table, element `(e, f)` at
///   `addr(e)·n_f + addr(f)`; read only.
/// - `ab`, `cd`: the geometric shifts `A − B` and `C − D`.
/// - `out`: row-major `(a,b,c,d)` block; the result is **added** to its current
///   contents (`+=`), not stored over them.
/// - `bra`, `prev`, `cur`, `kprev`, `kcur`: reusable scratch vectors, grown here
///   as needed. `prev`/`cur` and `kprev`/`kcur` share the lifetime `'a` so the
///   two references of each pair can be swapped after every degree step.
///
/// # Panics
/// Panics on out-of-bounds indexing if `out` is shorter than
/// `n_cart(la)·n_cart(lb)·n_cart(lc)·n_cart(ld)` or `c_ef` is shorter than the
/// region addressed through `n_f`.
#[allow(clippy::too_many_arguments)]
fn hrr_and_scatter<'a>(
    la: usize,
    lb: usize,
    lc: usize,
    ld: usize,
    n_f: usize,
    c_ef: &[f64],
    ab: Vec3,
    cd: Vec3,
    out: &mut [f64],
    bra: &mut Vec<f64>,
    mut prev: &'a mut Vec<f64>,
    mut cur: &'a mut Vec<f64>,
    mut kprev: &'a mut Vec<f64>,
    mut kcur: &'a mut Vec<f64>,
) {
    let (na, nb, nc, nd) = (n_cart(la), n_cart(lb), n_cart(lc), n_cart(ld));
    let ne = la + lb; // max bra (A-side) degree present in c_ef
    let nf = lc + ld; // max ket (C-side) degree present in c_ef
    let n_e = n_addr(ne);

    // Cartesian triples by degree, reused across the recurrences.
    let tri: Vec<Vec<[usize; 3]>> = (0..=ne.max(nf)).map(cart_components).collect();

    // Ket index range actually used: f-degrees [lc..=nf]. The bra intermediate and
    // the ket base are keyed by the global `addr(f)` offset by `f_base`.
    let f_base = tri_below(lc);
    let nf_range = n_f - f_base;

    // --- Bra HRR: bra[(ia·nb+ib)·nf_range + (addr(f)−f_base)] = (a_ia b_ib | f 0).
    // The buffers come from the reused arena. `bra` is fully overwritten below and
    // the rolling layers are overwritten in every region that is later read, so no
    // zeroing is needed; in debug builds `bra` is NaN-filled so a missed write
    // surfaces in the output.
    let bra_len = na * nb * nf_range;
    ensure_len(bra, bra_len);
    #[cfg(debug_assertions)]
    bra[..bra_len].fill(f64::NAN);
    // Two rolling b-degree layers, indexed [cart_index(b)·n_e + addr(a)].
    let layer_len = n_cart(lb) * n_e;
    ensure_len(prev, layer_len);
    ensure_len(cur, layer_len);
    for f_global in f_base..n_f {
        let jf = f_global - f_base;
        // Base b-degree 0 (one component, within-index 0): (a,0|f) = c_ef[a][f].
        for &a in tri[la..=ne].iter().flatten() {
            let ae = addr(a);
            prev[ae] = c_ef[ae * n_f + f_global];
        }
        for kb in 1..=lb {
            // `ibw` (position in `tri[kb]`) doubles as `cart_index(b)`.
            for (ibw, &b) in tri[kb].iter().enumerate() {
                let i = lower_axis(b);
                let b1w = cart_index(dec(b, i));
                // a-degrees that can still reach (la, lb): [la..=ne−kb].
                for &a in tri[la..=(ne - kb)].iter().flatten() {
                    let ae = addr(a);
                    let a1e = addr(inc(a, i));
                    cur[ibw * n_e + ae] = prev[b1w * n_e + a1e] + ab[i] * prev[b1w * n_e + ae];
                }
            }
            // The layer just written becomes the source for the next b-degree.
            std::mem::swap(&mut prev, &mut cur);
        }
        // `prev` now holds b-degree lb (or the base when lb = 0): extract.
        for (ib, &b) in tri[lb].iter().enumerate() {
            let ibw = cart_index(b); // == ib (tri[lb] is in cart order)
            for (ia, &a) in tri[la].iter().enumerate() {
                bra[(ia * nb + ib) * nf_range + jf] = prev[ibw * n_e + addr(a)];
            }
        }
    }

    // --- Ket HRR: per (ia,ib), build (c,d) and scatter into out.
    // Two rolling d-degree layers, indexed [cart_index(d)·n_f + addr(c)].
    let klayer_len = n_cart(ld) * n_f;
    ensure_len(kprev, klayer_len);
    ensure_len(kcur, klayer_len);
    for ia in 0..na {
        for ib in 0..nb {
            let brarow = (ia * nb + ib) * nf_range;
            // Base d-degree 0: (ab|c,0) = bra[ia][ib][c], c-degrees [lc..=nf].
            for &c in tri[lc..=nf].iter().flatten() {
                let ce = addr(c);
                kprev[ce] = bra[brarow + (ce - f_base)];
            }
            for kd in 1..=ld {
                // `idw` (position in `tri[kd]`) doubles as `cart_index(d)`.
                for (idw, &d) in tri[kd].iter().enumerate() {
                    let j = lower_axis(d);
                    let d1w = cart_index(dec(d, j));
                    // c-degrees that can still reach (lc, ld): [lc..=nf−kd].
                    for &c in tri[lc..=(nf - kd)].iter().flatten() {
                        let ce = addr(c);
                        let c1e = addr(inc(c, j));
                        kcur[idw * n_f + ce] =
                            kprev[d1w * n_f + c1e] + cd[j] * kprev[d1w * n_f + ce];
                    }
                }
                // The layer just written becomes the source for the next d-degree.
                std::mem::swap(&mut kprev, &mut kcur);
            }
            // `kprev` now holds d-degree ld (or the base when ld = 0): scatter into out.
            // `out` is accumulated into (`+=`), matching the "accumulate" contract.
            for (ic, &c) in tri[lc].iter().enumerate() {
                let ce = addr(c);
                for id in 0..nd {
                    out[((ia * nb + ib) * nc + ic) * nd + id] += kprev[id * n_f + ce];
                }
            }
        }
    }
}

// --- small helpers ---

/// Axis along which a triple is lowered: the first axis (x, then y) with a
/// non-zero power, falling back to z (index 2) when both x and y are zero.
#[inline]
fn lower_axis(t: [usize; 3]) -> usize {
    match t {
        [x, _, _] if x > 0 => 0,
        [_, y, _] if y > 0 => 1,
        _ => 2,
    }
}

/// Return `t` with the power on axis `i` lowered by one. The caller must ensure
/// `t[i] ≥ 1`.
#[inline]
fn dec(mut t: [usize; 3], i: usize) -> [usize; 3] {
    let power = &mut t[i];
    *power -= 1;
    t
}

/// Return `t` with the power on axis `i` raised by one.
#[inline]
fn inc(mut t: [usize; 3], i: usize) -> [usize; 3] {
    let power = &mut t[i];
    *power += 1;
    t
}

/// Weighted combination `(a·ca + b·cb) / p` of two points, evaluated per axis.
/// The engine uses it for Gaussian product centres, passing `p = a + b`.
#[inline]
fn combine(a: f64, ca: Vec3, b: f64, cb: Vec3, p: f64) -> Vec3 {
    let blend = |x: f64, y: f64| (a * x + b * y) / p;
    [
        blend(ca[0], cb[0]),
        blend(ca[1], cb[1]),
        blend(ca[2], cb[2]),
    ]
}

/// Component-wise difference `u − v`.
#[inline]
fn sub(u: Vec3, v: Vec3) -> Vec3 {
    let [ux, uy, uz] = u;
    let [vx, vy, vz] = v;
    [ux - vx, uy - vy, uz - vz]
}

/// Squared Euclidean distance `|u − v|²` between two points.
#[inline]
fn dist2(u: Vec3, v: Vec3) -> f64 {
    let delta = sub(u, v);
    norm2(delta)
}

/// Squared Euclidean norm `|u|²`, summed in x, y, z order.
#[inline]
fn norm2(u: Vec3) -> f64 {
    let [x, y, z] = u;
    x * x + y * y + z * z
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Packs `(center, l, [exp], [1.0])`: the owned pieces of a one-primitive
    /// shell with unit coefficient, ready to be borrowed into a `ShellRef`.
    fn s(center: Vec3, l: usize, exp: f64) -> (Vec3, usize, [f64; 1], [f64; 1]) {
        let exps = [exp];
        let coeffs = [1.0];
        (center, l, exps, coeffs)
    }

    /// Base-case check: the (ss|ss) integral over four single-primitive s shells
    /// with unit coefficients must agree with the closed form
    /// `2π^{5/2}/(p q √(p+q)) · K_ab · K_cd · F_0(T)`. The Rys engine passes the
    /// same check, so the two engines share a verified base case.
    #[test]
    fn ssss_matches_closed_form() {
        use std::f64::consts::PI;

        let (a_pos, a_l, a_exp, a_w) = s([0.0, 0.0, 0.0], 0, 0.8);
        let (b_pos, b_l, b_exp, b_w) = s([0.0, 0.0, 0.7], 0, 1.3);
        let (c_pos, c_l, c_exp, c_w) = s([0.4, 0.0, 0.0], 0, 0.5);
        let (d_pos, d_l, d_exp, d_w) = s([0.0, 0.6, 0.2], 0, 1.1);

        let shell_a = ShellRef {
            center: a_pos,
            l: a_l,
            exps: &a_exp,
            coeffs: &a_w,
        };
        let shell_b = ShellRef {
            center: b_pos,
            l: b_l,
            exps: &b_exp,
            coeffs: &b_w,
        };
        let shell_c = ShellRef {
            center: c_pos,
            l: c_l,
            exps: &c_exp,
            coeffs: &c_w,
        };
        let shell_d = ShellRef {
            center: d_pos,
            l: d_l,
            exps: &d_exp,
            coeffs: &d_w,
        };

        // Engine result: a 1×1×1×1 block.
        let mut eri = [0.0; 1];
        coulomb_shell_into(shell_a, shell_b, shell_c, shell_d, &mut eri);

        // Closed-form reference built from the same exponents and centres.
        let (alpha, beta, gamma, delta) = (a_exp[0], b_exp[0], c_exp[0], d_exp[0]);
        let p = alpha + beta;
        let q = gamma + delta;
        let p_center = combine(alpha, a_pos, beta, b_pos, p);
        let q_center = combine(gamma, c_pos, delta, d_pos, q);
        let k_ab = (-(alpha * beta / p) * dist2(a_pos, b_pos)).exp();
        let k_cd = (-(gamma * delta / q) * dist2(c_pos, d_pos)).exp();
        let rho = p * q / (p + q);
        let big_t = rho * dist2(p_center, q_center);
        let mut boys = [0.0; 1];
        boys_array(0, big_t, &mut boys);
        let prefactor = 2.0 * PI * PI * PI.sqrt() / (p * q * (p + q).sqrt());
        let expect = prefactor * k_ab * k_cd * boys[0];

        assert!(
            (eri[0] - expect).abs() < 1e-14 * expect.abs(),
            "ssss {} vs {}",
            eri[0],
            expect
        );
    }

    use crate::os::Prim;
    use crate::rys::coulomb_into;

    /// Evaluates one uncontracted OS/HGP shell-quartet block: every centre carries
    /// a single primitive `(l, exponent, centre)` with unit coefficient. The
    /// returned vector has `n_cart(la)·n_cart(lb)·n_cart(lc)·n_cart(ld)` entries and
    /// is the OS/HGP side of the cross-check against the Rys engine.
    // One `(l, exp, centre)` triple per shell keeps call sites flat; hence the
    // long argument list.
    #[allow(clippy::too_many_arguments)]
    fn os_block(
        la: usize,
        ea: f64,
        ca: Vec3,
        lb: usize,
        eb: f64,
        cb: Vec3,
        lc: usize,
        recc: f64,
        ccc: Vec3,
        ld: usize,
        ed: f64,
        cdd: Vec3,
    ) -> Vec<f64> {
        // Single-element exponent arrays and a shared unit coefficient to borrow from.
        let unit = [1.0];
        let (exp_a, exp_b, exp_c, exp_d) = ([ea], [eb], [recc], [ed]);

        let shell_a = ShellRef {
            center: ca,
            l: la,
            exps: &exp_a,
            coeffs: &unit,
        };
        let shell_b = ShellRef {
            center: cb,
            l: lb,
            exps: &exp_b,
            coeffs: &unit,
        };
        let shell_c = ShellRef {
            center: ccc,
            l: lc,
            exps: &exp_c,
            coeffs: &unit,
        };
        let shell_d = ShellRef {
            center: cdd,
            l: ld,
            exps: &exp_d,
            coeffs: &unit,
        };

        let len: usize = [la, lb, lc, ld].iter().map(|&l| n_cart(l)).product();
        let mut block = vec![0.0; len];
        coulomb_shell_into(shell_a, shell_b, shell_c, shell_d, &mut block);
        block
    }

    /// Cross-engine sweep: on a single primitive quartet the OS/HGP block must
    /// equal the Rys block element-for-element over a range of angular momenta,
    /// including the bug-prone mixed-high-L and "all four different L on four
    /// centers" cases.
    #[test]
    fn matches_rys_engine_primitive_sweep() {
        // One fixed geometry and exponent set, reused for every quartet.
        let (ca, cb, cc, cd) = (
            [0.0, 0.0, 0.0],
            [0.5, -0.3, 0.2],
            [-0.4, 0.6, -0.1],
            [0.2, 0.4, 0.8],
        );
        let (ea, eb, ec, ed) = (0.9, 1.3, 0.7, 1.1);

        let quartets: &[(usize, usize, usize, usize)] = &[
            (0, 0, 0, 0),
            (1, 0, 0, 0),
            (0, 0, 1, 0),
            (1, 1, 0, 0),
            (1, 0, 1, 0),
            (1, 1, 1, 1),
            (2, 0, 0, 0),
            (2, 1, 0, 0),
            (2, 1, 2, 1),
            (0, 1, 2, 3), // four different L on four centers
            (2, 2, 3, 3), // (dd|ff) mixed high-L
            (3, 0, 0, 1),
            // l_total ≥ 13 guards: the Boys aux index m reaches la+lb+lc+ld, which
            // exceeds 2·MAX_L. These panicked the under-sized fm buffer (a real bug
            // a ≤(dd|ff) sweep missed) and are kept as permanent regression guards.
            (4, 4, 4, 1), // l_total = 13
            (6, 6, 1, 0), // l_total = 13, two i-shells
            // l_total = 12, so not one of the ≥ 13 guards above.
            (3, 3, 3, 3), // ffff: the cancellation-heavy mixed case
        ];

        for &(la, lb, lc, ld) in quartets {
            let tag = format!("({la}{lb}|{lc}{ld})");
            let os = os_block(la, ea, ca, lb, eb, cb, lc, ec, cc, ld, ed, cd);

            // Rys reference for the same primitives, with unit weight.
            let mut rys = vec![0.0; os.len()];
            let a = Prim::new(ea, ca, la);
            let b = Prim::new(eb, cb, lb);
            let c = Prim::new(ec, cc, lc);
            let d = Prim::new(ed, cd, ld);
            coulomb_into(a, b, c, d, 1.0, &mut rys);

            assert_cross_engine_close(&os, &rys, &tag);
        }
    }

    /// Assert two ERI blocks (here OS/HGP vs Rys, two independent f64 recurrences)
    /// have the same length and agree element-wise under
    /// `|o − r| ≤ atol + rtol·|r|` with `atol = 1e-11`, `rtol = 1e-10`.
    /// The atol floor absorbs benign near-cancellation on structurally tiny
    /// components (where the *relative* OS/Rys difference reaches ~1e-8 at high L
    /// even though both engines are correct — their dominant elements agree to
    /// ~1e-12); the rtol catches any real divergence.
    ///
    /// `tag` is prefixed to every failure message to identify the quartet.
    ///
    /// # Panics
    ///
    /// Panics if the slices differ in length or if any element pair violates the
    /// tolerance. A NaN on either side also fails, because the comparison is
    /// then false.
    fn assert_cross_engine_close(os: &[f64], rys: &[f64], tag: &str) {
        const ATOL: f64 = 1e-11;
        const RTOL: f64 = 1e-10;
        // `zip` stops at the shorter slice, so without this guard a truncated
        // block would be compared on its common prefix only and pass.
        assert_eq!(os.len(), rys.len(), "{tag} OS vs Rys block length mismatch");
        for (o, r) in os.iter().zip(rys.iter()) {
            let delta = (o - r).abs();
            assert!(
                delta <= ATOL + RTOL * r.abs(),
                "{tag} OS vs Rys mismatch: {o} vs {r} (Δ={:e})",
                delta
            );
        }
    }

    /// HGP early contraction (VRR per primitive, HRR once in contracted space)
    /// must equal the per-primitive Rys sum for a genuinely **contracted**
    /// quartet.
    ///
    /// Note (`DESIGN_NOTES.md` D6): doing the HRR after contraction is a *performance*
    /// choice, **not** a correctness fork. The HRR operator is linear with
    /// exponent-independent geometric coefficients (`A−B`, `C−D`), so
    /// `HRR(Σ_p w_p·[e0|f0]_p) = Σ_p w_p·HRR([e0|f0]_p)` identically — per-primitive
    /// and post-contraction HRR give the *same* answer. This remains a valuable
    /// end-to-end check (a broken VRR cross-term, a mis-applied contraction
    /// coefficient, or a wrong HRR sign would all fail it), but it does not
    /// distinguish HRR *ordering*, because the two orders are algebraically equal.
    #[test]
    fn contracted_quartet_matches_rys_sum() {
        // Two p shells (A, B) and a d shell (C), each with two primitives, plus a
        // single-primitive s shell (D), on distinct centres (so HRR shift vectors
        // A−B, C−D are all non-trivial).
        let ca = [0.0, 0.0, 0.0];
        let cb = [0.6, -0.2, 0.1];
        let cc = [-0.3, 0.5, -0.2];
        let cd = [0.2, 0.3, 0.7];
        let (la, lb, lc, ld) = (1usize, 1usize, 2usize, 0usize);
        let ax = [1.4, 0.45];
        let acf = [0.6, 0.5];
        let bx = [0.9, 0.3];
        let bcf = [0.55, 0.5];
        let cx = [1.1, 0.4];
        let ccf = [0.7, 0.4];
        let dx = [0.8];
        let dcf = [1.0];

        let mut os = vec![0.0; n_cart(la) * n_cart(lb) * n_cart(lc) * n_cart(ld)];
        coulomb_shell_into(
            ShellRef {
                center: ca,
                l: la,
                exps: &ax,
                coeffs: &acf,
            },
            ShellRef {
                center: cb,
                l: lb,
                exps: &bx,
                coeffs: &bcf,
            },
            ShellRef {
                center: cc,
                l: lc,
                exps: &cx,
                coeffs: &ccf,
            },
            ShellRef {
                center: cd,
                l: ld,
                exps: &dx,
                coeffs: &dcf,
            },
            &mut os,
        );

        // Reference: sum the Rys primitive engine over the quartet with the same
        // effective coefficients. Every call targets the same `rys` buffer, so
        // this relies on `coulomb_into` adding its weighted block to the buffer.
        let mut rys = vec![0.0; os.len()];
        for (&ea, &wa) in ax.iter().zip(acf.iter()) {
            for (&eb, &wb) in bx.iter().zip(bcf.iter()) {
                for (&ec, &wc) in cx.iter().zip(ccf.iter()) {
                    for (&ed, &wd) in dx.iter().zip(dcf.iter()) {
                        coulomb_into(
                            Prim::new(ea, ca, la),
                            Prim::new(eb, cb, lb),
                            Prim::new(ec, cc, lc),
                            Prim::new(ed, cd, ld),
                            wa * wb * wc * wd,
                            &mut rys,
                        );
                    }
                }
            }
        }

        // Compare under the shared cross-engine tolerance (`atol + rtol·|r|`), so
        // near-zero components get a meaningful absolute floor. A bound of the
        // form `rtol · max(|r|, 1e-12)` shrinks to ~1e-22 there, which is
        // effectively no floor at all for f64 recurrences.
        assert_cross_engine_close(&os, &rys, &format!("contracted ({la}{lb}|{lc}{ld})"));
    }

    /// For every `lmax` in `0..=6`, the addresses of all Cartesian triples of
    /// degree `0..=lmax` must land in `0..n_addr(lmax)` with each slot hit
    /// exactly once, i.e. the triple-address map is a bijection onto that range.
    #[test]
    fn addr_is_bijective() {
        for lmax in 0..=6 {
            let slots = n_addr(lmax);
            let mut hit = vec![false; slots];
            for degree in 0..=lmax {
                for t in cart_components(degree) {
                    let a = addr(t);
                    // Range first: indexing `hit` below needs `a` in bounds.
                    assert!(a < slots, "addr {a} out of range for lmax {lmax}");
                    // Mark the slot and check it was not already taken.
                    let taken = std::mem::replace(&mut hit[a], true);
                    assert!(!taken, "addr collision at {t:?}");
                }
            }
            // Any slot left unmarked means the map misses part of the range.
            assert!(
                !hit.contains(&false),
                "addr not surjective at lmax {lmax}"
            );
        }
    }
}