Skip to main content

integral_core/
os_eri.rs

1//! Two-electron repulsion integrals (ERIs) by the Obara–Saika / Head-Gordon–Pople
2//! (OS/HGP) recurrence — the second ERI engine (see `ARCHITECTURE.md`, L1).
3//!
4//! A **vertical recurrence (VRR)** builds the
5//! intermediate `[e0|f0]^(m)` classes from `[00|00]^(m)` per primitive quartet,
6//! the primitives are **contracted** into AO space, and a **horizontal recurrence
7//! (HRR)** then shifts angular momentum `A→B` and `C→D` *in the contracted space*.
8//! Doing the HRR after contraction is the HGP "early contraction" trick: the
9//! geometry-only HRR runs **once per shell quartet** instead of once per
10//! primitive quartet, which is the win at high contraction degree.
11//!
12//! It is the engineering counterpart of the [`crate::rys`] engine: same Coulomb
13//! kernel, same row-major `(a,b,c,d)` block layout, validated element-for-element
14//! against it (and against an independent McMurchie–Davidson path).
15//!
16//! ## Method
17//!
18//! For a primitive quartet with exponents `α,β` on the bra centres `A,B` and
19//! `γ,δ` on the ket centres `C,D`, with `p=α+β`, `q=γ+δ`, `P,Q` the Gaussian
20//! product centres, `W=(pP+qQ)/(p+q)`, `ρ=pq/(p+q)`, and `T=ρ|P−Q|²`:
21//!
22//! ```text
23//!   [00|00]^(m) = 2π^{5/2}/(p q √(p+q)) · K_ab · K_cd · F_m(T)
24//! ```
25//!
26//! The VRR raises angular momentum on `A` (the bra, index `e`) and `C` (the ket,
27//! index `f`) using the standard OS ERI relations (Obara–Saika 1986; HGP 1988):
28//!
29//! ```text
30//!   [e+1_i,0|f0]^(m) = (P−A)_i[e0|f0]^(m) + (W−P)_i[e0|f0]^(m+1)
31//!       + e_i/2p ( [e−1_i,0|f0]^(m) − (q/(p+q))[e−1_i,0|f0]^(m+1) )
32//!       + f_i/2(p+q) [e0|f−1_i,0]^(m+1)
33//!   [e0|f+1_i,0]^(m) = (Q−C)_i[e0|f0]^(m) + (W−Q)_i[e0|f0]^(m+1)
34//!       + f_i/2q ( [e0|f−1_i,0]^(m) − (p/(p+q))[e0|f−1_i,0]^(m+1) )
35//!       + e_i/2(p+q) [e−1_i,0|f0]^(m+1)
36//! ```
37//!
38//! After contracting `[e0|f0]^(0)` over the primitive quartet, the HRR builds the
39//! Cartesian shell block:
40//!
41//! ```text
42//!   (a,b+1_i | f0) = (a+1_i,b | f0) + (A−B)_i (a,b | f0)   [bra, A→B]
43//!   (ab | c,d+1_i) = (ab | c+1_i,d) + (C−D)_i (ab | c,d)   [ket, C→D]
44//! ```
45//!
46//! ## Buffers
47//!
48//! The VRR table `[e0|f0]^(m)` is **3D and `W`-coupled** (the recurrence mixes
49//! Cartesian axes), so unlike the axis-separable 1D tables of the one-electron and
50//! Rys engines it cannot be a small fixed stack array — a `MAX_L` stack buffer
51//! would be tens of MB, and even a heap copy of the *full* table is ~41 MB at
52//! `(ii|ii)`. The current engine removes that:
53//!
54//! - **m-marching VRR** (`vrr_primitive`): the VRR never materialises the full
55//!   `e×f×m` table. It marches over the ket `f`-degree keeping only a rolling window
56//!   of **3 consecutive `f`-degree levels**, each **triangle-packed** in the Boys
57//!   index `m`. Resident VRR footprint is `3·max_k[n_cart(k)·slab_k]` (≈ 4.3 MB at
58//!   `(ii|ii)`, vs the old 41 MB single table), plus the `n_e·n_f` contracted table.
59//! - **flat-array HRR** (`hrr_and_scatter`): contiguous arrays indexed by `addr`
60//!   / [`cart_index`] with two rolling degree layers, replacing the former HashMap
61//!   memoisation.
62//! - **reusable arena** ([`EriScratch`]): all buffers are allocated once and reused
63//!   across quartets (thread-local by default), not re-allocated per quartet.
64//!
65//! All of this stays in safe Rust (`#![forbid(unsafe_code)]`) and reproduces the
66//! former full-table engine's values, cross-checked against the Rys engine and an
67//! independent McMurchie–Davidson path (`tests/eri_cross_algorithm.rs`).
68
69use std::cell::RefCell;
70
71use integral_math::am::{cart_components, cart_index, n_cart};
72use integral_math::boys::boys_array;
73
74use crate::os::{Vec3, MAX_L};
75
76/// A contracted Cartesian shell as seen by the HGP engine: its centre, angular
77/// momentum, and primitive `(exponent, effective-coefficient)` data. The
78/// coefficients are the driver's *effective* coefficients `d_i · N(α_i, l)` — the
79/// engine itself works on un-normalised monomials and only multiplies the four
80/// coefficients into the contracted accumulator.
81#[derive(Debug, Clone, Copy)]
82pub struct ShellRef<'a> {
83    /// Shell centre (bohr).
84    pub center: Vec3,
85    /// Angular momentum.
86    pub l: usize,
87    /// Primitive exponents.
88    pub exps: &'a [f64],
89    /// Effective contraction coefficients, aligned with `exps`.
90    pub coeffs: &'a [f64],
91}
92
93/// Number of Cartesian triples of degree `0..=lmax`: `(L+1)(L+2)(L+3)/6`.
94#[inline]
95fn n_addr(lmax: usize) -> usize {
96    (lmax + 1) * (lmax + 2) * (lmax + 3) / 6
97}
98
99/// Number of Cartesian triples of degree **strictly less than** `d`:
100/// `d(d+1)(d+2)/6`. This is the cumulative base of degree `d` in `addr`
101/// (`addr(t) = tri_below(deg) + cart_index(t)`) and `n_addr(L) = tri_below(L+1)`.
102#[inline]
103fn tri_below(d: usize) -> usize {
104    d * (d + 1) * (d + 2) / 6
105}
106
107/// Global address of a Cartesian triple within the cumulative `0..=` ordering,
108/// consistent with [`integral_math::am`]'s per-degree canonical order. Bijective onto
109/// `0..n_addr(deg)`.
110#[inline]
111fn addr(t: [usize; 3]) -> usize {
112    let n = t[0] + t[1] + t[2];
113    let base = n * (n + 1) * (n + 2) / 6;
114    // within-degree index: (n-lx)(n-lx+1)/2 + lz
115    let within = (n - t[0]) * (n - t[0] + 1) / 2 + t[2];
116    base + within
117}
118
119/// Reusable scratch arena for the OS/HGP ERI engine.
120///
121/// Holds the engine's working buffers so they are allocated **once and reused**
122/// across shell quartets, instead of a fresh heap allocation per quartet inside the
123/// O(n⁴) shell loop. All buffers are plain `Vec<f64>` grown on demand in safe Rust
124/// (`#![forbid(unsafe_code)]` holds); there is no shared mutable state, so each
125/// thread uses its own instance — the thread-local default behind
126/// [`coulomb_shell_into`], or one passed explicitly to
127/// [`coulomb_shell_into_scratch`] (the `&mut` makes cross-thread sharing a compile
128/// error).
129///
130/// **Memory-correctness.** `c_ef` is an accumulator and is **zeroed per quartet**.
131/// The VRR `levels` and HRR `bra`/layer buffers are fully overwritten in the region
132/// they are later read, so they need no functional zeroing — but in debug builds
133/// `levels` and `bra` are NaN-filled per quartet, so any accidental out-of-range
134/// read (e.g. outside the VRR `m`-triangle) poisons the output and trips the tests
135/// rather than silently reading a stale value from a previous quartet. Results are
136/// therefore independent of evaluation order and of arena reuse (asserted by
137/// `tests/arena.rs`).
138#[derive(Debug, Default, Clone)]
139pub struct EriScratch {
140    /// VRR rolling window: 3 triangle-packed f-degree levels (slot = `k % 3`).
141    levels: Vec<f64>,
142    /// Contracted `[e0|f0]^(0)` accumulator, shape `[n_e·n_f]`.
143    c_ef: Vec<f64>,
144    /// HRR bra intermediate `(a b | f 0)`, shape `[na·nb·nf_range]`.
145    bra: Vec<f64>,
146    /// HRR bra rolling `b`-degree layers.
147    bra_prev: Vec<f64>,
148    bra_cur: Vec<f64>,
149    /// HRR ket rolling `d`-degree layers.
150    ket_prev: Vec<f64>,
151    ket_cur: Vec<f64>,
152    /// Precomputed bra/ket primitive-pair data (combined exponent, product
153    /// centre, `K` prefactor, coeff product) — built once per shell quartet so
154    /// the per-pair `exp`/centre work is not repeated inside the deep primitive
155    /// loop. Reused across quartets like the other buffers.
156    bra_pairs: Vec<PrimPair>,
157    ket_pairs: Vec<PrimPair>,
158    /// Shared Cartesian-triple table `tri_all[d] = cart_components(d)` for every
159    /// degree the engine needs (`0..=2·MAX_L`). Built once and sliced for the VRR
160    /// `e`/`f` lists and the HRR triples, instead of re-`collect`ing per quartet.
161    tri_all: Vec<Vec<[usize; 3]>>,
162    /// Flat VRR offset table: `eoff[k·n_e + ae]` is `e`'s packed offset in level
163    /// `k`. Rebuilt per quartet into this reused buffer (no per-quartet `Vec<Vec>`
164    /// allocation); `slab[k]` is one `f`-component's packed size for level `k`.
165    eoff: Vec<usize>,
166    slab: Vec<usize>,
167}
168
169/// One primitive pair's reusable geometry/prefactor, shared by every primitive
170/// quartet that uses it: the combined exponent `zeta = α+β`, the Gaussian product
171/// centre, the overlap prefactor `K = exp(−αβ/ζ·|A−B|²)`, and the two contraction
172/// coefficients (`c1` outer, `c2` inner).
173///
174/// The coefficients are kept **un-multiplied** so the contracted `scale` can be
175/// formed in exactly the former evaluation order `((c_a·c_b)·c_c)·c_d` — making the
176/// engine's output bit-identical to the pre-precomputation code (the per-element
177/// and 8-fold-symmetry tests are floorless-relative and sensitive to the last ULP
178/// on near-cancellation elements).
179#[derive(Debug, Clone, Copy, Default)]
180struct PrimPair {
181    zeta: f64,
182    center: Vec3,
183    kappa: f64,
184    c1: f64,
185    c2: f64,
186    /// `1/(2ζ)` — the VRR `inv_2p`(bra)/`inv_2q`(ket) coefficient. Pure per-pair.
187    inv_2zeta: f64,
188    /// `centre − s1.centre` — the VRR `P−A`(bra)/`Q−C`(ket) shift. Pure per-pair.
189    r1: Vec3,
190}
191
192/// A primitive pair whose weighted overlap prefactor `K·|c₁·c₂|` falls below
193/// this bound contributes at most ~`1e-24` (absolutely) to any tensor element —
194/// roughly twelve orders below the engine's `1e-12` accuracy bar — so it is
195/// dropped from the pair list before the quartet loop. Every `[e0|f0]^(m)` term
196/// the pair could produce is bounded by `K·|c₁·c₂|` times the other pair's
197/// weight (≤ its own bound), the `2π^{5/2}/(pq√(p+q))` prefactor, `F_m ≤ 1`,
198/// and the VRR's geometric raise factors; the product stays ≤ ~1e-24 for any
199/// chemically sensible exponents/geometry. The deeply contracted cross-centre
200/// pairs this removes have `K` underflowing to `0.0` outright in typical bases
201/// (tight×tight pairs at bonding distances reach `exp(-10⁴)`), where skipping
202/// is *exactly* lossless.
203const PAIR_NEGLIGIBLE: f64 = 1e-32;
204
205/// Fill `out` with the [`PrimPair`] data for every primitive combination of two
206/// shells (outer `s1`, inner `s2`), matching the former in-loop computation
207/// element-for-element. The inter-centre distance is hoisted out of the pair loop.
208/// `inv_2zeta` and `r1` are the pair-only VRR coefficients (`1/2ζ` and the `P−A` /
209/// `Q−C` shift) that the kernel formerly recomputed on every primitive quartet.
210/// Pairs below [`PAIR_NEGLIGIBLE`] are omitted; the surviving pairs keep their
211/// order, so the contraction accumulates the remaining terms in the same
212/// sequence as before.
213fn build_pairs(out: &mut Vec<PrimPair>, s1: ShellRef<'_>, s2: ShellRef<'_>) {
214    out.clear();
215    out.reserve(s1.exps.len() * s2.exps.len());
216    let d2 = dist2(s1.center, s2.center);
217    for (&e1, &c1) in s1.exps.iter().zip(s1.coeffs.iter()) {
218        for (&e2, &c2) in s2.exps.iter().zip(s2.coeffs.iter()) {
219            let zeta = e1 + e2;
220            let kappa = (-(e1 * e2 / zeta) * d2).exp();
221            if kappa * (c1 * c2).abs() < PAIR_NEGLIGIBLE {
222                continue;
223            }
224            let center = combine(e1, s1.center, e2, s2.center, zeta);
225            out.push(PrimPair {
226                zeta,
227                center,
228                kappa,
229                c1,
230                c2,
231                inv_2zeta: 0.5 / zeta,
232                r1: sub(center, s1.center),
233            });
234        }
235    }
236}
237
238impl EriScratch {
239    /// A fresh, empty arena; it grows to fit on first use.
240    #[must_use]
241    pub fn new() -> Self {
242        Self::default()
243    }
244
245    /// Total `f64` elements currently held across all buffers — the resident
246    /// working set, for memory reporting/tests.
247    #[must_use]
248    pub fn resident_f64(&self) -> usize {
249        self.levels.len()
250            + self.c_ef.len()
251            + self.bra.len()
252            + self.bra_prev.len()
253            + self.bra_cur.len()
254            + self.ket_prev.len()
255            + self.ket_cur.len()
256    }
257
258    /// Largest single buffer (`f64` elements). Demonstrates the former ~41 MB
259    /// monolithic VRR `[e0|f0]^(m)` table is no longer resident: the m-marching
260    /// window keeps only `3·max_k[n_cart(k)·slab_k]`.
261    #[must_use]
262    pub fn largest_buffer_f64(&self) -> usize {
263        [
264            self.levels.len(),
265            self.c_ef.len(),
266            self.bra.len(),
267            self.bra_prev.len(),
268            self.bra_cur.len(),
269            self.ket_prev.len(),
270            self.ket_cur.len(),
271        ]
272        .into_iter()
273        .max()
274        .unwrap_or(0)
275    }
276}
277
278thread_local! {
279    /// Per-thread default arena backing [`coulomb_shell_into`].
280    static ERI_SCRATCH: RefCell<EriScratch> = RefCell::new(EriScratch::new());
281}
282
283/// Grow `v` to at least `n` elements (reusing existing capacity); never shrinks.
284#[inline]
285fn ensure_len(v: &mut Vec<f64>, n: usize) {
286    if v.len() < n {
287        v.resize(n, 0.0);
288    }
289}
290
291/// `ensure_len` for the `usize` offset/slab buffers.
292#[inline]
293fn ensure_usize(v: &mut Vec<usize>, n: usize) {
294    if v.len() < n {
295        v.resize(n, 0);
296    }
297}
298
299/// Accumulate the contracted Coulomb block `(ab|cd)` for four shells into the
300/// row-major `out` block (shape `[n_cart(la)·n_cart(lb)·n_cart(lc)·n_cart(ld)]`,
301/// the same `(a,b,c,d)` layout as [`crate::rys::coulomb_into`]).
302///
303/// The engine owns the primitive-quartet loop (the HGP contraction happens between
304/// VRR and HRR), so unlike the Rys engine the driver calls this **once per shell
305/// quartet** with all primitives. This uses a **thread-local** [`EriScratch`]; for
306/// explicit control (e.g. one arena per worker thread) call
307/// [`coulomb_shell_into_scratch`].
308///
309/// # Panics
310/// Panics (in debug builds) if any `l > MAX_L` or `out` is too short.
311pub fn coulomb_shell_into(
312    a: ShellRef<'_>,
313    b: ShellRef<'_>,
314    c: ShellRef<'_>,
315    d: ShellRef<'_>,
316    out: &mut [f64],
317) {
318    ERI_SCRATCH.with(|s| coulomb_shell_into_scratch(&mut s.borrow_mut(), a, b, c, d, out));
319}
320
321/// Like [`coulomb_shell_into`] but evaluates into the caller-provided
322/// [`EriScratch`], reused across quartets to avoid per-quartet heap allocation. Use
323/// **one instance per thread** (sharing a `&mut EriScratch` across threads is a
324/// compile error); the result is bit-identical regardless of which arena is used or
325/// what it last held.
326///
327/// # Panics
328/// Panics (in debug builds) if any `l > MAX_L` or `out` is too short.
329pub fn coulomb_shell_into_scratch(
330    scratch: &mut EriScratch,
331    a: ShellRef<'_>,
332    b: ShellRef<'_>,
333    c: ShellRef<'_>,
334    d: ShellRef<'_>,
335    out: &mut [f64],
336) {
337    let (la, lb, lc, ld) = (a.l, b.l, c.l, d.l);
338    debug_assert!(
339        la <= MAX_L && lb <= MAX_L && lc <= MAX_L && ld <= MAX_L,
340        "angular momentum exceeds MAX_L"
341    );
342    let (na, nb, nc, nd) = (n_cart(la), n_cart(lb), n_cart(lc), n_cart(ld));
343    debug_assert!(out.len() >= na * nb * nc * nd, "ERI output block too short");
344
345    let ne = la + lb; // max bra (A-side) degree built by VRR
346    let nf = lc + ld; // max ket (C-side) degree built by VRR
347    let l_total = ne + nf;
348    let n_e = n_addr(ne);
349    let n_f = n_addr(nf);
350
351    let EriScratch {
352        levels,
353        c_ef,
354        bra,
355        bra_prev,
356        bra_cur,
357        ket_prev,
358        ket_cur,
359        bra_pairs,
360        ket_pairs,
361        tri_all,
362        eoff,
363        slab,
364    } = scratch;
365
366    // Fast path: (ss|ss). With no angular momentum to raise, the whole quartet is a
367    // single contracted [00|00]^(0). Skip the VRR/HRR machinery (the dead raise
368    // coefficients, the `levels` buffer round-trip, the per-primitive call) and sum
369    // straight into a register. Bit-identical to the general path: same `pref`/`t`
370    // expressions, same `((c_a·c_b)·c_c)·c_d` scale, same `scale·(pref·F_0)` term,
371    // same bra-outer/ket-inner order — just without the scaffolding.
372    if l_total == 0 {
373        build_pairs(bra_pairs, a, b);
374        build_pairs(ket_pairs, c, d);
375        let two_pi_2_5 =
376            2.0 * std::f64::consts::PI * std::f64::consts::PI * std::f64::consts::PI.sqrt();
377        let mut acc = 0.0;
378        let mut f0 = [0.0f64; 1];
379        for bra in bra_pairs.iter() {
380            let bc = bra.c1 * bra.c2;
381            for ket in ket_pairs.iter() {
382                let (p, q) = (bra.zeta, ket.zeta);
383                let pq = p + q;
384                let t = (p * q / pq) * dist2(bra.center, ket.center);
385                let pref = two_pi_2_5 / (p * q * pq.sqrt()) * bra.kappa * ket.kappa;
386                boys_array(0, t, &mut f0);
387                acc += (bc * ket.c1) * ket.c2 * (pref * f0[0]);
388            }
389        }
390        out[0] += acc;
391        return;
392    }
393
394    // Contracted [e0|f0]^(0): zeroed per quartet (it is a `+=` accumulator).
395    ensure_len(c_ef, n_e * n_f);
396    c_ef[..n_e * n_f].fill(0.0);
397
398    // Shared Cartesian-triple table, built once (degrees `0..=2·MAX_L` cover every
399    // `ne`/`nf`/`max(ne,nf)` the engine reaches) and sliced below — no per-quartet
400    // `cart_components` allocation.
401    if tri_all.len() <= 2 * MAX_L {
402        *tri_all = (0..=2 * MAX_L).map(cart_components).collect();
403    }
404
405    // Precompute the bra/ket primitive-pair data once (the `exp` prefactor and
406    // product centre depend only on the pair, not the quartet), then loop over the
407    // pair lists. The previous code recomputed the ket pair's `exp`/centre on every
408    // bra primitive — `na·nb` times too many. Bra-outer / ket-inner preserves the
409    // exact accumulation order into `c_ef`.
410    build_pairs(bra_pairs, a, b);
411    build_pairs(ket_pairs, c, d);
412
413    // VRR. The small classes (`ne, nf ≤ 3` — every quartet shape of an
414    // spd basis except the d,d-bra/ket pairs, ~96% of a cc-pVDZ-style build)
415    // dispatch to the monomorphized kernels, where the whole structural walk
416    // (triples, addresses, branches, m-ranges) is resolved at compile time;
417    // everything else runs the general m-marching `vrr_primitive`.
418    // `(0,0)` already returned through the (ss|ss) fast path above.
419    match (ne, nf) {
420        (0, 1) => contract_class::<0, 1>(bra_pairs, ket_pairs, c_ef),
421        (1, 0) => contract_class::<1, 0>(bra_pairs, ket_pairs, c_ef),
422        (1, 1) => contract_class::<1, 1>(bra_pairs, ket_pairs, c_ef),
423        (0, 2) => contract_class::<0, 2>(bra_pairs, ket_pairs, c_ef),
424        (2, 0) => contract_class::<2, 0>(bra_pairs, ket_pairs, c_ef),
425        (1, 2) => contract_class::<1, 2>(bra_pairs, ket_pairs, c_ef),
426        (2, 1) => contract_class::<2, 1>(bra_pairs, ket_pairs, c_ef),
427        (2, 2) => contract_class::<2, 2>(bra_pairs, ket_pairs, c_ef),
428        (0, 3) => contract_class::<0, 3>(bra_pairs, ket_pairs, c_ef),
429        (3, 0) => contract_class::<3, 0>(bra_pairs, ket_pairs, c_ef),
430        (1, 3) => contract_class::<1, 3>(bra_pairs, ket_pairs, c_ef),
431        (3, 1) => contract_class::<3, 1>(bra_pairs, ket_pairs, c_ef),
432        (2, 3) => contract_class::<2, 3>(bra_pairs, ket_pairs, c_ef),
433        (3, 2) => contract_class::<3, 2>(bra_pairs, ket_pairs, c_ef),
434        (3, 3) => contract_class::<3, 3>(bra_pairs, ket_pairs, c_ef),
435        _ => {
436            // m-marching VRR scratch. Instead of the full `n_e·n_f·nm`
437            // table (~41 MB at `(ii|ii)`), keep only a rolling window of **3 consecutive
438            // f-degree levels**, each **triangle-packed** in the Boys index `m`: for an
439            // `f`-degree `k`, an `e` of degree `de` stores only `m ∈ 0..=(l_total−de−k)`.
440            // Level `k` lives in slot `k % 3` of `levels`; `slab[k]` is one `f`-component's
441            // packed size and `eoff[k·n_e + ae]` is `e`'s offset within it (flat, reused).
442            ensure_usize(slab, nf + 1);
443            ensure_usize(eoff, (nf + 1) * n_e);
444            // `k` indexes both `slab` and the flat `eoff` block base `k·n_e`, so a range
445            // loop is the clear form here.
446            #[allow(clippy::needless_range_loop)]
447            for k in 0..=nf {
448                let base = k * n_e;
449                let mut run = 0usize;
450                let mut ae = 0usize;
451                for de in 0..=ne {
452                    let mlen = l_total - de - k + 1; // ≥ 1: de + k ≤ ne + nf = l_total
453                    for _ in 0..n_cart(de) {
454                        eoff[base + ae] = run;
455                        ae += 1;
456                        run += mlen;
457                    }
458                }
459                slab[k] = run;
460            }
461            let maxlevel = (0..=nf).map(|k| n_cart(k) * slab[k]).max().unwrap_or(1);
462            ensure_len(levels, 3 * maxlevel);
463            // Debug guard: poison the reused VRR window so any out-of-triangle / stale read
464            // surfaces as a NaN in the output (caught by the golden + cross-engine tests),
465            // instead of silently using a previous quartet's value.
466            #[cfg(debug_assertions)]
467            levels[..3 * maxlevel].fill(f64::NAN);
468
469            // Reborrow the offset/triple buffers as shared slices for the read-only kernel.
470            let (eoff_s, slab_s, tri_s) = (&eoff[..], &slab[..], &tri_all[..]);
471            for bra in bra_pairs.iter() {
472                let bra_coef = bra.c1 * bra.c2; // = c_a·c_b, hoisted out of the ket loop
473                for ket in ket_pairs.iter() {
474                    // ((c_a·c_b)·c_c)·c_d — the former left-to-right product, bit for bit.
475                    let scale = (bra_coef * ket.c1) * ket.c2;
476                    vrr_primitive(
477                        bra.zeta,
478                        ket.zeta,
479                        bra.center,
480                        ket.center,
481                        bra.kappa,
482                        ket.kappa,
483                        bra.r1,
484                        ket.r1,
485                        bra.inv_2zeta,
486                        ket.inv_2zeta,
487                        ne,
488                        nf,
489                        l_total,
490                        n_e,
491                        n_f,
492                        maxlevel,
493                        eoff_s,
494                        slab_s,
495                        tri_s,
496                        levels,
497                        scale,
498                        c_ef,
499                    );
500                }
501            }
502        }
503    }
504
505    // HRR in contracted space, then scatter the block.
506    let ab = sub(a.center, b.center); // A − B
507    let cd = sub(c.center, d.center); // C − D
508    hrr_and_scatter(
509        la,
510        lb,
511        lc,
512        ld,
513        n_f,
514        c_ef,
515        ab,
516        cd,
517        out,
518        bra,
519        bra_prev,
520        bra_cur,
521        ket_prev,
522        ket_cur,
523        &tri_all[..],
524    );
525}
526
527/// Cartesian triples of degree 1, 2 and 3 in [`cart_components`] order, as `const`
528/// tables so the monomorphized small-class VRR kernels fully unroll over them
529/// (the structural walk — axes, addresses, `has2`/cross branches — then resolves
530/// at compile time instead of being re-derived on every primitive quartet).
531const TRI1: [[usize; 3]; 3] = [[1, 0, 0], [0, 1, 0], [0, 0, 1]];
532const TRI2: [[usize; 3]; 6] = [
533    [2, 0, 0],
534    [1, 1, 0],
535    [1, 0, 1],
536    [0, 2, 0],
537    [0, 1, 1],
538    [0, 0, 2],
539];
540const TRI3: [[usize; 3]; 10] = [
541    [3, 0, 0],
542    [2, 1, 0],
543    [2, 0, 1],
544    [1, 2, 0],
545    [1, 1, 1],
546    [1, 0, 2],
547    [0, 3, 0],
548    [0, 2, 1],
549    [0, 1, 2],
550    [0, 0, 3],
551];
552/// The e-components `0..n_addr(3)` in [`addr`] order with their degree —
553/// the bra index set of the small-class kernels (`NE ≤ 3`).
554const E_COMP: [([usize; 3], usize); 20] = [
555    ([0, 0, 0], 0),
556    ([1, 0, 0], 1),
557    ([0, 1, 0], 1),
558    ([0, 0, 1], 1),
559    ([2, 0, 0], 2),
560    ([1, 1, 0], 2),
561    ([1, 0, 1], 2),
562    ([0, 2, 0], 2),
563    ([0, 1, 1], 2),
564    ([0, 0, 2], 2),
565    ([3, 0, 0], 3),
566    ([2, 1, 0], 3),
567    ([2, 0, 1], 3),
568    ([1, 2, 0], 3),
569    ([1, 1, 1], 3),
570    ([1, 0, 2], 3),
571    ([0, 3, 0], 3),
572    ([0, 2, 1], 3),
573    ([0, 1, 2], 3),
574    ([0, 0, 3], 3),
575];
576
577/// Contract a whole shell quartet of VRR shape `(NE, NF)` (with `NE, NF ≤ 2`)
578/// into `c_ef` using the monomorphized small-class kernel [`vrr_small`].
579///
580/// Same bra-outer / ket-inner primitive loop and the same left-to-right
581/// `((c_a·c_b)·c_c)·c_d` scale as the general path, so the accumulation into
582/// `c_ef` is bit-identical. The VRR working tables live here (overwritten per
583/// primitive quartet in the region read, like the general path's `levels`
584/// arena) so their one-time zero-init is per shell quartet, not per primitive.
585fn contract_class<const NE: usize, const NF: usize>(
586    bra_pairs: &[PrimPair],
587    ket_pairs: &[PrimPair],
588    c_ef: &mut [f64],
589) {
590    // Worst-case (NE,NF)=(3,3) sizes: e-table 20 components × m ∈ 0..=6;
591    // f-degree-1 level 20×3 × m ∈ 0..=5; f-degree-2 level 20×6 × m ∈ 0..=4;
592    // f-degree-3 level 20×10 × m ∈ 0..=3.
593    let mut ea = [0.0f64; 140];
594    let mut eb1 = [0.0f64; 360];
595    let mut eb2 = [0.0f64; 600];
596    let mut eb3 = [0.0f64; 800];
597    for bra in bra_pairs {
598        let bra_coef = bra.c1 * bra.c2;
599        for ket in ket_pairs {
600            let scale = (bra_coef * ket.c1) * ket.c2;
601            vrr_small::<NE, NF>(bra, ket, scale, &mut ea, &mut eb1, &mut eb2, &mut eb3, c_ef);
602        }
603    }
604}
605
606/// One primitive quartet of the OS VRR for the small classes `NE, NF ≤ 3`,
607/// monomorphized per `(NE, NF)` — a per-L-class specialized kernel expressed
608/// through const generics: every loop bound, Cartesian triple, address, stride,
609/// and `has2`/cross branch below is a compile-time constant in each
610/// instantiation, so the emitted code is a straight unrolled FMA chain.
611///
612/// **Bit-identical to [`vrr_primitive`] by construction**: the same recurrence
613/// expressions with the same term order and association — base
614/// `pref·F_m`, raise `pa_i·s1[m] + wp_i·s1[m+1]`, then `+= coef2·(s2[m] −
615/// (q/pq)·s2[m+1])`, then `+= cross·s3[m+1]` — the same m-ranges
616/// (`m ≤ l_total − de − k`), and the same `c_ef[ae·n_f + af] += scale·v`
617/// extraction. Only the storage differs (fixed per-degree tables instead of the
618/// triangle-packed rolling window), which does not touch the arithmetic.
619/// Guarded by the full-tensor XOR fingerprint and the golden/cross-engine tests.
620///
621/// Per-degree row strides: the m-rows of `ea` hold `m ∈ 0..=lt` (`sa = lt+1`),
622/// of `eb1` `m ∈ 0..=lt−1` (`s1 = lt`), of `eb2` `m ∈ 0..=lt−2`, of `eb3`
623/// `m ∈ 0..=lt−3` — each level's longest row (its degree-0 e-component).
624#[inline(always)]
625#[allow(clippy::too_many_arguments)]
626fn vrr_small<const NE: usize, const NF: usize>(
627    bra: &PrimPair,
628    ket: &PrimPair,
629    scale: f64,
630    ea: &mut [f64; 140],
631    eb1: &mut [f64; 360],
632    eb2: &mut [f64; 600],
633    eb3: &mut [f64; 800],
634    c_ef: &mut [f64],
635) {
636    let lt = NE + NF;
637    let n_e = n_addr(NE);
638    let n_f = n_addr(NF);
639    // Row strides (compile-time constants per instantiation).
640    let sa = lt + 1;
641    let s1 = lt;
642    let s2 = lt.saturating_sub(1);
643    let s3 = lt.saturating_sub(2);
644    let (p, q) = (bra.zeta, ket.zeta);
645    let (pc, qc) = (bra.center, ket.center);
646    let pa = bra.r1; // P − A
647    let qcen = ket.r1; // Q − C
648    let (inv_2p, inv_2q) = (bra.inv_2zeta, ket.inv_2zeta);
649
650    // Scalar header — identical expressions to `vrr_primitive`.
651    let pq = p + q;
652    let rho = p * q / pq;
653    let pq_vec = sub(pc, qc); // P − Q
654    let t = rho * norm2(pq_vec);
655    let w = [
656        (p * pc[0] + q * qc[0]) / pq,
657        (p * pc[1] + q * qc[1]) / pq,
658        (p * pc[2] + q * qc[2]) / pq,
659    ];
660    let wp = sub(w, pc); // W − P
661    let wq = sub(w, qc); // W − Q
662    use std::f64::consts::PI;
663    let pref = 2.0 * PI * PI * PI.sqrt() / (p * q * pq.sqrt()) * bra.kappa * ket.kappa;
664    let mut fm = [0.0f64; 7];
665    boys_array(lt, t, &mut fm[..=lt]);
666    for m in 0..=lt {
667        ea[m] = pref * fm[m];
668    }
669
670    let inv_2pq = 0.5 / pq;
671    let q_over_pq = q / pq;
672    let p_over_pq = p / pq;
673
674    // Phase A — bra ladder: [e0|00]^(m), e-degrees 1..=NE, m ≤ lt − de.
675    if NE >= 1 {
676        for (iw, te) in TRI1.iter().enumerate() {
677            let i = lower_axis(*te);
678            // Source is [0,0,0] at addr 0; no has2 term at degree 1.
679            for m in 0..=(lt - 1) {
680                ea[(1 + iw) * sa + m] = pa[i] * ea[m] + wp[i] * ea[m + 1];
681            }
682        }
683    }
684    if NE >= 2 {
685        for (iw, te) in TRI2.iter().enumerate() {
686            let i = lower_axis(*te);
687            let s1a = addr(dec(*te, i));
688            let has2 = te[i] >= 2;
689            let coef2 = ((te[i] - 1) as f64) * inv_2p;
690            let s2a = if has2 { addr(dec(dec(*te, i), i)) } else { 0 };
691            for m in 0..=(lt - 2) {
692                let mut v = pa[i] * ea[s1a * sa + m] + wp[i] * ea[s1a * sa + m + 1];
693                if has2 {
694                    v += coef2 * (ea[s2a * sa + m] - q_over_pq * ea[s2a * sa + m + 1]);
695                }
696                ea[(4 + iw) * sa + m] = v;
697            }
698        }
699    }
700    if NE >= 3 {
701        for (iw, te) in TRI3.iter().enumerate() {
702            let i = lower_axis(*te);
703            let s1a = addr(dec(*te, i));
704            let has2 = te[i] >= 2;
705            let coef2 = ((te[i] - 1) as f64) * inv_2p;
706            let s2a = if has2 { addr(dec(dec(*te, i), i)) } else { 0 };
707            for m in 0..=(lt - 3) {
708                let mut v = pa[i] * ea[s1a * sa + m] + wp[i] * ea[s1a * sa + m + 1];
709                if has2 {
710                    v += coef2 * (ea[s2a * sa + m] - q_over_pq * ea[s2a * sa + m + 1]);
711                }
712                ea[(10 + iw) * sa + m] = v;
713            }
714        }
715    }
716    // Extract f-degree 0: contract m = 0 of every e-component.
717    for ae in 0..n_e {
718        c_ef[ae * n_f] += scale * ea[ae * sa];
719    }
720
721    // Phase B — ket raise, f-degree 1: m ≤ lt − de − 1.
722    if NF >= 1 {
723        for (fw, tf) in TRI1.iter().enumerate() {
724            let j = lower_axis(*tf);
725            // Source f-component is [0,0,0] (level 0 = the phase-A table).
726            for ae in 0..n_e {
727                let (te, de) = E_COMP[ae];
728                let has_cross = te[j] >= 1;
729                let cross_coef = te[j] as f64 * inv_2pq;
730                let cs = if has_cross { addr(dec(te, j)) } else { 0 };
731                for m in 0..=(lt - de - 1) {
732                    let mut v = qcen[j] * ea[ae * sa + m] + wq[j] * ea[ae * sa + m + 1];
733                    if has_cross {
734                        v += cross_coef * ea[cs * sa + m + 1];
735                    }
736                    eb1[(ae * 3 + fw) * s1 + m] = v;
737                }
738            }
739        }
740        for ae in 0..n_e {
741            for fw in 0..3 {
742                c_ef[ae * n_f + 1 + fw] += scale * eb1[(ae * 3 + fw) * s1];
743            }
744        }
745    }
746
747    // Phase B — ket raise, f-degree 2: m ≤ lt − de − 2.
748    if NF >= 2 {
749        for (fw, tf) in TRI2.iter().enumerate() {
750            let j = lower_axis(*tf);
751            let f1 = dec(*tf, j);
752            let lf1 = cart_index(f1); // f-component index in the degree-1 level
753            let has2 = tf[j] >= 2; // its k−2 source is then [0,0,0] (phase A)
754            let coef2 = ((tf[j] - 1) as f64) * inv_2q;
755            for ae in 0..n_e {
756                let (te, de) = E_COMP[ae];
757                let has_cross = te[j] >= 1;
758                let cross_coef = te[j] as f64 * inv_2pq;
759                let cs = if has_cross { addr(dec(te, j)) } else { 0 };
760                for m in 0..=(lt - de - 2) {
761                    let mut v = qcen[j] * eb1[(ae * 3 + lf1) * s1 + m]
762                        + wq[j] * eb1[(ae * 3 + lf1) * s1 + m + 1];
763                    if has2 {
764                        v += coef2 * (ea[ae * sa + m] - p_over_pq * ea[ae * sa + m + 1]);
765                    }
766                    if has_cross {
767                        v += cross_coef * eb1[(cs * 3 + lf1) * s1 + m + 1];
768                    }
769                    eb2[(ae * 6 + fw) * s2 + m] = v;
770                }
771            }
772        }
773        for ae in 0..n_e {
774            for fw in 0..6 {
775                c_ef[ae * n_f + 4 + fw] += scale * eb2[(ae * 6 + fw) * s2];
776            }
777        }
778    }
779
780    // Phase B — ket raise, f-degree 3: m ≤ lt − de − 3.
781    if NF >= 3 {
782        for (fw, tf) in TRI3.iter().enumerate() {
783            let j = lower_axis(*tf);
784            let f1 = dec(*tf, j);
785            let lf1 = cart_index(f1); // f-component index in the degree-2 level
786            let has2 = tf[j] >= 2; // its k−2 source is f1 lowered again (degree 1)
787            let coef2 = ((tf[j] - 1) as f64) * inv_2q;
788            let lf2 = if has2 { cart_index(dec(f1, j)) } else { 0 };
789            for ae in 0..n_e {
790                let (te, de) = E_COMP[ae];
791                let has_cross = te[j] >= 1;
792                let cross_coef = te[j] as f64 * inv_2pq;
793                let cs = if has_cross { addr(dec(te, j)) } else { 0 };
794                for m in 0..=(lt - de - 3) {
795                    let mut v = qcen[j] * eb2[(ae * 6 + lf1) * s2 + m]
796                        + wq[j] * eb2[(ae * 6 + lf1) * s2 + m + 1];
797                    if has2 {
798                        v += coef2
799                            * (eb1[(ae * 3 + lf2) * s1 + m]
800                                - p_over_pq * eb1[(ae * 3 + lf2) * s1 + m + 1]);
801                    }
802                    if has_cross {
803                        v += cross_coef * eb2[(cs * 6 + lf1) * s2 + m + 1];
804                    }
805                    eb3[(ae * 10 + fw) * s3 + m] = v;
806                }
807            }
808        }
809        for ae in 0..n_e {
810            for fw in 0..10 {
811                c_ef[ae * n_f + 10 + fw] += scale * eb3[(ae * 10 + fw) * s3];
812            }
813        }
814    }
815}
816
817/// Build `[e0|f0]^(m)` for one primitive quartet by **m-marching** over the `f`
818/// degree and accumulate `scale · [e0|f0]^(0)` into the contracted `c_ef` table.
819///
820/// Algebraically identical to the former full-table VRR (same OS recurrences, same
821/// term order) — only the storage changes, so it is bit-identical
822/// (B0 golden snapshot). `levels` holds 3 rolling `f`-degree levels in slots
823/// `k % 3`, each `maxlevel` long; within slot `k`, element `[e0|f0]^(m)` for the
824/// `f`-component with local index `lf` (= `cart_index(f)`) lives at
825/// `(k%3)·maxlevel + lf·slab[k] + eoff[k][addr(e)] + m`. The `m`-rows are
826/// triangle-packed (`m ∈ 0..=(l_total−de−k)`), so the full `[e0|f0]^(m)` table is
827/// never resident. Each level's `m=0` slice is extracted into `c_ef` before its
828/// buffer is recycled.
829#[allow(clippy::too_many_arguments)]
830fn vrr_primitive(
831    p: f64,
832    q: f64,
833    pc: Vec3,
834    qc: Vec3,
835    kab: f64,
836    kcd: f64,
837    pa: Vec3,    // P − A  (bra pair, precomputed)
838    qcen: Vec3,  // Q − C  (ket pair, precomputed)
839    inv_2p: f64, // 1/(2p) (bra pair, precomputed)
840    inv_2q: f64, // 1/(2q) (ket pair, precomputed)
841    ne: usize,
842    nf: usize,
843    l_total: usize,
844    n_e: usize,
845    n_f: usize,
846    maxlevel: usize,
847    eoff: &[usize],
848    slab: &[usize],
849    tri: &[Vec<[usize; 3]>],
850    levels: &mut [f64],
851    scale: f64,
852    c_ef: &mut [f64],
853) {
854    let pq = p + q;
855    let rho = p * q / pq;
856    let pq_vec = sub(pc, qc); // P − Q
857    let t = rho * norm2(pq_vec);
858    let w = [
859        (p * pc[0] + q * qc[0]) / pq,
860        (p * pc[1] + q * qc[1]) / pq,
861        (p * pc[2] + q * qc[2]) / pq,
862    ];
863    // `pa` (P−A) and `qcen` (Q−C) are now passed in (pure per-pair).
864    let wp = sub(w, pc); // W − P
865    let wq = sub(w, qc); // W − Q
866
867    // Index of `[e0|f0]^(m)` (f-component local index `lf`, e at `addr(e)=ae`) in
868    // the rolling buffer for f-degree `k`. Reads/writes `levels` separately (f64 is
869    // Copy, so indexing never holds a borrow), so the same `levels` slice serves as
870    // source and destination across the distinct slots `k`, `k−1`, `k−2`.
871    let off = |k: usize, lf: usize, ae: usize, m: usize| -> usize {
872        (k % 3) * maxlevel + lf * slab[k] + eoff[k * n_e + ae] + m
873    };
874
875    // Base [00|00]^(m) = pref · F_m(T). The Boys index m reaches
876    // l_total = la+lb+lc+ld ≤ 4·MAX_L (NOT 2·MAX_L — that is the per-electron
877    // bound; an ERI couples both electrons).
878    use std::f64::consts::PI;
879    let pref = 2.0 * PI * PI * PI.sqrt() / (p * q * pq.sqrt()) * kab * kcd;
880    let mut fm = [0.0f64; 4 * MAX_L + 1];
881    boys_array(l_total, t, &mut fm[..=l_total]);
882    for m in 0..=l_total {
883        levels[off(0, 0, 0, m)] = pref * fm[m];
884    }
885
886    // `inv_2p` / `inv_2q` are now passed in (pure per-pair).
887    let inv_2pq = 0.5 / pq;
888    let q_over_pq = q / pq;
889    let p_over_pq = p / pq;
890
891    // Phase A — bra ladder (f-degree 0, level 0): build [e0|00]^(m) by raising e.
892    // Level 0 lives in slot 0, f-component 0, so the buffer offset of `[e0|00]^(m)`
893    // is `eoff0[addr(e)] + m`; the `e`-offsets are hoisted out of the hot m-loop.
894    let eoff0 = &eoff[..n_e];
895    for (na, te_list) in tri.iter().enumerate().take(ne + 1).skip(1) {
896        for &te in te_list {
897            let i = lower_axis(te);
898            let s1a = addr(dec(te, i));
899            let mmax = l_total - na;
900            let coef2 = ((te[i] - 1) as f64) * inv_2p; // (e_i of source)/2p
901            let has2 = te[i] >= 2;
902            let s1_0 = eoff0[s1a];
903            let s2_0 = if has2 {
904                eoff0[addr(dec(dec(te, i), i))]
905            } else {
906                0
907            };
908            let dst_0 = eoff0[addr(te)];
909            for m in 0..=mmax {
910                let mut v = pa[i] * levels[s1_0 + m] + wp[i] * levels[s1_0 + m + 1];
911                if has2 {
912                    v += coef2 * (levels[s2_0 + m] - q_over_pq * levels[s2_0 + m + 1]);
913                }
914                levels[dst_0 + m] = v;
915            }
916        }
917    }
918    // Extract level 0 (f = [0,0,0], local 0, addr 0): contract m = 0.
919    for ae in 0..n_e {
920        c_ef[ae * n_f] += scale * levels[off(0, 0, ae, 0)];
921    }
922
923    // Phase B — ket raise: march f-degree k = 1..=nf, keeping levels {k, k−1, k−2}.
924    // Slot/slab/eoff for the three active levels are hoisted out of the hot loops; the
925    // m-loop touches only `levels[base + m]` (base computed once per (tf, te)), matching
926    // the former full-table inner-loop cost. The k−2 level is read only when the axis
927    // power `tf[j] ≥ 2` (which forces k ≥ 2), so its `k−2` indices never underflow.
928    for k in 1..=nf {
929        let slot_k = (k % 3) * maxlevel;
930        let slot_k1 = ((k - 1) % 3) * maxlevel;
931        let (slab_k, slab_k1) = (slab[k], slab[k - 1]);
932        let eoff_k = &eoff[k * n_e..];
933        let eoff_k1 = &eoff[(k - 1) * n_e..];
934        let (slot_k2, slab_k2, eoff_k2): (usize, usize, &[usize]) = if k >= 2 {
935            (
936                ((k - 2) % 3) * maxlevel,
937                slab[k - 2],
938                &eoff[(k - 2) * n_e..],
939            )
940        } else {
941            (0, 0, &[])
942        };
943        for &tf in &tri[k] {
944            let j = lower_axis(tf);
945            let f1 = dec(tf, j);
946            let lf1 = cart_index(f1); // local index in level k−1
947            let coef2 = ((tf[j] - 1) as f64) * inv_2q;
948            let has2 = tf[j] >= 2;
949            let lf2 = if has2 { cart_index(dec(f1, j)) } else { 0 }; // level k−2
950            let lf = cart_index(tf); // local index in level k
951            let sk = slot_k + lf * slab_k;
952            let sk1 = slot_k1 + lf1 * slab_k1;
953            let sk2 = slot_k2 + lf2 * slab_k2;
954            for (nadeg, te_list) in tri.iter().enumerate().take(ne + 1) {
955                for &te in te_list {
956                    let ea = addr(te);
957                    // cross term e_j/2(p+q) · [e−1_j,0|f−1_j,0]^(m+1)
958                    let has_cross = te[j] >= 1;
959                    let (cross_coef, cross_0) = if has_cross {
960                        (te[j] as f64 * inv_2pq, sk1 + eoff_k1[addr(dec(te, j))])
961                    } else {
962                        (0.0, 0)
963                    };
964                    let mmax = l_total - nadeg - k;
965                    let dst_0 = sk + eoff_k[ea];
966                    let src1_0 = sk1 + eoff_k1[ea];
967                    let src2_0 = if has2 { sk2 + eoff_k2[ea] } else { 0 };
968                    for m in 0..=mmax {
969                        let mut v = qcen[j] * levels[src1_0 + m] + wq[j] * levels[src1_0 + m + 1];
970                        if has2 {
971                            v += coef2 * (levels[src2_0 + m] - p_over_pq * levels[src2_0 + m + 1]);
972                        }
973                        if has_cross {
974                            v += cross_coef * levels[cross_0 + m + 1];
975                        }
976                        levels[dst_0 + m] = v;
977                    }
978                }
979            }
980        }
981        // Extract level k: contract each f-component's m = 0 slice into c_ef.
982        for &tf in &tri[k] {
983            let lf = cart_index(tf);
984            let af = addr(tf);
985            for ae in 0..n_e {
986                c_ef[ae * n_f + af] += scale * levels[off(k, lf, ae, 0)];
987            }
988        }
989    }
990}
991
992/// HRR in contracted space (bra `A→B`, then ket `C→D`) followed by scatter into
993/// the row-major output block.
994///
995/// Flat-array HGP horizontal recurrence, replacing the earlier
996/// HashMap memoisation. The recurrence math is unchanged — same `lower_axis`
997/// choice, same `(A−B)/(C−D)` geometric shifts, same `[raised] + shift·[same]`
998/// term order — so it is **bit-identical** to the recursive version (guarded by
999/// the B0 golden snapshot). Only the storage changes: contiguous arrays indexed
1000/// by `addr` / [`cart_index`] instead of hashed `(triple,…)` keys.
1001///
1002/// The bra recurrence `(a,b|f) = (a+1_i,b−1_i|f) + (A−B)_i (a,b−1_i|f)` (axis
1003/// `i = lower_axis(b)`) is built by ascending `b`-degree with two rolling layers,
1004/// independently per ket index `f` (a spectator), into the `bra` intermediate.
1005/// The ket recurrence `(ab|c,d) = (ab|c+1_j,d−1_j) + (C−D)_j (ab|c,d−1_j)` is the
1006/// symmetric pass over `c,d`, run per `(a,b)` output pair and scattered into `out`.
1007/// Resident scratch is `O(na·nb·nf_range)` for the bra intermediate plus two small
1008/// rolling layers — never the dense `(a,b,f)`/`(a,b,c,d)` key space.
1009#[allow(clippy::too_many_arguments)]
1010fn hrr_and_scatter<'a>(
1011    la: usize,
1012    lb: usize,
1013    lc: usize,
1014    ld: usize,
1015    n_f: usize,
1016    c_ef: &[f64],
1017    ab: Vec3,
1018    cd: Vec3,
1019    out: &mut [f64],
1020    bra: &mut Vec<f64>,
1021    mut prev: &'a mut Vec<f64>,
1022    mut cur: &'a mut Vec<f64>,
1023    mut kprev: &'a mut Vec<f64>,
1024    mut kcur: &'a mut Vec<f64>,
1025    // Shared Cartesian-triple table (degrees `0..=2·MAX_L`); HRR indexes it up to
1026    // `max(ne, nf)`. Passed in so it is not re-`collect`ed per quartet.
1027    tri: &[Vec<[usize; 3]>],
1028) {
1029    let (na, nb, nc, nd) = (n_cart(la), n_cart(lb), n_cart(lc), n_cart(ld));
1030    let ne = la + lb; // max bra (A-side) degree present in c_ef
1031    let nf = lc + ld; // max ket (C-side) degree present in c_ef
1032    let n_e = n_addr(ne);
1033
1034    // Ket index range actually used: f-degrees [lc..=nf]. The bra intermediate and
1035    // the ket base are keyed by the global `addr(f)` offset by `f_base`.
1036    let f_base = tri_below(lc);
1037    let nf_range = n_f - f_base;
1038
1039    // --- Bra HRR: bra[(ia·nb+ib)·nf_range + (addr(f)−f_base)] = (a_ia b_ib | f 0).
1040    // Reused arena buffers; bra is fully overwritten below, the rolling
1041    // layers in the region read — debug NaN-fill bra so a missed write surfaces.
1042    let bra_len = na * nb * nf_range;
1043    ensure_len(bra, bra_len);
1044    #[cfg(debug_assertions)]
1045    bra[..bra_len].fill(f64::NAN);
1046    // Two rolling b-degree layers, indexed [cart_index(b)·n_e + addr(a)].
1047    let layer_len = n_cart(lb) * n_e;
1048    ensure_len(prev, layer_len);
1049    ensure_len(cur, layer_len);
1050    for f_global in f_base..n_f {
1051        let jf = f_global - f_base;
1052        // Base b-degree 0 (one component, within-index 0): (a,0|f) = c_ef[a][f].
1053        for &a in tri[la..=ne].iter().flatten() {
1054            let ae = addr(a);
1055            prev[ae] = c_ef[ae * n_f + f_global];
1056        }
1057        for kb in 1..=lb {
1058            for (ibw, &b) in tri[kb].iter().enumerate() {
1059                let i = lower_axis(b);
1060                let b1w = cart_index(dec(b, i));
1061                // a-degrees that can still reach (la, lb): [la..=ne−kb].
1062                for &a in tri[la..=(ne - kb)].iter().flatten() {
1063                    let ae = addr(a);
1064                    let a1e = addr(inc(a, i));
1065                    cur[ibw * n_e + ae] = prev[b1w * n_e + a1e] + ab[i] * prev[b1w * n_e + ae];
1066                }
1067            }
1068            std::mem::swap(&mut prev, &mut cur);
1069        }
1070        // `prev` now holds b-degree lb (or the base when lb = 0): extract.
1071        for (ib, &b) in tri[lb].iter().enumerate() {
1072            let ibw = cart_index(b); // == ib (tri[lb] is in cart order)
1073            for (ia, &a) in tri[la].iter().enumerate() {
1074                bra[(ia * nb + ib) * nf_range + jf] = prev[ibw * n_e + addr(a)];
1075            }
1076        }
1077    }
1078
1079    // --- Ket HRR: per (ia,ib), build (c,d) and scatter into out.
1080    let klayer_len = n_cart(ld) * n_f;
1081    ensure_len(kprev, klayer_len);
1082    ensure_len(kcur, klayer_len);
1083    for ia in 0..na {
1084        for ib in 0..nb {
1085            let brarow = (ia * nb + ib) * nf_range;
1086            // Base d-degree 0: (ab|c,0) = bra[ia][ib][c], c-degrees [lc..=nf].
1087            for &c in tri[lc..=nf].iter().flatten() {
1088                let ce = addr(c);
1089                kprev[ce] = bra[brarow + (ce - f_base)];
1090            }
1091            for kd in 1..=ld {
1092                for (idw, &d) in tri[kd].iter().enumerate() {
1093                    let j = lower_axis(d);
1094                    let d1w = cart_index(dec(d, j));
1095                    for &c in tri[lc..=(nf - kd)].iter().flatten() {
1096                        let ce = addr(c);
1097                        let c1e = addr(inc(c, j));
1098                        kcur[idw * n_f + ce] =
1099                            kprev[d1w * n_f + c1e] + cd[j] * kprev[d1w * n_f + ce];
1100                    }
1101                }
1102                std::mem::swap(&mut kprev, &mut kcur);
1103            }
1104            // `kprev` now holds d-degree ld (or the base when ld = 0): scatter into out.
1105            for (ic, &c) in tri[lc].iter().enumerate() {
1106                let ce = addr(c);
1107                for id in 0..nd {
1108                    out[((ia * nb + ib) * nc + ic) * nd + id] += kprev[id * n_f + ce];
1109                }
1110            }
1111        }
1112    }
1113}
1114
1115// --- small helpers ---
1116
1117/// First nonzero axis of a triple (the lowering direction).
1118#[inline]
1119fn lower_axis(t: [usize; 3]) -> usize {
1120    if t[0] > 0 {
1121        0
1122    } else if t[1] > 0 {
1123        1
1124    } else {
1125        2
1126    }
1127}
1128
1129#[inline]
1130fn dec(mut t: [usize; 3], i: usize) -> [usize; 3] {
1131    t[i] -= 1;
1132    t
1133}
1134
1135#[inline]
1136fn inc(mut t: [usize; 3], i: usize) -> [usize; 3] {
1137    t[i] += 1;
1138    t
1139}
1140
1141#[inline]
1142fn combine(a: f64, ca: Vec3, b: f64, cb: Vec3, p: f64) -> Vec3 {
1143    [
1144        (a * ca[0] + b * cb[0]) / p,
1145        (a * ca[1] + b * cb[1]) / p,
1146        (a * ca[2] + b * cb[2]) / p,
1147    ]
1148}
1149
1150#[inline]
1151fn sub(u: Vec3, v: Vec3) -> Vec3 {
1152    [u[0] - v[0], u[1] - v[1], u[2] - v[2]]
1153}
1154
1155#[inline]
1156fn dist2(u: Vec3, v: Vec3) -> f64 {
1157    norm2(sub(u, v))
1158}
1159
1160#[inline]
1161fn norm2(u: Vec3) -> f64 {
1162    u[0] * u[0] + u[1] * u[1] + u[2] * u[2]
1163}
1164
1165#[cfg(test)]
1166mod tests {
1167    use super::*;
1168
1169    /// Single-primitive `ShellRef` helper.
1170    fn s(center: Vec3, l: usize, exp: f64) -> (Vec3, usize, [f64; 1], [f64; 1]) {
1171        (center, l, [exp], [1.0])
1172    }
1173
1174    /// (ss|ss) over four unit s primitives must equal the closed form
1175    /// `2π^{5/2}/(p q √(p+q)) · K_ab · K_cd · F_0(T)` — the same check the Rys
1176    /// engine passes, so the two share a verified base case.
1177    #[test]
1178    fn ssss_matches_closed_form() {
1179        let (ac, al, ae, acf) = s([0.0, 0.0, 0.0], 0, 0.8);
1180        let (bc, bl, be, bcf) = s([0.0, 0.0, 0.7], 0, 1.3);
1181        let (cc, cl, ce, ccf) = s([0.4, 0.0, 0.0], 0, 0.5);
1182        let (dc, dl, de, dcf) = s([0.0, 0.6, 0.2], 0, 1.1);
1183        let mut out = [0.0; 1];
1184        coulomb_shell_into(
1185            ShellRef {
1186                center: ac,
1187                l: al,
1188                exps: &ae,
1189                coeffs: &acf,
1190            },
1191            ShellRef {
1192                center: bc,
1193                l: bl,
1194                exps: &be,
1195                coeffs: &bcf,
1196            },
1197            ShellRef {
1198                center: cc,
1199                l: cl,
1200                exps: &ce,
1201                coeffs: &ccf,
1202            },
1203            ShellRef {
1204                center: dc,
1205                l: dl,
1206                exps: &de,
1207                coeffs: &dcf,
1208            },
1209            &mut out,
1210        );
1211
1212        let p = 0.8 + 1.3;
1213        let q = 0.5 + 1.1;
1214        let pcen = combine(0.8, ac, 1.3, bc, p);
1215        let qcen = combine(0.5, cc, 1.1, dc, q);
1216        let kab = (-(0.8 * 1.3 / p) * dist2(ac, bc)).exp();
1217        let kcd = (-(0.5 * 1.1 / q) * dist2(cc, dc)).exp();
1218        let rho = p * q / (p + q);
1219        let t = rho * dist2(pcen, qcen);
1220        let mut fm = [0.0; 1];
1221        boys_array(0, t, &mut fm);
1222        use std::f64::consts::PI;
1223        let expect = 2.0 * PI * PI * PI.sqrt() / (p * q * (p + q).sqrt()) * kab * kcd * fm[0];
1224        assert!(
1225            (out[0] - expect).abs() < 1e-14 * expect.abs(),
1226            "ssss {} vs {}",
1227            out[0],
1228            expect
1229        );
1230    }
1231
1232    use crate::os::Prim;
1233    use crate::rys::coulomb_into;
1234
1235    /// Build a single-primitive OS/HGP block for a quartet of given `(l, exp,
1236    /// center)`, for cross-checking against the Rys engine.
1237    #[allow(clippy::too_many_arguments)]
1238    fn os_block(
1239        la: usize,
1240        ea: f64,
1241        ca: Vec3,
1242        lb: usize,
1243        eb: f64,
1244        cb: Vec3,
1245        lc: usize,
1246        recc: f64,
1247        ccc: Vec3,
1248        ld: usize,
1249        ed: f64,
1250        cdd: Vec3,
1251    ) -> Vec<f64> {
1252        let mut out = vec![0.0; n_cart(la) * n_cart(lb) * n_cart(lc) * n_cart(ld)];
1253        let (ea1, eb1, ec1, ed1) = ([ea], [eb], [recc], [ed]);
1254        let one = [1.0];
1255        coulomb_shell_into(
1256            ShellRef {
1257                center: ca,
1258                l: la,
1259                exps: &ea1,
1260                coeffs: &one,
1261            },
1262            ShellRef {
1263                center: cb,
1264                l: lb,
1265                exps: &eb1,
1266                coeffs: &one,
1267            },
1268            ShellRef {
1269                center: ccc,
1270                l: lc,
1271                exps: &ec1,
1272                coeffs: &one,
1273            },
1274            ShellRef {
1275                center: cdd,
1276                l: ld,
1277                exps: &ed1,
1278                coeffs: &one,
1279            },
1280            &mut out,
1281        );
1282        out
1283    }
1284
1285    /// OS/HGP must reproduce the Rys engine element-for-element on a single
1286    /// primitive quartet, across a sweep of angular momenta (including the
1287    /// bug-prone mixed-high-L and "all four different L on four centers" cases).
1288    #[test]
1289    fn matches_rys_engine_primitive_sweep() {
1290        let ca = [0.0, 0.0, 0.0];
1291        let cb = [0.5, -0.3, 0.2];
1292        let cc = [-0.4, 0.6, -0.1];
1293        let cd = [0.2, 0.4, 0.8];
1294        let (ea, eb, ec, ed) = (0.9, 1.3, 0.7, 1.1);
1295
1296        let quartets = [
1297            (0usize, 0usize, 0usize, 0usize),
1298            (1, 0, 0, 0),
1299            (0, 0, 1, 0),
1300            (1, 1, 0, 0),
1301            (1, 0, 1, 0),
1302            (1, 1, 1, 1),
1303            (2, 0, 0, 0),
1304            (2, 1, 0, 0),
1305            (2, 1, 2, 1),
1306            (0, 1, 2, 3), // four different L on four centers
1307            (2, 2, 3, 3), // (dd|ff) mixed high-L
1308            (3, 0, 0, 1),
1309            // l_total ≥ 13 guards: the Boys aux index m reaches la+lb+lc+ld, which
1310            // exceeds 2·MAX_L. These panicked the under-sized fm buffer (a real bug
1311            // a ≤(dd|ff) sweep missed) and are kept as permanent regression guards.
1312            (4, 4, 4, 1), // l_total = 13
1313            (6, 6, 1, 0), // l_total = 13, two i-shells
1314            (3, 3, 3, 3), // ffff: the cancellation-heavy mixed case
1315        ];
1316        for (la, lb, lc, ld) in quartets {
1317            let os = os_block(la, ea, ca, lb, eb, cb, lc, ec, cc, ld, ed, cd);
1318            let mut rys = vec![0.0; os.len()];
1319            coulomb_into(
1320                Prim::new(ea, ca, la),
1321                Prim::new(eb, cb, lb),
1322                Prim::new(ec, cc, lc),
1323                Prim::new(ed, cd, ld),
1324                1.0,
1325                &mut rys,
1326            );
1327            assert_cross_engine_close(&os, &rys, &format!("({la}{lb}|{lc}{ld})"));
1328        }
1329    }
1330
1331    /// Assert two ERI blocks (here OS/HGP vs Rys, two independent f64 recurrences)
1332    /// agree under `|o − r| ≤ atol + rtol·|r|` with `atol = 1e-11`, `rtol = 1e-10`.
1333    /// The atol floor absorbs benign near-cancellation on structurally tiny
1334    /// components (where the *relative* OS/Rys difference reaches ~1e-8 at high L
1335    /// even though both engines are correct — their dominant elements agree to
1336    /// ~1e-12); the rtol catches any real divergence.
1337    fn assert_cross_engine_close(os: &[f64], rys: &[f64], tag: &str) {
1338        const ATOL: f64 = 1e-11;
1339        const RTOL: f64 = 1e-10;
1340        for (o, r) in os.iter().zip(rys.iter()) {
1341            assert!(
1342                (o - r).abs() <= ATOL + RTOL * r.abs(),
1343                "{tag} OS vs Rys mismatch: {o} vs {r} (Δ={:e})",
1344                (o - r).abs()
1345            );
1346        }
1347    }
1348
1349    /// HGP early contraction (VRR per primitive, HRR once in contracted space)
1350    /// must equal the per-primitive Rys sum for a genuinely **contracted**
1351    /// quartet.
1352    ///
1353    /// Note: doing the HRR after contraction is a *performance*
1354    /// choice, **not** a correctness fork. The HRR operator is linear with
1355    /// exponent-independent geometric coefficients (`A−B`, `C−D`), so
1356    /// `HRR(Σ_p w_p·[e0|f0]_p) = Σ_p w_p·HRR([e0|f0]_p)` identically — per-primitive
1357    /// and post-contraction HRR give the *same* answer. This remains a valuable
1358    /// end-to-end check (a broken VRR cross-term, a mis-applied contraction
1359    /// coefficient, or a wrong HRR sign would all fail it), but it does not
1360    /// distinguish HRR *ordering*, because the two orders are algebraically equal.
1361    #[test]
1362    fn contracted_quartet_matches_rys_sum() {
1363        // Two p shells and a d shell, each with multiple primitives, on distinct
1364        // centres (so HRR shift vectors A−B, C−D are all non-trivial).
1365        let ca = [0.0, 0.0, 0.0];
1366        let cb = [0.6, -0.2, 0.1];
1367        let cc = [-0.3, 0.5, -0.2];
1368        let cd = [0.2, 0.3, 0.7];
1369        let (la, lb, lc, ld) = (1usize, 1usize, 2usize, 0usize);
1370        let ax = [1.4, 0.45];
1371        let acf = [0.6, 0.5];
1372        let bx = [0.9, 0.3];
1373        let bcf = [0.55, 0.5];
1374        let cx = [1.1, 0.4];
1375        let ccf = [0.7, 0.4];
1376        let dx = [0.8];
1377        let dcf = [1.0];
1378
1379        let mut os = vec![0.0; n_cart(la) * n_cart(lb) * n_cart(lc) * n_cart(ld)];
1380        coulomb_shell_into(
1381            ShellRef {
1382                center: ca,
1383                l: la,
1384                exps: &ax,
1385                coeffs: &acf,
1386            },
1387            ShellRef {
1388                center: cb,
1389                l: lb,
1390                exps: &bx,
1391                coeffs: &bcf,
1392            },
1393            ShellRef {
1394                center: cc,
1395                l: lc,
1396                exps: &cx,
1397                coeffs: &ccf,
1398            },
1399            ShellRef {
1400                center: cd,
1401                l: ld,
1402                exps: &dx,
1403                coeffs: &dcf,
1404            },
1405            &mut os,
1406        );
1407
1408        // Reference: sum the Rys primitive engine over the quartet with the same
1409        // effective coefficients.
1410        let mut rys = vec![0.0; os.len()];
1411        for (&ea, &wa) in ax.iter().zip(acf.iter()) {
1412            for (&eb, &wb) in bx.iter().zip(bcf.iter()) {
1413                for (&ec, &wc) in cx.iter().zip(ccf.iter()) {
1414                    for (&ed, &wd) in dx.iter().zip(dcf.iter()) {
1415                        coulomb_into(
1416                            Prim::new(ea, ca, la),
1417                            Prim::new(eb, cb, lb),
1418                            Prim::new(ec, cc, lc),
1419                            Prim::new(ed, cd, ld),
1420                            wa * wb * wc * wd,
1421                            &mut rys,
1422                        );
1423                    }
1424                }
1425            }
1426        }
1427
1428        for (o, r) in os.iter().zip(rys.iter()) {
1429            assert!(
1430                (o - r).abs() <= 1e-10 * r.abs().max(1e-12),
1431                "contracted OS vs Rys mismatch: {o} vs {r}"
1432            );
1433        }
1434    }
1435
1436    /// The triple-address map must be a bijection onto `0..n_addr(L)`.
1437    #[test]
1438    fn addr_is_bijective() {
1439        for lmax in 0..=6 {
1440            let mut seen = vec![false; n_addr(lmax)];
1441            for n in 0..=lmax {
1442                for t in cart_components(n) {
1443                    let a = addr(t);
1444                    assert!(a < n_addr(lmax), "addr {a} out of range for lmax {lmax}");
1445                    assert!(!seen[a], "addr collision at {t:?}");
1446                    seen[a] = true;
1447                }
1448            }
1449            assert!(
1450                seen.iter().all(|&x| x),
1451                "addr not surjective at lmax {lmax}"
1452            );
1453        }
1454    }
1455}