Skip to main content

integral_core/
os_eri.rs

1//! Two-electron repulsion integrals (ERIs) by the Obara–Saika / Head-Gordon–Pople
2//! (OS/HGP) recurrence — the second ERI engine (see `ARCHITECTURE.md`, L1).
3//!
4//! A **vertical recurrence (VRR)** builds the
5//! intermediate `[e0|f0]^(m)` classes from `[00|00]^(m)` per primitive quartet,
6//! the primitives are **contracted** into AO space, and a **horizontal recurrence
7//! (HRR)** then shifts angular momentum `A→B` and `C→D` *in the contracted space*.
8//! Doing the HRR after contraction is the HGP "early contraction" trick: the
9//! geometry-only HRR runs **once per shell quartet** instead of once per
10//! primitive quartet, which is the win at high contraction degree.
11//!
12//! It is the engineering counterpart of the [`crate::rys`] engine: same Coulomb
13//! kernel, same row-major `(a,b,c,d)` block layout, validated element-for-element
14//! against it (and against an independent McMurchie–Davidson path).
15//!
16//! ## Method
17//!
18//! For a primitive quartet with exponents `α,β` on the bra centres `A,B` and
19//! `γ,δ` on the ket centres `C,D`, with `p=α+β`, `q=γ+δ`, `P,Q` the Gaussian
20//! product centres, `W=(pP+qQ)/(p+q)`, `ρ=pq/(p+q)`, and `T=ρ|P−Q|²`:
21//!
22//! ```text
23//!   [00|00]^(m) = 2π^{5/2}/(p q √(p+q)) · K_ab · K_cd · F_m(T)
24//! ```
25//!
26//! The VRR raises angular momentum on `A` (the bra, index `e`) and `C` (the ket,
27//! index `f`) using the standard OS ERI relations (Obara–Saika 1986; HGP 1988):
28//!
29//! ```text
30//!   [e+1_i,0|f0]^(m) = (P−A)_i[e0|f0]^(m) + (W−P)_i[e0|f0]^(m+1)
31//!       + e_i/2p ( [e−1_i,0|f0]^(m) − (q/(p+q))[e−1_i,0|f0]^(m+1) )
32//!       + f_i/2(p+q) [e0|f−1_i,0]^(m+1)
33//!   [e0|f+1_i,0]^(m) = (Q−C)_i[e0|f0]^(m) + (W−Q)_i[e0|f0]^(m+1)
34//!       + f_i/2q ( [e0|f−1_i,0]^(m) − (p/(p+q))[e0|f−1_i,0]^(m+1) )
35//!       + e_i/2(p+q) [e−1_i,0|f0]^(m+1)
36//! ```
37//!
38//! After contracting `[e0|f0]^(0)` over the primitive quartet, the HRR builds the
39//! Cartesian shell block:
40//!
41//! ```text
42//!   (a,b+1_i | f0) = (a+1_i,b | f0) + (A−B)_i (a,b | f0)   [bra, A→B]
43//!   (ab | c,d+1_i) = (ab | c+1_i,d) + (C−D)_i (ab | c,d)   [ket, C→D]
44//! ```
45//!
46//! ## Buffers
47//!
48//! The VRR table `[e0|f0]^(m)` is **3D and `W`-coupled** (the recurrence mixes
49//! Cartesian axes), so unlike the axis-separable 1D tables of the one-electron and
50//! Rys engines it cannot be a small fixed stack array — a `MAX_L` stack buffer
51//! would be tens of MB, and even a heap copy of the *full* table is ~41 MB at
52//! `(ii|ii)`. The current engine removes that:
53//!
54//! - **m-marching VRR** (`vrr_primitive`): the VRR never materialises the full
55//!   `e×f×m` table. It marches over the ket `f`-degree keeping only a rolling window
56//!   of **3 consecutive `f`-degree levels**, each **triangle-packed** in the Boys
57//!   index `m`. Resident VRR footprint is `3·max_k[n_cart(k)·slab_k]` (≈ 4.3 MB at
58//!   `(ii|ii)`, vs the old 41 MB single table), plus the `n_e·n_f` contracted table.
59//! - **flat-array HRR** (`hrr_and_scatter`): contiguous arrays indexed by `addr`
60//!   / [`cart_index`] with two rolling degree layers, replacing the former HashMap
61//!   memoisation.
62//! - **reusable arena** ([`EriScratch`]): all buffers are allocated once and reused
63//!   across quartets (thread-local by default), not re-allocated per quartet.
64//!
65//! All of this stays in safe Rust (`#![forbid(unsafe_code)]`) and reproduces the
66//! former full-table engine's values, cross-checked against the Rys engine and an
67//! independent McMurchie–Davidson path (`tests/eri_cross_algorithm.rs`).
68
69use std::cell::RefCell;
70
71use integral_math::am::{cart_components, cart_index, n_cart};
72use integral_math::boys::{boys_array, boys_array2, boys_array4};
73
74use crate::os::{Vec3, MAX_L};
75
76/// A contracted Cartesian shell as seen by the HGP engine: its centre, angular
77/// momentum, and primitive `(exponent, effective-coefficient)` data. The
78/// coefficients are the driver's *effective* coefficients `d_i · N(α_i, l)` — the
79/// engine itself works on un-normalised monomials and only multiplies the four
80/// coefficients into the contracted accumulator.
81#[derive(Debug, Clone, Copy)]
82pub struct ShellRef<'a> {
83    /// Shell centre (bohr).
84    pub center: Vec3,
85    /// Angular momentum.
86    pub l: usize,
87    /// Primitive exponents.
88    pub exps: &'a [f64],
89    /// Effective contraction coefficients, aligned with `exps`.
90    pub coeffs: &'a [f64],
91}
92
93/// Number of Cartesian triples of degree `0..=lmax`: `(L+1)(L+2)(L+3)/6`.
94#[inline]
95fn n_addr(lmax: usize) -> usize {
96    (lmax + 1) * (lmax + 2) * (lmax + 3) / 6
97}
98
99/// Number of Cartesian triples of degree **strictly less than** `d`:
100/// `d(d+1)(d+2)/6`. This is the cumulative base of degree `d` in `addr`
101/// (`addr(t) = tri_below(deg) + cart_index(t)`) and `n_addr(L) = tri_below(L+1)`.
102#[inline]
103fn tri_below(d: usize) -> usize {
104    d * (d + 1) * (d + 2) / 6
105}
106
107/// Global address of a Cartesian triple within the cumulative `0..=` ordering,
108/// consistent with [`integral_math::am`]'s per-degree canonical order. Bijective onto
109/// `0..n_addr(deg)`.
110#[inline]
111fn addr(t: [usize; 3]) -> usize {
112    let n = t[0] + t[1] + t[2];
113    let base = n * (n + 1) * (n + 2) / 6;
114    // within-degree index: (n-lx)(n-lx+1)/2 + lz
115    let within = (n - t[0]) * (n - t[0] + 1) / 2 + t[2];
116    base + within
117}
118
119/// Reusable scratch arena for the OS/HGP ERI engine.
120///
121/// Holds the engine's working buffers so they are allocated **once and reused**
122/// across shell quartets, instead of a fresh heap allocation per quartet inside the
123/// O(n⁴) shell loop. All buffers are plain `Vec<f64>` grown on demand in safe Rust
124/// (`#![forbid(unsafe_code)]` holds); there is no shared mutable state, so each
125/// thread uses its own instance — the thread-local default behind
126/// [`coulomb_shell_into`], or one passed explicitly to
127/// [`coulomb_shell_into_scratch`] (the `&mut` makes cross-thread sharing a compile
128/// error).
129///
130/// **Memory-correctness.** `c_ef` is an accumulator and is **zeroed per quartet**.
131/// The VRR `levels` and HRR `bra`/layer buffers are fully overwritten in the region
132/// they are later read, so they need no functional zeroing — but in debug builds
133/// `levels` and `bra` are NaN-filled per quartet, so any accidental out-of-range
134/// read (e.g. outside the VRR `m`-triangle) poisons the output and trips the tests
135/// rather than silently reading a stale value from a previous quartet. Results are
136/// therefore independent of evaluation order and of arena reuse (asserted by
137/// `tests/arena.rs`).
138#[derive(Debug, Default, Clone)]
139pub struct EriScratch {
140    /// VRR rolling window: 3 triangle-packed f-degree levels (slot = `k % 3`).
141    levels: Vec<f64>,
142    /// Contracted `[e0|f0]^(0)` accumulator, shape `[n_e·n_f]`.
143    c_ef: Vec<f64>,
144    /// HRR bra intermediate `(a b | f 0)`, shape `[na·nb·nf_range]`.
145    bra: Vec<f64>,
146    /// HRR bra rolling `b`-degree layers.
147    bra_prev: Vec<f64>,
148    bra_cur: Vec<f64>,
149    /// HRR ket rolling `d`-degree layers.
150    ket_prev: Vec<f64>,
151    ket_cur: Vec<f64>,
152    /// Precomputed bra/ket primitive-pair data (combined exponent, product
153    /// centre, `K` prefactor, coeff product) — built once per shell quartet so
154    /// the per-pair `exp`/centre work is not repeated inside the deep primitive
155    /// loop. Reused across quartets like the other buffers.
156    bra_pairs: Vec<PrimPair>,
157    ket_pairs: Vec<PrimPair>,
158    /// Shared Cartesian-triple table `tri_all[d] = cart_components(d)` for every
159    /// degree the engine needs (`0..=2·MAX_L`). Built once and sliced for the VRR
160    /// `e`/`f` lists and the HRR triples, instead of re-`collect`ing per quartet.
161    tri_all: Vec<Vec<[usize; 3]>>,
162    /// Flat VRR offset table: `eoff[k·n_e + ae]` is `e`'s packed offset in level
163    /// `k`. Rebuilt per quartet into this reused buffer (no per-quartet `Vec<Vec>`
164    /// allocation); `slab[k]` is one `f`-component's packed size for level `k`.
165    eoff: Vec<usize>,
166    slab: Vec<usize>,
167}
168
169/// One primitive pair's reusable geometry/prefactor, shared by every primitive
170/// quartet that uses it: the combined exponent `zeta = α+β`, the Gaussian product
171/// centre, the overlap prefactor `K = exp(−αβ/ζ·|A−B|²)`, and the two contraction
172/// coefficients (`c1` outer, `c2` inner).
173///
174/// The coefficients are kept **un-multiplied** so the contracted `scale` can be
175/// formed in exactly the former evaluation order `((c_a·c_b)·c_c)·c_d` — making the
176/// engine's output bit-identical to the pre-precomputation code (the per-element
177/// and 8-fold-symmetry tests are floorless-relative and sensitive to the last ULP
178/// on near-cancellation elements).
179#[derive(Debug, Clone, Copy, Default)]
180struct PrimPair {
181    zeta: f64,
182    center: Vec3,
183    kappa: f64,
184    c1: f64,
185    c2: f64,
186    /// `1/(2ζ)` — the VRR `inv_2p`(bra)/`inv_2q`(ket) coefficient. Pure per-pair.
187    inv_2zeta: f64,
188    /// `centre − s1.centre` — the VRR `P−A`(bra)/`Q−C`(ket) shift. Pure per-pair.
189    r1: Vec3,
190}
191
192/// A primitive pair whose weighted overlap prefactor `K·|c₁·c₂|` falls below
193/// this bound contributes at most ~`1e-24` (absolutely) to any tensor element —
194/// roughly twelve orders below the engine's `1e-12` accuracy bar — so it is
195/// dropped from the pair list before the quartet loop. Every `[e0|f0]^(m)` term
196/// the pair could produce is bounded by `K·|c₁·c₂|` times the other pair's
197/// weight (≤ its own bound), the `2π^{5/2}/(pq√(p+q))` prefactor, `F_m ≤ 1`,
198/// and the VRR's geometric raise factors; the product stays ≤ ~1e-24 for any
199/// chemically sensible exponents/geometry. The deeply contracted cross-centre
200/// pairs this removes have `K` underflowing to `0.0` outright in typical bases
201/// (tight×tight pairs at bonding distances reach `exp(-10⁴)`), where skipping
202/// is *exactly* lossless.
203const PAIR_NEGLIGIBLE: f64 = 1e-32;
204
205/// Fill `out` with the [`PrimPair`] data for every primitive combination of two
206/// shells (outer `s1`, inner `s2`), matching the former in-loop computation
207/// element-for-element. The inter-centre distance is hoisted out of the pair loop.
208/// `inv_2zeta` and `r1` are the pair-only VRR coefficients (`1/2ζ` and the `P−A` /
209/// `Q−C` shift) that the kernel formerly recomputed on every primitive quartet.
210/// Pairs below [`PAIR_NEGLIGIBLE`] are omitted; the surviving pairs keep their
211/// order, so the contraction accumulates the remaining terms in the same
212/// sequence as before.
213fn build_pairs(out: &mut Vec<PrimPair>, s1: ShellRef<'_>, s2: ShellRef<'_>) {
214    out.clear();
215    out.reserve(s1.exps.len() * s2.exps.len());
216    let d2 = dist2(s1.center, s2.center);
217    for (&e1, &c1) in s1.exps.iter().zip(s1.coeffs.iter()) {
218        for (&e2, &c2) in s2.exps.iter().zip(s2.coeffs.iter()) {
219            let zeta = e1 + e2;
220            let kappa = (-(e1 * e2 / zeta) * d2).exp();
221            if kappa * (c1 * c2).abs() < PAIR_NEGLIGIBLE {
222                continue;
223            }
224            let center = combine(e1, s1.center, e2, s2.center, zeta);
225            out.push(PrimPair {
226                zeta,
227                center,
228                kappa,
229                c1,
230                c2,
231                inv_2zeta: 0.5 / zeta,
232                r1: sub(center, s1.center),
233            });
234        }
235    }
236}
237
238/// Precomputed, screened [`PrimPair`] list of one **ordered** shell pair —
239/// libcint's pair-data ("optimizer") precompute. A dense driver builds one per
240/// canonical shell pair once per build (`O(n_shells²)`, trivial memory) and
241/// passes borrowed lists to [`coulomb_shell_pairs_into_scratch`] /
242/// [`coulomb_shell_batch4_pairs_into_scratch`], instead of the engine rebuilding
243/// the same pairs on every quartet that shares the pair.
244///
245/// The contents and order are exactly what the self-building entries compute
246/// internally ([`build_pairs`]), so routing through borrowed lists is
247/// bit-identical. **Orientation matters**: `build_pairs(s1, s2)` is
248/// order-sensitive (`r1 = P − A` uses `s1`'s centre; `c1`/`c2` keep the
249/// `s1`-outer/`s2`-inner coefficient association), so a list built for `(i, j)`
250/// must only be used where the engine would have built `(i, j)` — the canonical
251/// drivers use the `i ≥ j` orientation for both bra and ket throughout.
252#[derive(Debug, Clone, Default)]
253pub struct ShellPairData {
254    pairs: Vec<PrimPair>,
255}
256
257impl ShellPairData {
258    /// Number of surviving primitive pairs (the [`PAIR_NEGLIGIBLE`] screen
259    /// applied) — what [`surviving_pair_count`] returns for the same pair.
260    #[must_use]
261    pub fn len(&self) -> usize {
262        self.pairs.len()
263    }
264
265    /// `true` when no primitive pair survives the negligibility screen.
266    #[must_use]
267    pub fn is_empty(&self) -> bool {
268        self.pairs.is_empty()
269    }
270}
271
272/// Build the screened pair list for the **ordered** shell pair `(s1, s2)` —
273/// element-for-element what the self-building engine entries compute per
274/// quartet (see [`ShellPairData`] for the orientation contract).
275#[must_use]
276pub fn shell_pair_data(s1: ShellRef<'_>, s2: ShellRef<'_>) -> ShellPairData {
277    let mut pairs = Vec::new();
278    build_pairs(&mut pairs, s1, s2);
279    ShellPairData { pairs }
280}
281
282impl EriScratch {
283    /// A fresh, empty arena; it grows to fit on first use.
284    #[must_use]
285    pub fn new() -> Self {
286        Self::default()
287    }
288
289    /// Total `f64` elements currently held across all buffers — the resident
290    /// working set, for memory reporting/tests.
291    #[must_use]
292    pub fn resident_f64(&self) -> usize {
293        self.levels.len()
294            + self.c_ef.len()
295            + self.bra.len()
296            + self.bra_prev.len()
297            + self.bra_cur.len()
298            + self.ket_prev.len()
299            + self.ket_cur.len()
300    }
301
302    /// Largest single buffer (`f64` elements). Demonstrates the former ~41 MB
303    /// monolithic VRR `[e0|f0]^(m)` table is no longer resident: the m-marching
304    /// window keeps only `3·max_k[n_cart(k)·slab_k]`.
305    #[must_use]
306    pub fn largest_buffer_f64(&self) -> usize {
307        [
308            self.levels.len(),
309            self.c_ef.len(),
310            self.bra.len(),
311            self.bra_prev.len(),
312            self.bra_cur.len(),
313            self.ket_prev.len(),
314            self.ket_cur.len(),
315        ]
316        .into_iter()
317        .max()
318        .unwrap_or(0)
319    }
320}
321
322thread_local! {
323    /// Per-thread default arena backing [`coulomb_shell_into`].
324    static ERI_SCRATCH: RefCell<EriScratch> = RefCell::new(EriScratch::new());
325}
326
327/// Grow `v` to at least `n` elements (reusing existing capacity); never shrinks.
328#[inline]
329fn ensure_len(v: &mut Vec<f64>, n: usize) {
330    if v.len() < n {
331        v.resize(n, 0.0);
332    }
333}
334
335/// `ensure_len` for the `usize` offset/slab buffers.
336#[inline]
337fn ensure_usize(v: &mut Vec<usize>, n: usize) {
338    if v.len() < n {
339        v.resize(n, 0);
340    }
341}
342
343/// Accumulate the contracted Coulomb block `(ab|cd)` for four shells into the
344/// row-major `out` block (shape `[n_cart(la)·n_cart(lb)·n_cart(lc)·n_cart(ld)]`,
345/// the same `(a,b,c,d)` layout as [`crate::rys::coulomb_into`]).
346///
347/// The engine owns the primitive-quartet loop (the HGP contraction happens between
348/// VRR and HRR), so unlike the Rys engine the driver calls this **once per shell
349/// quartet** with all primitives. This uses a **thread-local** [`EriScratch`]; for
350/// explicit control (e.g. one arena per worker thread) call
351/// [`coulomb_shell_into_scratch`].
352///
353/// # Panics
354/// Panics (in debug builds) if any `l > MAX_L` or `out` is too short.
355pub fn coulomb_shell_into(
356    a: ShellRef<'_>,
357    b: ShellRef<'_>,
358    c: ShellRef<'_>,
359    d: ShellRef<'_>,
360    out: &mut [f64],
361) {
362    ERI_SCRATCH.with(|s| coulomb_shell_into_scratch(&mut s.borrow_mut(), a, b, c, d, out));
363}
364
365/// Like [`coulomb_shell_into`] (thread-local arena) but with the bra/ket
366/// primitive-pair data supplied by the caller — the borrowed-pairs analogue,
367/// see [`ShellPairData`]. Bit-identical to [`coulomb_shell_into`].
368///
369/// # Panics
370/// Panics (in debug builds) if any `l > MAX_L` or `out` is too short.
371pub fn coulomb_shell_pairs_into(
372    a: ShellRef<'_>,
373    b: ShellRef<'_>,
374    c: ShellRef<'_>,
375    d: ShellRef<'_>,
376    bra_pairs: &ShellPairData,
377    ket_pairs: &ShellPairData,
378    out: &mut [f64],
379) {
380    ERI_SCRATCH.with(|s| {
381        coulomb_shell_pairs_into_scratch(
382            &mut s.borrow_mut(),
383            a,
384            b,
385            c,
386            d,
387            bra_pairs,
388            ket_pairs,
389            out,
390        );
391    });
392}
393
394/// Like [`coulomb_shell_into`] but evaluates into the caller-provided
395/// [`EriScratch`], reused across quartets to avoid per-quartet heap allocation. Use
396/// **one instance per thread** (sharing a `&mut EriScratch` across threads is a
397/// compile error); the result is bit-identical regardless of which arena is used or
398/// what it last held.
399///
400/// # Panics
401/// Panics (in debug builds) if any `l > MAX_L` or `out` is too short.
402pub fn coulomb_shell_into_scratch(
403    scratch: &mut EriScratch,
404    a: ShellRef<'_>,
405    b: ShellRef<'_>,
406    c: ShellRef<'_>,
407    d: ShellRef<'_>,
408    out: &mut [f64],
409) {
410    // Build the pair lists into the arena's reused buffers, then run the core
411    // on borrowed slices. The vecs are moved out for the duration of the call
412    // (a pointer swap) so the core can take `&mut` of the rest of the arena.
413    let mut bra_pairs = std::mem::take(&mut scratch.bra_pairs);
414    let mut ket_pairs = std::mem::take(&mut scratch.ket_pairs);
415    build_pairs(&mut bra_pairs, a, b);
416    build_pairs(&mut ket_pairs, c, d);
417    coulomb_shell_core(scratch, a, b, c, d, &bra_pairs, &ket_pairs, out);
418    scratch.bra_pairs = bra_pairs;
419    scratch.ket_pairs = ket_pairs;
420}
421
422/// Like [`coulomb_shell_into_scratch`] but with the bra/ket primitive-pair data
423/// supplied by the caller (precomputed once per shell pair across a dense
424/// build, see [`ShellPairData`]), instead of rebuilt per quartet. Bit-identical:
425/// the pair values and their order are exactly what the self-building entry
426/// computes — only *where* they are computed moves.
427///
428/// # Panics
429/// Panics (in debug builds) if any `l > MAX_L` or `out` is too short.
430#[allow(clippy::too_many_arguments)]
431pub fn coulomb_shell_pairs_into_scratch(
432    scratch: &mut EriScratch,
433    a: ShellRef<'_>,
434    b: ShellRef<'_>,
435    c: ShellRef<'_>,
436    d: ShellRef<'_>,
437    bra_pairs: &ShellPairData,
438    ket_pairs: &ShellPairData,
439    out: &mut [f64],
440) {
441    coulomb_shell_core(scratch, a, b, c, d, &bra_pairs.pairs, &ket_pairs.pairs, out);
442}
443
444/// The quartet kernel shared by the self-building and borrowed-pairs entries:
445/// fast path / VRR dispatch / HRR over caller-provided pair slices.
446#[allow(clippy::too_many_arguments)]
447fn coulomb_shell_core(
448    scratch: &mut EriScratch,
449    a: ShellRef<'_>,
450    b: ShellRef<'_>,
451    c: ShellRef<'_>,
452    d: ShellRef<'_>,
453    bra_pairs: &[PrimPair],
454    ket_pairs: &[PrimPair],
455    out: &mut [f64],
456) {
457    let (la, lb, lc, ld) = (a.l, b.l, c.l, d.l);
458    debug_assert!(
459        la <= MAX_L && lb <= MAX_L && lc <= MAX_L && ld <= MAX_L,
460        "angular momentum exceeds MAX_L"
461    );
462    let (na, nb, nc, nd) = (n_cart(la), n_cart(lb), n_cart(lc), n_cart(ld));
463    debug_assert!(out.len() >= na * nb * nc * nd, "ERI output block too short");
464
465    let ne = la + lb; // max bra (A-side) degree built by VRR
466    let nf = lc + ld; // max ket (C-side) degree built by VRR
467    let l_total = ne + nf;
468    let n_e = n_addr(ne);
469    let n_f = n_addr(nf);
470
471    let EriScratch {
472        levels,
473        c_ef,
474        bra,
475        bra_prev,
476        bra_cur,
477        ket_prev,
478        ket_cur,
479        tri_all,
480        eoff,
481        slab,
482        ..
483    } = scratch;
484
485    // Fast path: (ss|ss). With no angular momentum to raise, the whole quartet is a
486    // single contracted [00|00]^(0). Skip the VRR/HRR machinery (the dead raise
487    // coefficients, the `levels` buffer round-trip, the per-primitive call) and sum
488    // straight into a register. Bit-identical to the general path: same `pref`/`t`
489    // expressions, same `((c_a·c_b)·c_c)·c_d` scale, same `scale·(pref·F_0)` term,
490    // same bra-outer/ket-inner order — just without the scaffolding.
491    if l_total == 0 {
492        let two_pi_2_5 =
493            2.0 * std::f64::consts::PI * std::f64::consts::PI * std::f64::consts::PI.sqrt();
494        let mut acc = 0.0;
495        let mut f0 = [0.0f64; 1];
496        let mut f1 = [0.0f64; 1];
497        for bra in bra_pairs.iter() {
498            let bc = bra.c1 * bra.c2;
499            let p = bra.zeta;
500            // Kets in PAIRS through the interleaved `boys_array2` (per-lane
501            // arithmetic and the lane-0-then-lane-1 accumulation order are
502            // exactly the former sequential loop's, so this is bit-identical).
503            let mut kets = ket_pairs.chunks_exact(2);
504            for pair in kets.by_ref() {
505                let (k0, k1) = (&pair[0], &pair[1]);
506                let pq0 = p + k0.zeta;
507                let pq1 = p + k1.zeta;
508                let t0 = (p * k0.zeta / pq0) * dist2(bra.center, k0.center);
509                let t1 = (p * k1.zeta / pq1) * dist2(bra.center, k1.center);
510                let pref0 = two_pi_2_5 / (p * k0.zeta * pq0.sqrt()) * bra.kappa * k0.kappa;
511                let pref1 = two_pi_2_5 / (p * k1.zeta * pq1.sqrt()) * bra.kappa * k1.kappa;
512                boys_array2(0, t0, t1, &mut f0, &mut f1);
513                acc += (bc * k0.c1) * k0.c2 * (pref0 * f0[0]);
514                acc += (bc * k1.c1) * k1.c2 * (pref1 * f1[0]);
515            }
516            for ket in kets.remainder() {
517                let q = ket.zeta;
518                let pq = p + q;
519                let t = (p * q / pq) * dist2(bra.center, ket.center);
520                let pref = two_pi_2_5 / (p * q * pq.sqrt()) * bra.kappa * ket.kappa;
521                boys_array(0, t, &mut f0);
522                acc += (bc * ket.c1) * ket.c2 * (pref * f0[0]);
523            }
524        }
525        out[0] += acc;
526        return;
527    }
528
529    // Contracted [e0|f0]^(0): zeroed per quartet (it is a `+=` accumulator).
530    ensure_len(c_ef, n_e * n_f);
531    c_ef[..n_e * n_f].fill(0.0);
532
533    // Shared Cartesian-triple table, built once (degrees `0..=2·MAX_L` cover every
534    // `ne`/`nf`/`max(ne,nf)` the engine reaches) and sliced below — no per-quartet
535    // `cart_components` allocation.
536    if tri_all.len() <= 2 * MAX_L {
537        *tri_all = (0..=2 * MAX_L).map(cart_components).collect();
538    }
539
540    // VRR. The small classes (`ne, nf ≤ 3` — every quartet shape of an
541    // spd basis except the d,d-bra/ket pairs, ~96% of a cc-pVDZ-style build)
542    // dispatch to the monomorphized kernels, where the whole structural walk
543    // (triples, addresses, branches, m-ranges) is resolved at compile time;
544    // everything else runs the general m-marching `vrr_primitive`.
545    // `(0,0)` already returned through the (ss|ss) fast path above.
546    match (ne, nf) {
547        (0, 1) => contract_class::<0, 1>(bra_pairs, ket_pairs, c_ef),
548        (1, 0) => contract_class::<1, 0>(bra_pairs, ket_pairs, c_ef),
549        (1, 1) => contract_class::<1, 1>(bra_pairs, ket_pairs, c_ef),
550        (0, 2) => contract_class::<0, 2>(bra_pairs, ket_pairs, c_ef),
551        (2, 0) => contract_class::<2, 0>(bra_pairs, ket_pairs, c_ef),
552        (1, 2) => contract_class::<1, 2>(bra_pairs, ket_pairs, c_ef),
553        (2, 1) => contract_class::<2, 1>(bra_pairs, ket_pairs, c_ef),
554        (2, 2) => contract_class::<2, 2>(bra_pairs, ket_pairs, c_ef),
555        (0, 3) => contract_class::<0, 3>(bra_pairs, ket_pairs, c_ef),
556        (3, 0) => contract_class::<3, 0>(bra_pairs, ket_pairs, c_ef),
557        (1, 3) => contract_class::<1, 3>(bra_pairs, ket_pairs, c_ef),
558        (3, 1) => contract_class::<3, 1>(bra_pairs, ket_pairs, c_ef),
559        (2, 3) => contract_class::<2, 3>(bra_pairs, ket_pairs, c_ef),
560        (3, 2) => contract_class::<3, 2>(bra_pairs, ket_pairs, c_ef),
561        (3, 3) => contract_class::<3, 3>(bra_pairs, ket_pairs, c_ef),
562        _ => {
563            // m-marching VRR scratch. Instead of the full `n_e·n_f·nm`
564            // table (~41 MB at `(ii|ii)`), keep only a rolling window of **3 consecutive
565            // f-degree levels**, each **triangle-packed** in the Boys index `m`: for an
566            // `f`-degree `k`, an `e` of degree `de` stores only `m ∈ 0..=(l_total−de−k)`.
567            // Level `k` lives in slot `k % 3` of `levels`; `slab[k]` is one `f`-component's
568            // packed size and `eoff[k·n_e + ae]` is `e`'s offset within it (flat, reused).
569            ensure_usize(slab, nf + 1);
570            ensure_usize(eoff, (nf + 1) * n_e);
571            // `k` indexes both `slab` and the flat `eoff` block base `k·n_e`, so a range
572            // loop is the clear form here.
573            #[allow(clippy::needless_range_loop)]
574            for k in 0..=nf {
575                let base = k * n_e;
576                let mut run = 0usize;
577                let mut ae = 0usize;
578                for de in 0..=ne {
579                    let mlen = l_total - de - k + 1; // ≥ 1: de + k ≤ ne + nf = l_total
580                    for _ in 0..n_cart(de) {
581                        eoff[base + ae] = run;
582                        ae += 1;
583                        run += mlen;
584                    }
585                }
586                slab[k] = run;
587            }
588            let maxlevel = (0..=nf).map(|k| n_cart(k) * slab[k]).max().unwrap_or(1);
589            ensure_len(levels, 3 * maxlevel);
590            // Debug guard: poison the reused VRR window so any out-of-triangle / stale read
591            // surfaces as a NaN in the output (caught by the golden + cross-engine tests),
592            // instead of silently using a previous quartet's value.
593            #[cfg(debug_assertions)]
594            levels[..3 * maxlevel].fill(f64::NAN);
595
596            // Reborrow the offset/triple buffers as shared slices for the read-only kernel.
597            let (eoff_s, slab_s, tri_s) = (&eoff[..], &slab[..], &tri_all[..]);
598            for bra in bra_pairs.iter() {
599                let bra_coef = bra.c1 * bra.c2; // = c_a·c_b, hoisted out of the ket loop
600                for ket in ket_pairs.iter() {
601                    // ((c_a·c_b)·c_c)·c_d — the former left-to-right product, bit for bit.
602                    let scale = (bra_coef * ket.c1) * ket.c2;
603                    vrr_primitive(
604                        bra.zeta,
605                        ket.zeta,
606                        bra.center,
607                        ket.center,
608                        bra.kappa,
609                        ket.kappa,
610                        bra.r1,
611                        ket.r1,
612                        bra.inv_2zeta,
613                        ket.inv_2zeta,
614                        ne,
615                        nf,
616                        l_total,
617                        n_e,
618                        n_f,
619                        maxlevel,
620                        eoff_s,
621                        slab_s,
622                        tri_s,
623                        levels,
624                        scale,
625                        c_ef,
626                    );
627                }
628            }
629        }
630    }
631
632    // HRR in contracted space, then scatter the block.
633    let ab = sub(a.center, b.center); // A − B
634    let cd = sub(c.center, d.center); // C − D
635    hrr_and_scatter(
636        la,
637        lb,
638        lc,
639        ld,
640        n_f,
641        c_ef,
642        ab,
643        cd,
644        out,
645        bra,
646        bra_prev,
647        bra_cur,
648        ket_prev,
649        ket_cur,
650        &tri_all[..],
651    );
652}
653
654/// Cartesian triples of degree 1, 2 and 3 in [`cart_components`] order, as `const`
655/// tables so the monomorphized small-class VRR kernels fully unroll over them
656/// (the structural walk — axes, addresses, `has2`/cross branches — then resolves
657/// at compile time instead of being re-derived on every primitive quartet).
658const TRI1: [[usize; 3]; 3] = [[1, 0, 0], [0, 1, 0], [0, 0, 1]];
659const TRI2: [[usize; 3]; 6] = [
660    [2, 0, 0],
661    [1, 1, 0],
662    [1, 0, 1],
663    [0, 2, 0],
664    [0, 1, 1],
665    [0, 0, 2],
666];
667const TRI3: [[usize; 3]; 10] = [
668    [3, 0, 0],
669    [2, 1, 0],
670    [2, 0, 1],
671    [1, 2, 0],
672    [1, 1, 1],
673    [1, 0, 2],
674    [0, 3, 0],
675    [0, 2, 1],
676    [0, 1, 2],
677    [0, 0, 3],
678];
679/// The e-components `0..n_addr(3)` in [`addr`] order with their degree —
680/// the bra index set of the small-class kernels (`NE ≤ 3`).
681const E_COMP: [([usize; 3], usize); 20] = [
682    ([0, 0, 0], 0),
683    ([1, 0, 0], 1),
684    ([0, 1, 0], 1),
685    ([0, 0, 1], 1),
686    ([2, 0, 0], 2),
687    ([1, 1, 0], 2),
688    ([1, 0, 1], 2),
689    ([0, 2, 0], 2),
690    ([0, 1, 1], 2),
691    ([0, 0, 2], 2),
692    ([3, 0, 0], 3),
693    ([2, 1, 0], 3),
694    ([2, 0, 1], 3),
695    ([1, 2, 0], 3),
696    ([1, 1, 1], 3),
697    ([1, 0, 2], 3),
698    ([0, 3, 0], 3),
699    ([0, 2, 1], 3),
700    ([0, 1, 2], 3),
701    ([0, 0, 3], 3),
702];
703
704/// Per-primitive-quartet scalar header consumed by [`vrr_small`]: everything the
705/// kernel needs that depends on the *pair of pairs* (not on either pair alone).
706/// The expressions are the former `vrr_small` prologue, verbatim — only *where*
707/// they are evaluated moves (into the strip loop of [`contract_class`]).
708#[derive(Clone, Copy, Default)]
709struct QuartetHeader {
710    wp: [f64; 3], // W − P
711    wq: [f64; 3], // W − Q
712    pref: f64,
713    inv_2pq: f64,
714    q_over_pq: f64,
715    p_over_pq: f64,
716    scale: f64, // ((c_a·c_b)·c_c)·c_d
717    t: f64,     // Boys argument ρ·|P−Q|²
718}
719
720/// Ket-strip width of the strip-mined primitive loop in [`contract_class`].
721const STRIP: usize = 4;
722
723/// Contract a whole shell quartet of VRR shape `(NE, NF)` (with `NE, NF ≤ 3`)
724/// into `c_ef` using the monomorphized small-class kernel [`vrr_small`].
725///
726/// Same bra-outer / ket-inner primitive order and the same left-to-right
727/// `((c_a·c_b)·c_c)·c_d` scale as the general path, so the accumulation into
728/// `c_ef` is bit-identical. The ket loop is **strip-mined**: for a strip of up
729/// to [`STRIP`] ket pairs, first compute every scalar header (the serial
730/// sqrt/divide chain) plus its Boys row in one lane loop, then run the VRR
731/// body per lane — breaking the per-quartet header→Boys→VRR dependency chain
732/// so a lane's Boys/header work overlaps another lane's VRR in the pipeline.
733/// (A measured note: splitting Boys into its own third pass *regressed* —
734/// the extra `t` store/reload and loop overhead cost more than the split
735/// bought; the two-pass form is what wins.) Each lane's arithmetic
736/// (expressions, evaluation order, accumulation order into `c_ef`) is
737/// unchanged, so the result stays bit-identical.
738/// The VRR working tables live here (overwritten per primitive
739/// quartet in the region read, like the general path's `levels` arena) so
740/// their one-time zero-init is per shell quartet, not per primitive.
741fn contract_class<const NE: usize, const NF: usize>(
742    bra_pairs: &[PrimPair],
743    ket_pairs: &[PrimPair],
744    c_ef: &mut [f64],
745) {
746    // Worst-case (NE,NF)=(3,3) sizes: e-table 20 components × m ∈ 0..=6;
747    // f-degree-1 level 20×3 × m ∈ 0..=5; f-degree-2 level 20×6 × m ∈ 0..=4;
748    // f-degree-3 level 20×10 × m ∈ 0..=3.
749    let mut ea = [0.0f64; 140];
750    let mut eb1 = [0.0f64; 360];
751    let mut eb2 = [0.0f64; 600];
752    let mut eb3 = [0.0f64; 800];
753    let lt = NE + NF;
754    let mut hdr = [QuartetHeader::default(); STRIP];
755    let mut fm = [[0.0f64; 7]; STRIP];
756    for bra in bra_pairs {
757        let bra_coef = bra.c1 * bra.c2;
758        let (p, pc) = (bra.zeta, bra.center);
759        let mut start = 0;
760        while start < ket_pairs.len() {
761            let strip = &ket_pairs[start..(start + STRIP).min(ket_pairs.len())];
762            // Phase 1 — headers + Boys rows: identical expressions to
763            // `vrr_primitive`, independent iterations. Lanes are processed in
764            // PAIRS so the two Boys ladders evaluate through the manually
765            // interleaved [`boys_array2`] (two overlapping Horner/recurrence
766            // chains); each lane's own arithmetic is unchanged.
767            let header = |h: &mut QuartetHeader, ket: &PrimPair| {
768                let (q, qc) = (ket.zeta, ket.center);
769                let pq = p + q;
770                let rho = p * q / pq;
771                let pq_vec = sub(pc, qc); // P − Q
772                h.t = rho * norm2(pq_vec);
773                let w = [
774                    (p * pc[0] + q * qc[0]) / pq,
775                    (p * pc[1] + q * qc[1]) / pq,
776                    (p * pc[2] + q * qc[2]) / pq,
777                ];
778                h.wp = sub(w, pc);
779                h.wq = sub(w, qc);
780                use std::f64::consts::PI;
781                h.pref = 2.0 * PI * PI * PI.sqrt() / (p * q * pq.sqrt()) * bra.kappa * ket.kappa;
782                h.inv_2pq = 0.5 / pq;
783                h.q_over_pq = q / pq;
784                h.p_over_pq = p / pq;
785                h.scale = (bra_coef * ket.c1) * ket.c2;
786            };
787            let len = strip.len();
788            let mut k = 0;
789            while k + 2 <= len {
790                header(&mut hdr[k], &strip[k]);
791                header(&mut hdr[k + 1], &strip[k + 1]);
792                let (f0, f1) = fm.split_at_mut(k + 1);
793                boys_array2(
794                    lt,
795                    hdr[k].t,
796                    hdr[k + 1].t,
797                    &mut f0[k][..=lt],
798                    &mut f1[0][..=lt],
799                );
800                k += 2;
801            }
802            if k < len {
803                header(&mut hdr[k], &strip[k]);
804                boys_array(lt, hdr[k].t, &mut fm[k][..=lt]);
805            }
806            // Phase 2 — VRR body per lane, fed from the precomputed header.
807            for (k, ket) in strip.iter().enumerate() {
808                vrr_small::<NE, NF>(
809                    bra, ket, &hdr[k], &fm[k], &mut ea, &mut eb1, &mut eb2, &mut eb3, c_ef,
810                );
811            }
812            start += STRIP;
813        }
814    }
815}
816
817/// One primitive quartet of the OS VRR for the small classes `NE, NF ≤ 3`,
818/// monomorphized per `(NE, NF)` — a per-L-class specialized kernel expressed
819/// through const generics: every loop bound, Cartesian triple, address, stride,
820/// and `has2`/cross branch below is a compile-time constant in each
821/// instantiation, so the emitted code is a straight unrolled FMA chain.
822///
823/// **Bit-identical to [`vrr_primitive`] by construction**: the same recurrence
824/// expressions with the same term order and association — base
825/// `pref·F_m`, raise `pa_i·s1[m] + wp_i·s1[m+1]`, then `+= coef2·(s2[m] −
826/// (q/pq)·s2[m+1])`, then `+= cross·s3[m+1]` — the same m-ranges
827/// (`m ≤ l_total − de − k`) for every *consumed* value, and the same
828/// `c_ef[ae·n_f + af] += scale·v` extraction. Two things differ, neither of
829/// which touches a kept value's arithmetic: the storage (fixed per-degree
830/// tables instead of the triangle-packed rolling window), and the final
831/// f-level computing only its consumed `m = 0` row (the general path also
832/// fills the dead `m > 0` triangle there).
833/// Guarded by the full-tensor XOR fingerprint and the golden/cross-engine tests.
834///
835/// Per-degree row strides: the m-rows of `ea` hold `m ∈ 0..=lt` (`sa = lt+1`),
836/// of `eb1` `m ∈ 0..=lt−1` (`s1 = lt`), of `eb2` `m ∈ 0..=lt−2`, of `eb3`
837/// `m ∈ 0..=lt−3` — each level's longest row (its degree-0 e-component).
838#[inline(always)]
839#[allow(clippy::too_many_arguments)]
840fn vrr_small<const NE: usize, const NF: usize>(
841    bra: &PrimPair,
842    ket: &PrimPair,
843    hdr: &QuartetHeader,
844    fm: &[f64; 7],
845    ea: &mut [f64; 140],
846    eb1: &mut [f64; 360],
847    eb2: &mut [f64; 600],
848    eb3: &mut [f64; 800],
849    c_ef: &mut [f64],
850) {
851    let lt = NE + NF;
852    let n_e = n_addr(NE);
853    let n_f = n_addr(NF);
854    // Row strides (compile-time constants per instantiation).
855    let sa = lt + 1;
856    let s1 = lt;
857    let s2 = lt.saturating_sub(1);
858    let s3 = lt.saturating_sub(2);
859    let pa = bra.r1; // P − A
860    let qcen = ket.r1; // Q − C
861    let (inv_2p, inv_2q) = (bra.inv_2zeta, ket.inv_2zeta);
862
863    // Scalar header + Boys row: precomputed by `contract_class` (phases 1–2).
864    let QuartetHeader {
865        wp,
866        wq,
867        pref,
868        inv_2pq,
869        q_over_pq,
870        p_over_pq,
871        scale,
872        t: _,
873    } = *hdr;
874    for m in 0..=lt {
875        ea[m] = pref * fm[m];
876    }
877
878    // Phase A — bra ladder: [e0|00]^(m), e-degrees 1..=NE, m ≤ lt − de.
879    if NE >= 1 {
880        for (iw, te) in TRI1.iter().enumerate() {
881            let i = lower_axis(*te);
882            // Source is [0,0,0] at addr 0; no has2 term at degree 1.
883            for m in 0..=(lt - 1) {
884                ea[(1 + iw) * sa + m] = pa[i] * ea[m] + wp[i] * ea[m + 1];
885            }
886        }
887    }
888    if NE >= 2 {
889        for (iw, te) in TRI2.iter().enumerate() {
890            let i = lower_axis(*te);
891            let s1a = addr(dec(*te, i));
892            let has2 = te[i] >= 2;
893            let coef2 = ((te[i] - 1) as f64) * inv_2p;
894            let s2a = if has2 { addr(dec(dec(*te, i), i)) } else { 0 };
895            for m in 0..=(lt - 2) {
896                let mut v = pa[i] * ea[s1a * sa + m] + wp[i] * ea[s1a * sa + m + 1];
897                if has2 {
898                    v += coef2 * (ea[s2a * sa + m] - q_over_pq * ea[s2a * sa + m + 1]);
899                }
900                ea[(4 + iw) * sa + m] = v;
901            }
902        }
903    }
904    if NE >= 3 {
905        for (iw, te) in TRI3.iter().enumerate() {
906            let i = lower_axis(*te);
907            let s1a = addr(dec(*te, i));
908            let has2 = te[i] >= 2;
909            let coef2 = ((te[i] - 1) as f64) * inv_2p;
910            let s2a = if has2 { addr(dec(dec(*te, i), i)) } else { 0 };
911            for m in 0..=(lt - 3) {
912                let mut v = pa[i] * ea[s1a * sa + m] + wp[i] * ea[s1a * sa + m + 1];
913                if has2 {
914                    v += coef2 * (ea[s2a * sa + m] - q_over_pq * ea[s2a * sa + m + 1]);
915                }
916                ea[(10 + iw) * sa + m] = v;
917            }
918        }
919    }
920    // Extract f-degree 0: contract m = 0 of every e-component.
921    for ae in 0..n_e {
922        c_ef[ae * n_f] += scale * ea[ae * sa];
923    }
924
925    // Phase B — ket raise, f-degree k: backward liveness gives every f-level-k
926    // row a needed m-range of `m ≤ NF − k`, independent of the e-degree: the
927    // final level (k = NF) only feeds the m = 0 extraction, and each level
928    // below it is read at m, m+1 by the next level's raise (its cross and the
929    // k+2 has2 reach no further). The general path's full triangle
930    // (`m ≤ lt − de − k`) computes `NE − de` dead values per row beyond that;
931    // skipping them removes whole computations and leaves every kept value
932    // bit-identical (fingerprint-guarded). Phase A has no such slack — its
933    // liveness works out to exactly the `lt − de` triangle it computes.
934    if NF >= 1 {
935        for (fw, tf) in TRI1.iter().enumerate() {
936            let j = lower_axis(*tf);
937            // Source f-component is [0,0,0] (level 0 = the phase-A table).
938            for ae in 0..n_e {
939                let (te, _) = E_COMP[ae];
940                let has_cross = te[j] >= 1;
941                let cross_coef = te[j] as f64 * inv_2pq;
942                let cs = if has_cross { addr(dec(te, j)) } else { 0 };
943                let mtop = NF - 1;
944                for m in 0..=mtop {
945                    let mut v = qcen[j] * ea[ae * sa + m] + wq[j] * ea[ae * sa + m + 1];
946                    if has_cross {
947                        v += cross_coef * ea[cs * sa + m + 1];
948                    }
949                    eb1[(ae * 3 + fw) * s1 + m] = v;
950                }
951            }
952        }
953        for ae in 0..n_e {
954            for fw in 0..3 {
955                c_ef[ae * n_f + 1 + fw] += scale * eb1[(ae * 3 + fw) * s1];
956            }
957        }
958    }
959
960    // Phase B — ket raise, f-degree 2: m ≤ NF − 2 (liveness bound, see above).
961    if NF >= 2 {
962        for (fw, tf) in TRI2.iter().enumerate() {
963            let j = lower_axis(*tf);
964            let f1 = dec(*tf, j);
965            let lf1 = cart_index(f1); // f-component index in the degree-1 level
966            let has2 = tf[j] >= 2; // its k−2 source is then [0,0,0] (phase A)
967            let coef2 = ((tf[j] - 1) as f64) * inv_2q;
968            for ae in 0..n_e {
969                let (te, _) = E_COMP[ae];
970                let has_cross = te[j] >= 1;
971                let cross_coef = te[j] as f64 * inv_2pq;
972                let cs = if has_cross { addr(dec(te, j)) } else { 0 };
973                let mtop = NF - 2;
974                for m in 0..=mtop {
975                    let mut v = qcen[j] * eb1[(ae * 3 + lf1) * s1 + m]
976                        + wq[j] * eb1[(ae * 3 + lf1) * s1 + m + 1];
977                    if has2 {
978                        v += coef2 * (ea[ae * sa + m] - p_over_pq * ea[ae * sa + m + 1]);
979                    }
980                    if has_cross {
981                        v += cross_coef * eb1[(cs * 3 + lf1) * s1 + m + 1];
982                    }
983                    eb2[(ae * 6 + fw) * s2 + m] = v;
984                }
985            }
986        }
987        for ae in 0..n_e {
988            for fw in 0..6 {
989                c_ef[ae * n_f + 4 + fw] += scale * eb2[(ae * 6 + fw) * s2];
990            }
991        }
992    }
993
994    // Phase B — ket raise, f-degree 3: always the final f-level (NF ≤ 3), so
995    // only m = 0 is computed (same dead-row trim as above).
996    if NF >= 3 {
997        for (fw, tf) in TRI3.iter().enumerate() {
998            let j = lower_axis(*tf);
999            let f1 = dec(*tf, j);
1000            let lf1 = cart_index(f1); // f-component index in the degree-2 level
1001            let has2 = tf[j] >= 2; // its k−2 source is f1 lowered again (degree 1)
1002            let coef2 = ((tf[j] - 1) as f64) * inv_2q;
1003            let lf2 = if has2 { cart_index(dec(f1, j)) } else { 0 };
1004            for ae in 0..n_e {
1005                let (te, _) = E_COMP[ae];
1006                let has_cross = te[j] >= 1;
1007                let cross_coef = te[j] as f64 * inv_2pq;
1008                let cs = if has_cross { addr(dec(te, j)) } else { 0 };
1009                let mtop = 0; // final level: m = 0 only
1010                for m in 0..=mtop {
1011                    let mut v = qcen[j] * eb2[(ae * 6 + lf1) * s2 + m]
1012                        + wq[j] * eb2[(ae * 6 + lf1) * s2 + m + 1];
1013                    if has2 {
1014                        v += coef2
1015                            * (eb1[(ae * 3 + lf2) * s1 + m]
1016                                - p_over_pq * eb1[(ae * 3 + lf2) * s1 + m + 1]);
1017                    }
1018                    if has_cross {
1019                        v += cross_coef * eb2[(cs * 6 + lf1) * s2 + m + 1];
1020                    }
1021                    eb3[(ae * 10 + fw) * s3 + m] = v;
1022                }
1023            }
1024        }
1025        for ae in 0..n_e {
1026            for fw in 0..10 {
1027                c_ef[ae * n_f + 10 + fw] += scale * eb3[(ae * 10 + fw) * s3];
1028            }
1029        }
1030    }
1031}
1032
1033// --- 4-lane batch across shell quartets (libint2-style class vectorization) ---
1034
1035/// Four-wide value: one shell quartet per lane. All lane arithmetic below is
1036/// elementwise, so each quartet's numbers never mix across lanes — the batch
1037/// path is bit-identical to four scalar evaluations by construction.
1038type V4 = [f64; 4];
1039
1040#[inline(always)]
1041fn v4_mul(a: V4, b: V4) -> V4 {
1042    [a[0] * b[0], a[1] * b[1], a[2] * b[2], a[3] * b[3]]
1043}
1044
1045#[inline(always)]
1046fn v4_add(a: V4, b: V4) -> V4 {
1047    [a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3]]
1048}
1049
1050#[inline(always)]
1051fn v4_sub(a: V4, b: V4) -> V4 {
1052    [a[0] - b[0], a[1] - b[1], a[2] - b[2], a[3] - b[3]]
1053}
1054
1055#[inline(always)]
1056fn v4_scale(s: f64, a: V4) -> V4 {
1057    [s * a[0], s * a[1], s * a[2], s * a[3]]
1058}
1059
1060/// Lane-major [`QuartetHeader`] for the 4-quartet batch kernel: the same scalar
1061/// header fields, one lane per quartet. Each lane is filled by the exact
1062/// expression sequence of the scalar header (see [`contract_class`]).
1063#[derive(Clone, Copy, Default)]
1064struct QuartetHeader4 {
1065    wp: [V4; 3],
1066    wq: [V4; 3],
1067    pref: V4,
1068    inv_2pq: V4,
1069    q_over_pq: V4,
1070    p_over_pq: V4,
1071    scale: V4,
1072    t: V4,
1073}
1074
1075/// Contract four shell quartets of the same VRR shape `(NE, NF)` (`NE, NF ≤ 3`)
1076/// in lockstep, one quartet per lane. Every lane must have the **same bra and
1077/// ket pair counts** (the caller buckets quartets that way), so the lanes run
1078/// the identical primitive loop with no padding — no `0·∞`/`-0.0` hazards.
1079///
1080/// Per lane the primitive order (bra outer, ket inner), header expressions,
1081/// Boys ladder ([`boys_array4`], lanes bitwise equal to [`boys_array`]) and the
1082/// VRR/accumulation arithmetic of [`vrr_small4`] are exactly the scalar
1083/// [`contract_class`]/[`vrr_small`] sequence, so each lane's `c_ef` is
1084/// bit-identical to a scalar evaluation of that quartet (fingerprint-guarded).
1085// `ib`/`ik` index four parallel pair slices in lockstep — a range loop is the
1086// clear form (no single iterable to zip without allocation).
1087#[allow(clippy::needless_range_loop)]
1088fn contract_class4<const NE: usize, const NF: usize>(
1089    bra_pairs: [&[PrimPair]; 4],
1090    ket_pairs: [&[PrimPair]; 4],
1091    c_ef: &mut [&mut [f64]; 4],
1092) {
1093    let lt = NE + NF;
1094    let n_bra = bra_pairs[0].len();
1095    let n_ket = ket_pairs[0].len();
1096    debug_assert!(bra_pairs.iter().all(|b| b.len() == n_bra));
1097    debug_assert!(ket_pairs.iter().all(|k| k.len() == n_ket));
1098    // Worst-case (NE,NF)=(3,3) sizes, the scalar `contract_class` tables widened
1099    // to V4 (~61 KB of stack total — fine on both the main thread and the
1100    // `EriBuilder` workers): e-table 20 components × m ∈ 0..=6; f-degree-1
1101    // level 20×3 × m ∈ 0..=5; f-degree-2 level 20×6 × m ∈ 0..=4; f-degree-3
1102    // level 20×10 × m ∈ 0..=3.
1103    let mut ea = [[0.0f64; 4]; 140];
1104    let mut eb1 = [[0.0f64; 4]; 360];
1105    let mut eb2 = [[0.0f64; 4]; 600];
1106    let mut eb3 = [[0.0f64; 4]; 800];
1107    let mut hdr = QuartetHeader4::default();
1108    let mut fm = [[0.0f64; 4]; 7];
1109    for ib in 0..n_bra {
1110        let bras = [
1111            &bra_pairs[0][ib],
1112            &bra_pairs[1][ib],
1113            &bra_pairs[2][ib],
1114            &bra_pairs[3][ib],
1115        ];
1116        let mut pa = [[0.0f64; 4]; 3];
1117        let mut inv_2p = [0.0f64; 4];
1118        let mut bra_coef = [0.0f64; 4];
1119        for (lane, bra) in bras.iter().enumerate() {
1120            for (dst, &src) in pa.iter_mut().zip(bra.r1.iter()) {
1121                dst[lane] = src;
1122            }
1123            inv_2p[lane] = bra.inv_2zeta;
1124            bra_coef[lane] = bra.c1 * bra.c2;
1125        }
1126        for ik in 0..n_ket {
1127            let kets = [
1128                &ket_pairs[0][ik],
1129                &ket_pairs[1][ik],
1130                &ket_pairs[2][ik],
1131                &ket_pairs[3][ik],
1132            ];
1133            let mut qcen = [[0.0f64; 4]; 3];
1134            let mut inv_2q = [0.0f64; 4];
1135            for (lane, (bra, ket)) in bras.iter().zip(kets.iter()).enumerate() {
1136                // The scalar header expressions of `contract_class`, verbatim,
1137                // one lane per quartet.
1138                let (p, pc) = (bra.zeta, bra.center);
1139                let (q, qc) = (ket.zeta, ket.center);
1140                let pq = p + q;
1141                let rho = p * q / pq;
1142                let pq_vec = sub(pc, qc); // P − Q
1143                hdr.t[lane] = rho * norm2(pq_vec);
1144                let w = [
1145                    (p * pc[0] + q * qc[0]) / pq,
1146                    (p * pc[1] + q * qc[1]) / pq,
1147                    (p * pc[2] + q * qc[2]) / pq,
1148                ];
1149                let wp = sub(w, pc);
1150                let wq = sub(w, qc);
1151                for ax in 0..3 {
1152                    hdr.wp[ax][lane] = wp[ax];
1153                    hdr.wq[ax][lane] = wq[ax];
1154                    qcen[ax][lane] = ket.r1[ax];
1155                }
1156                use std::f64::consts::PI;
1157                hdr.pref[lane] =
1158                    2.0 * PI * PI * PI.sqrt() / (p * q * pq.sqrt()) * bra.kappa * ket.kappa;
1159                hdr.inv_2pq[lane] = 0.5 / pq;
1160                hdr.q_over_pq[lane] = q / pq;
1161                hdr.p_over_pq[lane] = p / pq;
1162                hdr.scale[lane] = (bra_coef[lane] * ket.c1) * ket.c2;
1163                inv_2q[lane] = ket.inv_2zeta;
1164            }
1165            boys_array4(lt, hdr.t, &mut fm[..=lt]);
1166            vrr_small4::<NE, NF>(
1167                &pa, &qcen, inv_2p, inv_2q, &hdr, &fm, &mut ea, &mut eb1, &mut eb2, &mut eb3, c_ef,
1168            );
1169        }
1170    }
1171}
1172
1173/// Four primitive quartets of the small-class OS VRR (`NE, NF ≤ 3`) in lockstep,
1174/// one quartet per [`V4`] lane — the 4-wide counterpart of [`vrr_small`].
1175///
1176/// **Bit-identical to [`vrr_small`] per lane by construction**: every statement
1177/// below is the scalar kernel's statement with each `f64` widened to a [`V4`]
1178/// whose lanes never interact — same expressions, same term order and
1179/// association, same m-ranges (the round-6/7 liveness trims included), and the
1180/// same per-lane `c_ef[ae·n_f + af] += scale·v` extraction order.
1181#[inline(always)]
1182#[allow(clippy::too_many_arguments)]
1183fn vrr_small4<const NE: usize, const NF: usize>(
1184    pa: &[V4; 3],
1185    qcen: &[V4; 3],
1186    inv_2p: V4,
1187    inv_2q: V4,
1188    hdr: &QuartetHeader4,
1189    fm: &[V4; 7],
1190    ea: &mut [V4; 140],
1191    eb1: &mut [V4; 360],
1192    eb2: &mut [V4; 600],
1193    eb3: &mut [V4; 800],
1194    c_ef: &mut [&mut [f64]; 4],
1195) {
1196    const { assert!(NE <= 3 && NF <= 3, "batch kernel covers NE, NF <= 3") };
1197    let lt = NE + NF;
1198    let n_e = n_addr(NE);
1199    let n_f = n_addr(NF);
1200    let sa = lt + 1;
1201    let s1 = lt;
1202    let s2 = lt.saturating_sub(1);
1203    let s3 = lt.saturating_sub(2);
1204    let QuartetHeader4 {
1205        wp,
1206        wq,
1207        pref,
1208        inv_2pq,
1209        q_over_pq,
1210        p_over_pq,
1211        scale,
1212        t: _,
1213    } = *hdr;
1214    for m in 0..=lt {
1215        ea[m] = v4_mul(pref, fm[m]);
1216    }
1217
1218    // Phase A — bra ladder: [e0|00]^(m), e-degrees 1..=NE, m ≤ lt − de.
1219    if NE >= 1 {
1220        for (iw, te) in TRI1.iter().enumerate() {
1221            let i = lower_axis(*te);
1222            for m in 0..=(lt - 1) {
1223                ea[(1 + iw) * sa + m] = v4_add(v4_mul(pa[i], ea[m]), v4_mul(wp[i], ea[m + 1]));
1224            }
1225        }
1226    }
1227    if NE >= 2 {
1228        for (iw, te) in TRI2.iter().enumerate() {
1229            let i = lower_axis(*te);
1230            let s1a = addr(dec(*te, i));
1231            let has2 = te[i] >= 2;
1232            let coef2 = v4_scale((te[i] - 1) as f64, inv_2p);
1233            let s2a = if has2 { addr(dec(dec(*te, i), i)) } else { 0 };
1234            for m in 0..=(lt - 2) {
1235                let mut v = v4_add(
1236                    v4_mul(pa[i], ea[s1a * sa + m]),
1237                    v4_mul(wp[i], ea[s1a * sa + m + 1]),
1238                );
1239                if has2 {
1240                    v = v4_add(
1241                        v,
1242                        v4_mul(
1243                            coef2,
1244                            v4_sub(ea[s2a * sa + m], v4_mul(q_over_pq, ea[s2a * sa + m + 1])),
1245                        ),
1246                    );
1247                }
1248                ea[(4 + iw) * sa + m] = v;
1249            }
1250        }
1251    }
1252    if NE >= 3 {
1253        for (iw, te) in TRI3.iter().enumerate() {
1254            let i = lower_axis(*te);
1255            let s1a = addr(dec(*te, i));
1256            let has2 = te[i] >= 2;
1257            let coef2 = v4_scale((te[i] - 1) as f64, inv_2p);
1258            let s2a = if has2 { addr(dec(dec(*te, i), i)) } else { 0 };
1259            for m in 0..=(lt - 3) {
1260                let mut v = v4_add(
1261                    v4_mul(pa[i], ea[s1a * sa + m]),
1262                    v4_mul(wp[i], ea[s1a * sa + m + 1]),
1263                );
1264                if has2 {
1265                    v = v4_add(
1266                        v,
1267                        v4_mul(
1268                            coef2,
1269                            v4_sub(ea[s2a * sa + m], v4_mul(q_over_pq, ea[s2a * sa + m + 1])),
1270                        ),
1271                    );
1272                }
1273                ea[(10 + iw) * sa + m] = v;
1274            }
1275        }
1276    }
1277    // Extract f-degree 0: contract m = 0 of every e-component, per lane.
1278    for ae in 0..n_e {
1279        let v = ea[ae * sa];
1280        for lane in 0..4 {
1281            c_ef[lane][ae * n_f] += scale[lane] * v[lane];
1282        }
1283    }
1284
1285    // Phase B — ket raise, f-degree k: liveness m-range `m ≤ NF − k` (see
1286    // [`vrr_small`]'s Phase B comment for the derivation).
1287    if NF >= 1 {
1288        for (fw, tf) in TRI1.iter().enumerate() {
1289            let j = lower_axis(*tf);
1290            for ae in 0..n_e {
1291                let (te, _) = E_COMP[ae];
1292                let has_cross = te[j] >= 1;
1293                let cross_coef = v4_scale(te[j] as f64, inv_2pq);
1294                let cs = if has_cross { addr(dec(te, j)) } else { 0 };
1295                let mtop = NF - 1;
1296                for m in 0..=mtop {
1297                    let mut v = v4_add(
1298                        v4_mul(qcen[j], ea[ae * sa + m]),
1299                        v4_mul(wq[j], ea[ae * sa + m + 1]),
1300                    );
1301                    if has_cross {
1302                        v = v4_add(v, v4_mul(cross_coef, ea[cs * sa + m + 1]));
1303                    }
1304                    eb1[(ae * 3 + fw) * s1 + m] = v;
1305                }
1306            }
1307        }
1308        for ae in 0..n_e {
1309            for fw in 0..3 {
1310                let v = eb1[(ae * 3 + fw) * s1];
1311                for lane in 0..4 {
1312                    c_ef[lane][ae * n_f + 1 + fw] += scale[lane] * v[lane];
1313                }
1314            }
1315        }
1316    }
1317
1318    if NF >= 2 {
1319        for (fw, tf) in TRI2.iter().enumerate() {
1320            let j = lower_axis(*tf);
1321            let f1 = dec(*tf, j);
1322            let lf1 = cart_index(f1);
1323            let has2 = tf[j] >= 2; // its k−2 source is then [0,0,0] (phase A)
1324            let coef2 = v4_scale((tf[j] - 1) as f64, inv_2q);
1325            for ae in 0..n_e {
1326                let (te, _) = E_COMP[ae];
1327                let has_cross = te[j] >= 1;
1328                let cross_coef = v4_scale(te[j] as f64, inv_2pq);
1329                let cs = if has_cross { addr(dec(te, j)) } else { 0 };
1330                let mtop = NF - 2;
1331                for m in 0..=mtop {
1332                    let mut v = v4_add(
1333                        v4_mul(qcen[j], eb1[(ae * 3 + lf1) * s1 + m]),
1334                        v4_mul(wq[j], eb1[(ae * 3 + lf1) * s1 + m + 1]),
1335                    );
1336                    if has2 {
1337                        v = v4_add(
1338                            v,
1339                            v4_mul(
1340                                coef2,
1341                                v4_sub(ea[ae * sa + m], v4_mul(p_over_pq, ea[ae * sa + m + 1])),
1342                            ),
1343                        );
1344                    }
1345                    if has_cross {
1346                        v = v4_add(v, v4_mul(cross_coef, eb1[(cs * 3 + lf1) * s1 + m + 1]));
1347                    }
1348                    eb2[(ae * 6 + fw) * s2 + m] = v;
1349                }
1350            }
1351        }
1352        for ae in 0..n_e {
1353            for fw in 0..6 {
1354                let v = eb2[(ae * 6 + fw) * s2];
1355                for lane in 0..4 {
1356                    c_ef[lane][ae * n_f + 4 + fw] += scale[lane] * v[lane];
1357                }
1358            }
1359        }
1360    }
1361
1362    // Phase B — ket raise, f-degree 3: always the final f-level (NF ≤ 3), so
1363    // only m = 0 is computed (the scalar `vrr_small` block, widened to V4).
1364    if NF >= 3 {
1365        for (fw, tf) in TRI3.iter().enumerate() {
1366            let j = lower_axis(*tf);
1367            let f1 = dec(*tf, j);
1368            let lf1 = cart_index(f1); // f-component index in the degree-2 level
1369            let has2 = tf[j] >= 2; // its k−2 source is f1 lowered again (degree 1)
1370            let coef2 = v4_scale((tf[j] - 1) as f64, inv_2q);
1371            let lf2 = if has2 { cart_index(dec(f1, j)) } else { 0 };
1372            for ae in 0..n_e {
1373                let (te, _) = E_COMP[ae];
1374                let has_cross = te[j] >= 1;
1375                let cross_coef = v4_scale(te[j] as f64, inv_2pq);
1376                let cs = if has_cross { addr(dec(te, j)) } else { 0 };
1377                let mtop = 0; // final level: m = 0 only
1378                for m in 0..=mtop {
1379                    let mut v = v4_add(
1380                        v4_mul(qcen[j], eb2[(ae * 6 + lf1) * s2 + m]),
1381                        v4_mul(wq[j], eb2[(ae * 6 + lf1) * s2 + m + 1]),
1382                    );
1383                    if has2 {
1384                        v = v4_add(
1385                            v,
1386                            v4_mul(
1387                                coef2,
1388                                v4_sub(
1389                                    eb1[(ae * 3 + lf2) * s1 + m],
1390                                    v4_mul(p_over_pq, eb1[(ae * 3 + lf2) * s1 + m + 1]),
1391                                ),
1392                            ),
1393                        );
1394                    }
1395                    if has_cross {
1396                        v = v4_add(v, v4_mul(cross_coef, eb2[(cs * 6 + lf1) * s2 + m + 1]));
1397                    }
1398                    eb3[(ae * 10 + fw) * s3 + m] = v;
1399                }
1400            }
1401        }
1402        for ae in 0..n_e {
1403            for fw in 0..10 {
1404                let v = eb3[(ae * 10 + fw) * s3];
1405                for lane in 0..4 {
1406                    c_ef[lane][ae * n_f + 10 + fw] += scale[lane] * v[lane];
1407                }
1408            }
1409        }
1410    }
1411}
1412
1413/// Four `(ss|ss)` shell quartets in lockstep — the 4-lane counterpart of the
1414/// scalar `(ss|ss)` fast path in [`coulomb_shell_into_scratch`]: per lane the
1415/// same `t`/`pref` expressions, the same `((c_a·c_b)·c_c)·c_d` scale, the same
1416/// `scale·(pref·F_0)` term and the same bra-outer/ket-inner accumulation order
1417/// into a single per-lane register, so each lane is bit-identical to the
1418/// scalar path. Lanes must have equal bra/ket pair counts (caller-bucketed).
1419#[allow(clippy::needless_range_loop)] // ib/ik index four parallel slices in lockstep
1420fn contract_ssss4(
1421    bra_pairs: [&[PrimPair]; 4],
1422    ket_pairs: [&[PrimPair]; 4],
1423    outs: &mut [&mut [f64]; 4],
1424) {
1425    let n_bra = bra_pairs[0].len();
1426    let n_ket = ket_pairs[0].len();
1427    debug_assert!(bra_pairs.iter().all(|b| b.len() == n_bra));
1428    debug_assert!(ket_pairs.iter().all(|k| k.len() == n_ket));
1429    let two_pi_2_5 =
1430        2.0 * std::f64::consts::PI * std::f64::consts::PI * std::f64::consts::PI.sqrt();
1431    let mut acc = [0.0f64; 4];
1432    let mut fm = [[0.0f64; 4]; 1];
1433    for ib in 0..n_bra {
1434        let bras = [
1435            &bra_pairs[0][ib],
1436            &bra_pairs[1][ib],
1437            &bra_pairs[2][ib],
1438            &bra_pairs[3][ib],
1439        ];
1440        let mut bc = [0.0f64; 4];
1441        for (lane, bra) in bras.iter().enumerate() {
1442            bc[lane] = bra.c1 * bra.c2;
1443        }
1444        for ik in 0..n_ket {
1445            let kets = [
1446                &ket_pairs[0][ik],
1447                &ket_pairs[1][ik],
1448                &ket_pairs[2][ik],
1449                &ket_pairs[3][ik],
1450            ];
1451            let mut t = [0.0f64; 4];
1452            let mut pref = [0.0f64; 4];
1453            for (lane, (bra, ket)) in bras.iter().zip(kets.iter()).enumerate() {
1454                let p = bra.zeta;
1455                let q = ket.zeta;
1456                let pq = p + q;
1457                t[lane] = (p * q / pq) * dist2(bra.center, ket.center);
1458                pref[lane] = two_pi_2_5 / (p * q * pq.sqrt()) * bra.kappa * ket.kappa;
1459            }
1460            boys_array4(0, t, &mut fm);
1461            for (lane, ket) in kets.iter().enumerate() {
1462                acc[lane] += (bc[lane] * ket.c1) * ket.c2 * (pref[lane] * fm[0][lane]);
1463            }
1464        }
1465    }
1466    for (lane, out) in outs.iter_mut().enumerate() {
1467        out[0] += acc[lane];
1468    }
1469}
1470
1471/// Number of primitive pairs of two shells that survive the
1472/// [`PAIR_NEGLIGIBLE`] screen — the pair count [`build_pairs`] would produce.
1473/// Drivers bucketing shell quartets for [`coulomb_shell_batch4_into_scratch`]
1474/// use this (once per shell pair) to group quartets whose lanes run the
1475/// primitive loop in true lockstep.
1476#[must_use]
1477pub fn surviving_pair_count(s1: ShellRef<'_>, s2: ShellRef<'_>) -> usize {
1478    let d2 = dist2(s1.center, s2.center);
1479    let mut n = 0;
1480    for (&e1, &c1) in s1.exps.iter().zip(s1.coeffs.iter()) {
1481        for (&e2, &c2) in s2.exps.iter().zip(s2.coeffs.iter()) {
1482            let zeta = e1 + e2;
1483            let kappa = (-(e1 * e2 / zeta) * d2).exp();
1484            if kappa * (c1 * c2).abs() >= PAIR_NEGLIGIBLE {
1485                n += 1;
1486            }
1487        }
1488    }
1489    n
1490}
1491
1492/// Reusable buffers for the 4-quartet batch entry
1493/// [`coulomb_shell_batch4_into_scratch`]: per-lane pair lists and `c_ef`
1494/// accumulators, plus a scalar [`EriScratch`] for the per-lane HRR (and the
1495/// defensive scalar fallback).
1496#[derive(Debug, Default)]
1497pub struct EriBatch4Scratch {
1498    bra_pairs: [Vec<PrimPair>; 4],
1499    ket_pairs: [Vec<PrimPair>; 4],
1500    c_ef: [Vec<f64>; 4],
1501    scalar: EriScratch,
1502}
1503
1504/// Evaluate **four shell quartets in lockstep** — `quartets[lane] = [a, b, c, d]`
1505/// — accumulating each lane's contracted Cartesian block into `outs[lane]`
1506/// (same layout/contract as [`coulomb_shell_into`]).
1507///
1508/// All four quartets must share the VRR shape `(ne, nf) = (la+lb, lc+ld)` with
1509/// `ne, nf ≤ 3`, and have equal surviving bra and ket pair
1510/// counts (see [`surviving_pair_count`]) so the lanes run the primitive loop in
1511/// true lockstep. Quartets violating that (which a class-bucketing driver never
1512/// sends) are evaluated through the scalar path instead — same values either
1513/// way: each lane's result is **bit-identical** to a [`coulomb_shell_into`]
1514/// call for that quartet.
1515///
1516/// # Panics
1517/// Panics (in debug builds) if any `l > MAX_L` or any `outs[lane]` is too short.
1518pub fn coulomb_shell_batch4_into_scratch(
1519    scratch: &mut EriBatch4Scratch,
1520    quartets: &[[ShellRef<'_>; 4]; 4],
1521    outs: &mut [&mut [f64]; 4],
1522) {
1523    // Build the per-lane pair lists into the reused buffers (moved out for the
1524    // duration of the core call, like the scalar entry), then run the shared
1525    // core on borrowed slices.
1526    let mut bra_pairs = std::mem::take(&mut scratch.bra_pairs);
1527    let mut ket_pairs = std::mem::take(&mut scratch.ket_pairs);
1528    for (lane, &[a, b, c, d]) in quartets.iter().enumerate() {
1529        build_pairs(&mut bra_pairs[lane], a, b);
1530        build_pairs(&mut ket_pairs[lane], c, d);
1531    }
1532    let bra = [
1533        &bra_pairs[0][..],
1534        &bra_pairs[1][..],
1535        &bra_pairs[2][..],
1536        &bra_pairs[3][..],
1537    ];
1538    let ket = [
1539        &ket_pairs[0][..],
1540        &ket_pairs[1][..],
1541        &ket_pairs[2][..],
1542        &ket_pairs[3][..],
1543    ];
1544    coulomb_shell_batch4_core(scratch, quartets, bra, ket, outs);
1545    scratch.bra_pairs = bra_pairs;
1546    scratch.ket_pairs = ket_pairs;
1547}
1548
1549/// Like [`coulomb_shell_batch4_into_scratch`] but with each lane's bra/ket
1550/// primitive-pair data supplied by the caller (precomputed once per shell pair
1551/// across a dense build, see [`ShellPairData`]). Bit-identical to the
1552/// self-building entry: same pair values and order, only computed elsewhere.
1553///
1554/// # Panics
1555/// Panics (in debug builds) if any `l > MAX_L` or any `outs[lane]` is too short.
1556pub fn coulomb_shell_batch4_pairs_into_scratch(
1557    scratch: &mut EriBatch4Scratch,
1558    quartets: &[[ShellRef<'_>; 4]; 4],
1559    bra_pairs: [&ShellPairData; 4],
1560    ket_pairs: [&ShellPairData; 4],
1561    outs: &mut [&mut [f64]; 4],
1562) {
1563    coulomb_shell_batch4_core(
1564        scratch,
1565        quartets,
1566        bra_pairs.map(|p| &p.pairs[..]),
1567        ket_pairs.map(|p| &p.pairs[..]),
1568        outs,
1569    );
1570}
1571
1572/// The 4-lane batch kernel shared by the self-building and borrowed-pairs
1573/// entries. Lanes that violate the lockstep contract (mismatched VRR shape,
1574/// `ne`/`nf > 3`, or unequal pair counts — which a class-bucketing driver never
1575/// sends) drain through the scalar core with the same pair lists.
1576fn coulomb_shell_batch4_core(
1577    scratch: &mut EriBatch4Scratch,
1578    quartets: &[[ShellRef<'_>; 4]; 4],
1579    bra: [&[PrimPair]; 4],
1580    ket: [&[PrimPair]; 4],
1581    outs: &mut [&mut [f64]; 4],
1582) {
1583    let [a0, b0, c0, d0] = quartets[0];
1584    let ne = a0.l + b0.l;
1585    let nf = c0.l + d0.l;
1586    let lanes_match = quartets
1587        .iter()
1588        .all(|[a, b, c, d]| a.l + b.l == ne && c.l + d.l == nf);
1589    let n_bra = bra[0].len();
1590    let n_ket = ket[0].len();
1591    let lockstep = lanes_match
1592        && ne <= 3
1593        && nf <= 3
1594        && bra.iter().all(|p| p.len() == n_bra)
1595        && ket.iter().all(|p| p.len() == n_ket);
1596    if !lockstep {
1597        for (lane, &[a, b, c, d]) in quartets.iter().enumerate() {
1598            coulomb_shell_core(
1599                &mut scratch.scalar,
1600                a,
1601                b,
1602                c,
1603                d,
1604                bra[lane],
1605                ket[lane],
1606                outs[lane],
1607            );
1608        }
1609        return;
1610    }
1611
1612    // (ss|ss): the 4-lane fast path (no VRR/HRR scaffolding, like the scalar
1613    // fast path).
1614    if ne + nf == 0 {
1615        contract_ssss4(bra, ket, outs);
1616        return;
1617    }
1618
1619    let n_e = n_addr(ne);
1620    let n_f = n_addr(nf);
1621    for lane in 0..4 {
1622        ensure_len(&mut scratch.c_ef[lane], n_e * n_f);
1623        scratch.c_ef[lane][..n_e * n_f].fill(0.0);
1624    }
1625    {
1626        let [c0, c1, c2, c3] = &mut scratch.c_ef;
1627        let mut c_ef: [&mut [f64]; 4] = [
1628            &mut c0[..n_e * n_f],
1629            &mut c1[..n_e * n_f],
1630            &mut c2[..n_e * n_f],
1631            &mut c3[..n_e * n_f],
1632        ];
1633        match (ne, nf) {
1634            (0, 1) => contract_class4::<0, 1>(bra, ket, &mut c_ef),
1635            (1, 0) => contract_class4::<1, 0>(bra, ket, &mut c_ef),
1636            (1, 1) => contract_class4::<1, 1>(bra, ket, &mut c_ef),
1637            (0, 2) => contract_class4::<0, 2>(bra, ket, &mut c_ef),
1638            (2, 0) => contract_class4::<2, 0>(bra, ket, &mut c_ef),
1639            (1, 2) => contract_class4::<1, 2>(bra, ket, &mut c_ef),
1640            (2, 1) => contract_class4::<2, 1>(bra, ket, &mut c_ef),
1641            (2, 2) => contract_class4::<2, 2>(bra, ket, &mut c_ef),
1642            (0, 3) => contract_class4::<0, 3>(bra, ket, &mut c_ef),
1643            (3, 0) => contract_class4::<3, 0>(bra, ket, &mut c_ef),
1644            (1, 3) => contract_class4::<1, 3>(bra, ket, &mut c_ef),
1645            (3, 1) => contract_class4::<3, 1>(bra, ket, &mut c_ef),
1646            (2, 3) => contract_class4::<2, 3>(bra, ket, &mut c_ef),
1647            (3, 2) => contract_class4::<3, 2>(bra, ket, &mut c_ef),
1648            (3, 3) => contract_class4::<3, 3>(bra, ket, &mut c_ef),
1649            _ => unreachable!("guarded above"),
1650        }
1651    }
1652
1653    // Per-lane HRR + scatter, exactly the scalar tail of
1654    // [`coulomb_shell_into_scratch`].
1655    let EriScratch {
1656        bra,
1657        bra_prev,
1658        bra_cur,
1659        ket_prev,
1660        ket_cur,
1661        tri_all,
1662        ..
1663    } = &mut scratch.scalar;
1664    if tri_all.len() <= 2 * MAX_L {
1665        *tri_all = (0..=2 * MAX_L).map(cart_components).collect();
1666    }
1667    for (lane, &[a, b, c, d]) in quartets.iter().enumerate() {
1668        let ab = sub(a.center, b.center);
1669        let cd = sub(c.center, d.center);
1670        hrr_and_scatter(
1671            a.l,
1672            b.l,
1673            c.l,
1674            d.l,
1675            n_f,
1676            &scratch.c_ef[lane],
1677            ab,
1678            cd,
1679            outs[lane],
1680            bra,
1681            bra_prev,
1682            bra_cur,
1683            ket_prev,
1684            ket_cur,
1685            &tri_all[..],
1686        );
1687    }
1688}
1689
1690/// Build `[e0|f0]^(m)` for one primitive quartet by **m-marching** over the `f`
1691/// degree and accumulate `scale · [e0|f0]^(0)` into the contracted `c_ef` table.
1692///
1693/// Algebraically identical to the former full-table VRR (same OS recurrences, same
1694/// term order) — only the storage changes, so it is bit-identical
1695/// (B0 golden snapshot). `levels` holds 3 rolling `f`-degree levels in slots
1696/// `k % 3`, each `maxlevel` long; within slot `k`, element `[e0|f0]^(m)` for the
1697/// `f`-component with local index `lf` (= `cart_index(f)`) lives at
1698/// `(k%3)·maxlevel + lf·slab[k] + eoff[k][addr(e)] + m`. The `m`-rows are
1699/// triangle-packed (`m ∈ 0..=(l_total−de−k)`), so the full `[e0|f0]^(m)` table is
1700/// never resident. Each level's `m=0` slice is extracted into `c_ef` before its
1701/// buffer is recycled.
1702#[allow(clippy::too_many_arguments)]
1703fn vrr_primitive(
1704    p: f64,
1705    q: f64,
1706    pc: Vec3,
1707    qc: Vec3,
1708    kab: f64,
1709    kcd: f64,
1710    pa: Vec3,    // P − A  (bra pair, precomputed)
1711    qcen: Vec3,  // Q − C  (ket pair, precomputed)
1712    inv_2p: f64, // 1/(2p) (bra pair, precomputed)
1713    inv_2q: f64, // 1/(2q) (ket pair, precomputed)
1714    ne: usize,
1715    nf: usize,
1716    l_total: usize,
1717    n_e: usize,
1718    n_f: usize,
1719    maxlevel: usize,
1720    eoff: &[usize],
1721    slab: &[usize],
1722    tri: &[Vec<[usize; 3]>],
1723    levels: &mut [f64],
1724    scale: f64,
1725    c_ef: &mut [f64],
1726) {
1727    let pq = p + q;
1728    let rho = p * q / pq;
1729    let pq_vec = sub(pc, qc); // P − Q
1730    let t = rho * norm2(pq_vec);
1731    let w = [
1732        (p * pc[0] + q * qc[0]) / pq,
1733        (p * pc[1] + q * qc[1]) / pq,
1734        (p * pc[2] + q * qc[2]) / pq,
1735    ];
1736    // `pa` (P−A) and `qcen` (Q−C) are now passed in (pure per-pair).
1737    let wp = sub(w, pc); // W − P
1738    let wq = sub(w, qc); // W − Q
1739
1740    // Index of `[e0|f0]^(m)` (f-component local index `lf`, e at `addr(e)=ae`) in
1741    // the rolling buffer for f-degree `k`. Reads/writes `levels` separately (f64 is
1742    // Copy, so indexing never holds a borrow), so the same `levels` slice serves as
1743    // source and destination across the distinct slots `k`, `k−1`, `k−2`.
1744    let off = |k: usize, lf: usize, ae: usize, m: usize| -> usize {
1745        (k % 3) * maxlevel + lf * slab[k] + eoff[k * n_e + ae] + m
1746    };
1747
1748    // Base [00|00]^(m) = pref · F_m(T). The Boys index m reaches
1749    // l_total = la+lb+lc+ld ≤ 4·MAX_L (NOT 2·MAX_L — that is the per-electron
1750    // bound; an ERI couples both electrons).
1751    use std::f64::consts::PI;
1752    let pref = 2.0 * PI * PI * PI.sqrt() / (p * q * pq.sqrt()) * kab * kcd;
1753    let mut fm = [0.0f64; 4 * MAX_L + 1];
1754    boys_array(l_total, t, &mut fm[..=l_total]);
1755    for m in 0..=l_total {
1756        levels[off(0, 0, 0, m)] = pref * fm[m];
1757    }
1758
1759    // `inv_2p` / `inv_2q` are now passed in (pure per-pair).
1760    let inv_2pq = 0.5 / pq;
1761    let q_over_pq = q / pq;
1762    let p_over_pq = p / pq;
1763
1764    // Phase A — bra ladder (f-degree 0, level 0): build [e0|00]^(m) by raising e.
1765    // Level 0 lives in slot 0, f-component 0, so the buffer offset of `[e0|00]^(m)`
1766    // is `eoff0[addr(e)] + m`; the `e`-offsets are hoisted out of the hot m-loop.
1767    let eoff0 = &eoff[..n_e];
1768    for (na, te_list) in tri.iter().enumerate().take(ne + 1).skip(1) {
1769        for &te in te_list {
1770            let i = lower_axis(te);
1771            let s1a = addr(dec(te, i));
1772            let mmax = l_total - na;
1773            let coef2 = ((te[i] - 1) as f64) * inv_2p; // (e_i of source)/2p
1774            let has2 = te[i] >= 2;
1775            let s1_0 = eoff0[s1a];
1776            let s2_0 = if has2 {
1777                eoff0[addr(dec(dec(te, i), i))]
1778            } else {
1779                0
1780            };
1781            let dst_0 = eoff0[addr(te)];
1782            for m in 0..=mmax {
1783                let mut v = pa[i] * levels[s1_0 + m] + wp[i] * levels[s1_0 + m + 1];
1784                if has2 {
1785                    v += coef2 * (levels[s2_0 + m] - q_over_pq * levels[s2_0 + m + 1]);
1786                }
1787                levels[dst_0 + m] = v;
1788            }
1789        }
1790    }
1791    // Extract level 0 (f = [0,0,0], local 0, addr 0): contract m = 0.
1792    for ae in 0..n_e {
1793        c_ef[ae * n_f] += scale * levels[off(0, 0, ae, 0)];
1794    }
1795
1796    // Phase B — ket raise: march f-degree k = 1..=nf, keeping levels {k, k−1, k−2}.
1797    // Slot/slab/eoff for the three active levels are hoisted out of the hot loops; the
1798    // m-loop touches only `levels[base + m]` (base computed once per (tf, te)), matching
1799    // the former full-table inner-loop cost. The k−2 level is read only when the axis
1800    // power `tf[j] ≥ 2` (which forces k ≥ 2), so its `k−2` indices never underflow.
1801    for k in 1..=nf {
1802        let slot_k = (k % 3) * maxlevel;
1803        let slot_k1 = ((k - 1) % 3) * maxlevel;
1804        let (slab_k, slab_k1) = (slab[k], slab[k - 1]);
1805        let eoff_k = &eoff[k * n_e..];
1806        let eoff_k1 = &eoff[(k - 1) * n_e..];
1807        let (slot_k2, slab_k2, eoff_k2): (usize, usize, &[usize]) = if k >= 2 {
1808            (
1809                ((k - 2) % 3) * maxlevel,
1810                slab[k - 2],
1811                &eoff[(k - 2) * n_e..],
1812            )
1813        } else {
1814            (0, 0, &[])
1815        };
1816        for &tf in &tri[k] {
1817            let j = lower_axis(tf);
1818            let f1 = dec(tf, j);
1819            let lf1 = cart_index(f1); // local index in level k−1
1820            let coef2 = ((tf[j] - 1) as f64) * inv_2q;
1821            let has2 = tf[j] >= 2;
1822            let lf2 = if has2 { cart_index(dec(f1, j)) } else { 0 }; // level k−2
1823            let lf = cart_index(tf); // local index in level k
1824            let sk = slot_k + lf * slab_k;
1825            let sk1 = slot_k1 + lf1 * slab_k1;
1826            let sk2 = slot_k2 + lf2 * slab_k2;
1827            for (nadeg, te_list) in tri.iter().enumerate().take(ne + 1) {
1828                for &te in te_list {
1829                    let ea = addr(te);
1830                    // cross term e_j/2(p+q) · [e−1_j,0|f−1_j,0]^(m+1)
1831                    let has_cross = te[j] >= 1;
1832                    let (cross_coef, cross_0) = if has_cross {
1833                        (te[j] as f64 * inv_2pq, sk1 + eoff_k1[addr(dec(te, j))])
1834                    } else {
1835                        (0.0, 0)
1836                    };
1837                    let mmax = l_total - nadeg - k;
1838                    let dst_0 = sk + eoff_k[ea];
1839                    let src1_0 = sk1 + eoff_k1[ea];
1840                    let src2_0 = if has2 { sk2 + eoff_k2[ea] } else { 0 };
1841                    for m in 0..=mmax {
1842                        let mut v = qcen[j] * levels[src1_0 + m] + wq[j] * levels[src1_0 + m + 1];
1843                        if has2 {
1844                            v += coef2 * (levels[src2_0 + m] - p_over_pq * levels[src2_0 + m + 1]);
1845                        }
1846                        if has_cross {
1847                            v += cross_coef * levels[cross_0 + m + 1];
1848                        }
1849                        levels[dst_0 + m] = v;
1850                    }
1851                }
1852            }
1853        }
1854        // Extract level k: contract each f-component's m = 0 slice into c_ef.
1855        for &tf in &tri[k] {
1856            let lf = cart_index(tf);
1857            let af = addr(tf);
1858            for ae in 0..n_e {
1859                c_ef[ae * n_f + af] += scale * levels[off(k, lf, ae, 0)];
1860            }
1861        }
1862    }
1863}
1864
1865/// HRR in contracted space (bra `A→B`, then ket `C→D`) followed by scatter into
1866/// the row-major output block.
1867///
1868/// Flat-array HGP horizontal recurrence, replacing the earlier
1869/// HashMap memoisation. The recurrence math is unchanged — same `lower_axis`
1870/// choice, same `(A−B)/(C−D)` geometric shifts, same `[raised] + shift·[same]`
1871/// term order — so it is **bit-identical** to the recursive version (guarded by
1872/// the B0 golden snapshot). Only the storage changes: contiguous arrays indexed
1873/// by `addr` / [`cart_index`] instead of hashed `(triple,…)` keys.
1874///
1875/// The bra recurrence `(a,b|f) = (a+1_i,b−1_i|f) + (A−B)_i (a,b−1_i|f)` (axis
1876/// `i = lower_axis(b)`) is built by ascending `b`-degree with two rolling layers,
1877/// independently per ket index `f` (a spectator), into the `bra` intermediate.
1878/// The ket recurrence `(ab|c,d) = (ab|c+1_j,d−1_j) + (C−D)_j (ab|c,d−1_j)` is the
1879/// symmetric pass over `c,d`, run per `(a,b)` output pair and scattered into `out`.
1880/// Resident scratch is `O(na·nb·nf_range)` for the bra intermediate plus two small
1881/// rolling layers — never the dense `(a,b,f)`/`(a,b,c,d)` key space.
1882#[allow(clippy::too_many_arguments)]
1883fn hrr_and_scatter<'a>(
1884    la: usize,
1885    lb: usize,
1886    lc: usize,
1887    ld: usize,
1888    n_f: usize,
1889    c_ef: &[f64],
1890    ab: Vec3,
1891    cd: Vec3,
1892    out: &mut [f64],
1893    bra: &mut Vec<f64>,
1894    mut prev: &'a mut Vec<f64>,
1895    mut cur: &'a mut Vec<f64>,
1896    mut kprev: &'a mut Vec<f64>,
1897    mut kcur: &'a mut Vec<f64>,
1898    // Shared Cartesian-triple table (degrees `0..=2·MAX_L`); HRR indexes it up to
1899    // `max(ne, nf)`. Passed in so it is not re-`collect`ed per quartet.
1900    tri: &[Vec<[usize; 3]>],
1901) {
1902    let (na, nb, nc, nd) = (n_cart(la), n_cart(lb), n_cart(lc), n_cart(ld));
1903    let ne = la + lb; // max bra (A-side) degree present in c_ef
1904    let nf = lc + ld; // max ket (C-side) degree present in c_ef
1905    let n_e = n_addr(ne);
1906
1907    // Ket index range actually used: f-degrees [lc..=nf]. The bra intermediate and
1908    // the ket base are keyed by the global `addr(f)` offset by `f_base`.
1909    let f_base = tri_below(lc);
1910    let nf_range = n_f - f_base;
1911
1912    // --- Bra HRR: bra[(ia·nb+ib)·nf_range + (addr(f)−f_base)] = (a_ia b_ib | f 0).
1913    // Reused arena buffers; bra is fully overwritten below, the rolling
1914    // layers in the region read — debug NaN-fill bra so a missed write surfaces.
1915    let bra_len = na * nb * nf_range;
1916    ensure_len(bra, bra_len);
1917    #[cfg(debug_assertions)]
1918    bra[..bra_len].fill(f64::NAN);
1919    // Two rolling b-degree layers, indexed [cart_index(b)·n_e + addr(a)].
1920    let layer_len = n_cart(lb) * n_e;
1921    ensure_len(prev, layer_len);
1922    ensure_len(cur, layer_len);
1923    for f_global in f_base..n_f {
1924        let jf = f_global - f_base;
1925        // Base b-degree 0 (one component, within-index 0): (a,0|f) = c_ef[a][f].
1926        for &a in tri[la..=ne].iter().flatten() {
1927            let ae = addr(a);
1928            prev[ae] = c_ef[ae * n_f + f_global];
1929        }
1930        for kb in 1..=lb {
1931            for (ibw, &b) in tri[kb].iter().enumerate() {
1932                let i = lower_axis(b);
1933                let b1w = cart_index(dec(b, i));
1934                // a-degrees that can still reach (la, lb): [la..=ne−kb].
1935                for &a in tri[la..=(ne - kb)].iter().flatten() {
1936                    let ae = addr(a);
1937                    let a1e = addr(inc(a, i));
1938                    cur[ibw * n_e + ae] = prev[b1w * n_e + a1e] + ab[i] * prev[b1w * n_e + ae];
1939                }
1940            }
1941            std::mem::swap(&mut prev, &mut cur);
1942        }
1943        // `prev` now holds b-degree lb (or the base when lb = 0): extract.
1944        for (ib, &b) in tri[lb].iter().enumerate() {
1945            let ibw = cart_index(b); // == ib (tri[lb] is in cart order)
1946            for (ia, &a) in tri[la].iter().enumerate() {
1947                bra[(ia * nb + ib) * nf_range + jf] = prev[ibw * n_e + addr(a)];
1948            }
1949        }
1950    }
1951
1952    // --- Ket HRR: per (ia,ib), build (c,d) and scatter into out.
1953    let klayer_len = n_cart(ld) * n_f;
1954    ensure_len(kprev, klayer_len);
1955    ensure_len(kcur, klayer_len);
1956    for ia in 0..na {
1957        for ib in 0..nb {
1958            let brarow = (ia * nb + ib) * nf_range;
1959            // Base d-degree 0: (ab|c,0) = bra[ia][ib][c], c-degrees [lc..=nf].
1960            for &c in tri[lc..=nf].iter().flatten() {
1961                let ce = addr(c);
1962                kprev[ce] = bra[brarow + (ce - f_base)];
1963            }
1964            for kd in 1..=ld {
1965                for (idw, &d) in tri[kd].iter().enumerate() {
1966                    let j = lower_axis(d);
1967                    let d1w = cart_index(dec(d, j));
1968                    for &c in tri[lc..=(nf - kd)].iter().flatten() {
1969                        let ce = addr(c);
1970                        let c1e = addr(inc(c, j));
1971                        kcur[idw * n_f + ce] =
1972                            kprev[d1w * n_f + c1e] + cd[j] * kprev[d1w * n_f + ce];
1973                    }
1974                }
1975                std::mem::swap(&mut kprev, &mut kcur);
1976            }
1977            // `kprev` now holds d-degree ld (or the base when ld = 0): scatter into out.
1978            for (ic, &c) in tri[lc].iter().enumerate() {
1979                let ce = addr(c);
1980                for id in 0..nd {
1981                    out[((ia * nb + ib) * nc + ic) * nd + id] += kprev[id * n_f + ce];
1982                }
1983            }
1984        }
1985    }
1986}
1987
1988// --- small helpers ---
1989
1990/// First nonzero axis of a triple (the lowering direction).
1991#[inline]
1992fn lower_axis(t: [usize; 3]) -> usize {
1993    if t[0] > 0 {
1994        0
1995    } else if t[1] > 0 {
1996        1
1997    } else {
1998        2
1999    }
2000}
2001
2002#[inline]
2003fn dec(mut t: [usize; 3], i: usize) -> [usize; 3] {
2004    t[i] -= 1;
2005    t
2006}
2007
2008#[inline]
2009fn inc(mut t: [usize; 3], i: usize) -> [usize; 3] {
2010    t[i] += 1;
2011    t
2012}
2013
2014#[inline]
2015fn combine(a: f64, ca: Vec3, b: f64, cb: Vec3, p: f64) -> Vec3 {
2016    [
2017        (a * ca[0] + b * cb[0]) / p,
2018        (a * ca[1] + b * cb[1]) / p,
2019        (a * ca[2] + b * cb[2]) / p,
2020    ]
2021}
2022
2023#[inline]
2024fn sub(u: Vec3, v: Vec3) -> Vec3 {
2025    [u[0] - v[0], u[1] - v[1], u[2] - v[2]]
2026}
2027
2028#[inline]
2029fn dist2(u: Vec3, v: Vec3) -> f64 {
2030    norm2(sub(u, v))
2031}
2032
2033#[inline]
2034fn norm2(u: Vec3) -> f64 {
2035    u[0] * u[0] + u[1] * u[1] + u[2] * u[2]
2036}
2037
2038#[cfg(test)]
2039mod tests {
2040    use super::*;
2041
2042    /// Single-primitive `ShellRef` helper.
2043    fn s(center: Vec3, l: usize, exp: f64) -> (Vec3, usize, [f64; 1], [f64; 1]) {
2044        (center, l, [exp], [1.0])
2045    }
2046
2047    /// (ss|ss) over four unit s primitives must equal the closed form
2048    /// `2π^{5/2}/(p q √(p+q)) · K_ab · K_cd · F_0(T)` — the same check the Rys
2049    /// engine passes, so the two share a verified base case.
2050    #[test]
2051    fn ssss_matches_closed_form() {
2052        let (ac, al, ae, acf) = s([0.0, 0.0, 0.0], 0, 0.8);
2053        let (bc, bl, be, bcf) = s([0.0, 0.0, 0.7], 0, 1.3);
2054        let (cc, cl, ce, ccf) = s([0.4, 0.0, 0.0], 0, 0.5);
2055        let (dc, dl, de, dcf) = s([0.0, 0.6, 0.2], 0, 1.1);
2056        let mut out = [0.0; 1];
2057        coulomb_shell_into(
2058            ShellRef {
2059                center: ac,
2060                l: al,
2061                exps: &ae,
2062                coeffs: &acf,
2063            },
2064            ShellRef {
2065                center: bc,
2066                l: bl,
2067                exps: &be,
2068                coeffs: &bcf,
2069            },
2070            ShellRef {
2071                center: cc,
2072                l: cl,
2073                exps: &ce,
2074                coeffs: &ccf,
2075            },
2076            ShellRef {
2077                center: dc,
2078                l: dl,
2079                exps: &de,
2080                coeffs: &dcf,
2081            },
2082            &mut out,
2083        );
2084
2085        let p = 0.8 + 1.3;
2086        let q = 0.5 + 1.1;
2087        let pcen = combine(0.8, ac, 1.3, bc, p);
2088        let qcen = combine(0.5, cc, 1.1, dc, q);
2089        let kab = (-(0.8 * 1.3 / p) * dist2(ac, bc)).exp();
2090        let kcd = (-(0.5 * 1.1 / q) * dist2(cc, dc)).exp();
2091        let rho = p * q / (p + q);
2092        let t = rho * dist2(pcen, qcen);
2093        let mut fm = [0.0; 1];
2094        boys_array(0, t, &mut fm);
2095        use std::f64::consts::PI;
2096        let expect = 2.0 * PI * PI * PI.sqrt() / (p * q * (p + q).sqrt()) * kab * kcd * fm[0];
2097        assert!(
2098            (out[0] - expect).abs() < 1e-14 * expect.abs(),
2099            "ssss {} vs {}",
2100            out[0],
2101            expect
2102        );
2103    }
2104
2105    use crate::os::Prim;
2106    use crate::rys::coulomb_into;
2107
2108    /// Build a single-primitive OS/HGP block for a quartet of given `(l, exp,
2109    /// center)`, for cross-checking against the Rys engine.
2110    #[allow(clippy::too_many_arguments)]
2111    fn os_block(
2112        la: usize,
2113        ea: f64,
2114        ca: Vec3,
2115        lb: usize,
2116        eb: f64,
2117        cb: Vec3,
2118        lc: usize,
2119        recc: f64,
2120        ccc: Vec3,
2121        ld: usize,
2122        ed: f64,
2123        cdd: Vec3,
2124    ) -> Vec<f64> {
2125        let mut out = vec![0.0; n_cart(la) * n_cart(lb) * n_cart(lc) * n_cart(ld)];
2126        let (ea1, eb1, ec1, ed1) = ([ea], [eb], [recc], [ed]);
2127        let one = [1.0];
2128        coulomb_shell_into(
2129            ShellRef {
2130                center: ca,
2131                l: la,
2132                exps: &ea1,
2133                coeffs: &one,
2134            },
2135            ShellRef {
2136                center: cb,
2137                l: lb,
2138                exps: &eb1,
2139                coeffs: &one,
2140            },
2141            ShellRef {
2142                center: ccc,
2143                l: lc,
2144                exps: &ec1,
2145                coeffs: &one,
2146            },
2147            ShellRef {
2148                center: cdd,
2149                l: ld,
2150                exps: &ed1,
2151                coeffs: &one,
2152            },
2153            &mut out,
2154        );
2155        out
2156    }
2157
2158    /// OS/HGP must reproduce the Rys engine element-for-element on a single
2159    /// primitive quartet, across a sweep of angular momenta (including the
2160    /// bug-prone mixed-high-L and "all four different L on four centers" cases).
2161    #[test]
2162    fn matches_rys_engine_primitive_sweep() {
2163        let ca = [0.0, 0.0, 0.0];
2164        let cb = [0.5, -0.3, 0.2];
2165        let cc = [-0.4, 0.6, -0.1];
2166        let cd = [0.2, 0.4, 0.8];
2167        let (ea, eb, ec, ed) = (0.9, 1.3, 0.7, 1.1);
2168
2169        let quartets = [
2170            (0usize, 0usize, 0usize, 0usize),
2171            (1, 0, 0, 0),
2172            (0, 0, 1, 0),
2173            (1, 1, 0, 0),
2174            (1, 0, 1, 0),
2175            (1, 1, 1, 1),
2176            (2, 0, 0, 0),
2177            (2, 1, 0, 0),
2178            (2, 1, 2, 1),
2179            (0, 1, 2, 3), // four different L on four centers
2180            (2, 2, 3, 3), // (dd|ff) mixed high-L
2181            (3, 0, 0, 1),
2182            // l_total ≥ 13 guards: the Boys aux index m reaches la+lb+lc+ld, which
2183            // exceeds 2·MAX_L. These panicked the under-sized fm buffer (a real bug
2184            // a ≤(dd|ff) sweep missed) and are kept as permanent regression guards.
2185            (4, 4, 4, 1), // l_total = 13
2186            (6, 6, 1, 0), // l_total = 13, two i-shells
2187            (3, 3, 3, 3), // ffff: the cancellation-heavy mixed case
2188        ];
2189        for (la, lb, lc, ld) in quartets {
2190            let os = os_block(la, ea, ca, lb, eb, cb, lc, ec, cc, ld, ed, cd);
2191            let mut rys = vec![0.0; os.len()];
2192            coulomb_into(
2193                Prim::new(ea, ca, la),
2194                Prim::new(eb, cb, lb),
2195                Prim::new(ec, cc, lc),
2196                Prim::new(ed, cd, ld),
2197                1.0,
2198                &mut rys,
2199            );
2200            assert_cross_engine_close(&os, &rys, &format!("({la}{lb}|{lc}{ld})"));
2201        }
2202    }
2203
2204    /// Assert two ERI blocks (here OS/HGP vs Rys, two independent f64 recurrences)
2205    /// agree under `|o − r| ≤ atol + rtol·|r|` with `atol = 1e-11`, `rtol = 1e-10`.
2206    /// The atol floor absorbs benign near-cancellation on structurally tiny
2207    /// components (where the *relative* OS/Rys difference reaches ~1e-8 at high L
2208    /// even though both engines are correct — their dominant elements agree to
2209    /// ~1e-12); the rtol catches any real divergence.
2210    fn assert_cross_engine_close(os: &[f64], rys: &[f64], tag: &str) {
2211        const ATOL: f64 = 1e-11;
2212        const RTOL: f64 = 1e-10;
2213        for (o, r) in os.iter().zip(rys.iter()) {
2214            assert!(
2215                (o - r).abs() <= ATOL + RTOL * r.abs(),
2216                "{tag} OS vs Rys mismatch: {o} vs {r} (Δ={:e})",
2217                (o - r).abs()
2218            );
2219        }
2220    }
2221
2222    /// HGP early contraction (VRR per primitive, HRR once in contracted space)
2223    /// must equal the per-primitive Rys sum for a genuinely **contracted**
2224    /// quartet.
2225    ///
2226    /// Note: doing the HRR after contraction is a *performance*
2227    /// choice, **not** a correctness fork. The HRR operator is linear with
2228    /// exponent-independent geometric coefficients (`A−B`, `C−D`), so
2229    /// `HRR(Σ_p w_p·[e0|f0]_p) = Σ_p w_p·HRR([e0|f0]_p)` identically — per-primitive
2230    /// and post-contraction HRR give the *same* answer. This remains a valuable
2231    /// end-to-end check (a broken VRR cross-term, a mis-applied contraction
2232    /// coefficient, or a wrong HRR sign would all fail it), but it does not
2233    /// distinguish HRR *ordering*, because the two orders are algebraically equal.
2234    #[test]
2235    fn contracted_quartet_matches_rys_sum() {
2236        // Two p shells and a d shell, each with multiple primitives, on distinct
2237        // centres (so HRR shift vectors A−B, C−D are all non-trivial).
2238        let ca = [0.0, 0.0, 0.0];
2239        let cb = [0.6, -0.2, 0.1];
2240        let cc = [-0.3, 0.5, -0.2];
2241        let cd = [0.2, 0.3, 0.7];
2242        let (la, lb, lc, ld) = (1usize, 1usize, 2usize, 0usize);
2243        let ax = [1.4, 0.45];
2244        let acf = [0.6, 0.5];
2245        let bx = [0.9, 0.3];
2246        let bcf = [0.55, 0.5];
2247        let cx = [1.1, 0.4];
2248        let ccf = [0.7, 0.4];
2249        let dx = [0.8];
2250        let dcf = [1.0];
2251
2252        let mut os = vec![0.0; n_cart(la) * n_cart(lb) * n_cart(lc) * n_cart(ld)];
2253        coulomb_shell_into(
2254            ShellRef {
2255                center: ca,
2256                l: la,
2257                exps: &ax,
2258                coeffs: &acf,
2259            },
2260            ShellRef {
2261                center: cb,
2262                l: lb,
2263                exps: &bx,
2264                coeffs: &bcf,
2265            },
2266            ShellRef {
2267                center: cc,
2268                l: lc,
2269                exps: &cx,
2270                coeffs: &ccf,
2271            },
2272            ShellRef {
2273                center: cd,
2274                l: ld,
2275                exps: &dx,
2276                coeffs: &dcf,
2277            },
2278            &mut os,
2279        );
2280
2281        // Reference: sum the Rys primitive engine over the quartet with the same
2282        // effective coefficients.
2283        let mut rys = vec![0.0; os.len()];
2284        for (&ea, &wa) in ax.iter().zip(acf.iter()) {
2285            for (&eb, &wb) in bx.iter().zip(bcf.iter()) {
2286                for (&ec, &wc) in cx.iter().zip(ccf.iter()) {
2287                    for (&ed, &wd) in dx.iter().zip(dcf.iter()) {
2288                        coulomb_into(
2289                            Prim::new(ea, ca, la),
2290                            Prim::new(eb, cb, lb),
2291                            Prim::new(ec, cc, lc),
2292                            Prim::new(ed, cd, ld),
2293                            wa * wb * wc * wd,
2294                            &mut rys,
2295                        );
2296                    }
2297                }
2298            }
2299        }
2300
2301        for (o, r) in os.iter().zip(rys.iter()) {
2302            assert!(
2303                (o - r).abs() <= 1e-10 * r.abs().max(1e-12),
2304                "contracted OS vs Rys mismatch: {o} vs {r}"
2305            );
2306        }
2307    }
2308
2309    /// The triple-address map must be a bijection onto `0..n_addr(L)`.
2310    #[test]
2311    fn addr_is_bijective() {
2312        for lmax in 0..=6 {
2313            let mut seen = vec![false; n_addr(lmax)];
2314            for n in 0..=lmax {
2315                for t in cart_components(n) {
2316                    let a = addr(t);
2317                    assert!(a < n_addr(lmax), "addr {a} out of range for lmax {lmax}");
2318                    assert!(!seen[a], "addr collision at {t:?}");
2319                    seen[a] = true;
2320                }
2321            }
2322            assert!(
2323                seen.iter().all(|&x| x),
2324                "addr not surjective at lmax {lmax}"
2325            );
2326        }
2327    }
2328}