Skip to main content

integral_core/
os_eri.rs

1//! Two-electron repulsion integrals (ERIs) by the Obara–Saika / Head-Gordon–Pople
2//! (OS/HGP) recurrence — the second ERI engine (see `ARCHITECTURE.md`, L1).
3//!
4//! A **vertical recurrence (VRR)** builds the
5//! intermediate `[e0|f0]^(m)` classes from `[00|00]^(m)` per primitive quartet,
6//! the primitives are **contracted** into AO space, and a **horizontal recurrence
7//! (HRR)** then shifts angular momentum `A→B` and `C→D` *in the contracted space*.
8//! Doing the HRR after contraction is the HGP "early contraction" trick: the
9//! geometry-only HRR runs **once per shell quartet** instead of once per
10//! primitive quartet, which is the win at high contraction degree.
11//!
12//! It is the engineering counterpart of the [`crate::rys`] engine: same Coulomb
13//! kernel, same row-major `(a,b,c,d)` block layout, validated element-for-element
14//! against it (and against an independent McMurchie–Davidson path).
15//!
16//! ## Method
17//!
18//! For a primitive quartet with exponents `α,β` on the bra centres `A,B` and
19//! `γ,δ` on the ket centres `C,D`, with `p=α+β`, `q=γ+δ`, `P,Q` the Gaussian
20//! product centres, `W=(pP+qQ)/(p+q)`, `ρ=pq/(p+q)`, and `T=ρ|P−Q|²`:
21//!
22//! ```text
23//!   [00|00]^(m) = 2π^{5/2}/(p q √(p+q)) · K_ab · K_cd · F_m(T)
24//! ```
25//!
26//! The VRR raises angular momentum on `A` (the bra, index `e`) and `C` (the ket,
27//! index `f`) using the standard OS ERI relations (Obara–Saika 1986; HGP 1988):
28//!
29//! ```text
30//!   [e+1_i,0|f0]^(m) = (P−A)_i[e0|f0]^(m) + (W−P)_i[e0|f0]^(m+1)
31//!       + e_i/2p ( [e−1_i,0|f0]^(m) − (q/(p+q))[e−1_i,0|f0]^(m+1) )
32//!       + f_i/2(p+q) [e0|f−1_i,0]^(m+1)
33//!   [e0|f+1_i,0]^(m) = (Q−C)_i[e0|f0]^(m) + (W−Q)_i[e0|f0]^(m+1)
34//!       + f_i/2q ( [e0|f−1_i,0]^(m) − (p/(p+q))[e0|f−1_i,0]^(m+1) )
35//!       + e_i/2(p+q) [e−1_i,0|f0]^(m+1)
36//! ```
37//!
38//! After contracting `[e0|f0]^(0)` over the primitive quartet, the HRR builds the
39//! Cartesian shell block:
40//!
41//! ```text
42//!   (a,b+1_i | f0) = (a+1_i,b | f0) + (A−B)_i (a,b | f0)   [bra, A→B]
43//!   (ab | c,d+1_i) = (ab | c+1_i,d) + (C−D)_i (ab | c,d)   [ket, C→D]
44//! ```
45//!
46//! ## Buffers (see `DESIGN_NOTES.md` D3, D12)
47//!
48//! The VRR table `[e0|f0]^(m)` is **3D and `W`-coupled** (the recurrence mixes
49//! Cartesian axes), so unlike the axis-separable 1D tables of the one-electron and
50//! Rys engines it cannot be a small fixed stack array — a `MAX_L` stack buffer
51//! would be tens of MB, and even a heap copy of the *full* table is ~41 MB at
52//! `(ii|ii)`. The current engine (D12) removes that:
53//!
54//! - **m-marching VRR** (`vrr_primitive`): the VRR never materialises the full
55//!   `e×f×m` table. It marches over the ket `f`-degree keeping only a rolling window
56//!   of **3 consecutive `f`-degree levels**, each **triangle-packed** in the Boys
57//!   index `m`. Resident VRR footprint is `3·max_k[n_cart(k)·slab_k]` (≈ 4.3 MB at
58//!   `(ii|ii)`, vs the old 41 MB single table), plus the `n_e·n_f` contracted table.
59//! - **flat-array HRR** (`hrr_and_scatter`): contiguous arrays indexed by `addr`
60//!   / [`cart_index`] with two rolling degree layers, replacing the former HashMap
61//!   memoisation.
62//! - **reusable arena** ([`EriScratch`]): all buffers are allocated once and reused
63//!   across quartets (thread-local by default), not re-allocated per quartet.
64//!
65//! All of this stays in safe Rust (`#![forbid(unsafe_code)]`) and reproduces the
66//! former full-table engine's values, cross-checked against the Rys engine and an
67//! independent McMurchie–Davidson path (`tests/eri_cross_algorithm.rs`).
68
69use std::cell::RefCell;
70
71use integral_math::am::{cart_components, cart_index, n_cart};
72use integral_math::boys::boys_array;
73
74use crate::os::{Vec3, MAX_L};
75
76/// A contracted Cartesian shell as seen by the HGP engine: its centre, angular
77/// momentum, and primitive `(exponent, effective-coefficient)` data. The
78/// coefficients are the driver's *effective* coefficients `d_i · N(α_i, l)` — the
79/// engine itself works on un-normalised monomials and only multiplies the four
80/// coefficients into the contracted accumulator.
81#[derive(Debug, Clone, Copy)]
82pub struct ShellRef<'a> {
83    /// Shell centre (bohr).
84    pub center: Vec3,
85    /// Angular momentum.
86    pub l: usize,
87    /// Primitive exponents.
88    pub exps: &'a [f64],
89    /// Effective contraction coefficients, aligned with `exps`.
90    pub coeffs: &'a [f64],
91}
92
93/// Number of Cartesian triples of degree `0..=lmax`: `(L+1)(L+2)(L+3)/6`.
94#[inline]
95fn n_addr(lmax: usize) -> usize {
96    (lmax + 1) * (lmax + 2) * (lmax + 3) / 6
97}
98
99/// Number of Cartesian triples of degree **strictly less than** `d`:
100/// `d(d+1)(d+2)/6`. This is the cumulative base of degree `d` in `addr`
101/// (`addr(t) = tri_below(deg) + cart_index(t)`) and `n_addr(L) = tri_below(L+1)`.
102#[inline]
103fn tri_below(d: usize) -> usize {
104    d * (d + 1) * (d + 2) / 6
105}
106
107/// Global address of a Cartesian triple within the cumulative `0..=` ordering,
108/// consistent with [`integral_math::am`]'s per-degree canonical order. Bijective onto
109/// `0..n_addr(deg)`.
110#[inline]
111fn addr(t: [usize; 3]) -> usize {
112    let n = t[0] + t[1] + t[2];
113    let base = n * (n + 1) * (n + 2) / 6;
114    // within-degree index: (n-lx)(n-lx+1)/2 + lz
115    let within = (n - t[0]) * (n - t[0] + 1) / 2 + t[2];
116    base + within
117}
118
119/// Reusable scratch arena for the OS/HGP ERI engine.
120///
121/// Holds the engine's working buffers so they are allocated **once and reused**
122/// across shell quartets, instead of a fresh heap allocation per quartet inside the
123/// O(n⁴) shell loop. All buffers are plain `Vec<f64>` grown on demand in safe Rust
124/// (`#![forbid(unsafe_code)]` holds); there is no shared mutable state, so each
125/// thread uses its own instance — the thread-local default behind
126/// [`coulomb_shell_into`], or one passed explicitly to
127/// [`coulomb_shell_into_scratch`] (the `&mut` makes cross-thread sharing a compile
128/// error).
129///
130/// **Memory-correctness.** `c_ef` is an accumulator and is **zeroed per quartet**.
131/// The VRR `levels` and HRR `bra`/layer buffers are fully overwritten in the region
132/// they are later read, so they need no functional zeroing — but in debug builds
133/// `levels` and `bra` are NaN-filled per quartet, so any accidental out-of-range
134/// read (e.g. outside the VRR `m`-triangle) poisons the output and trips the tests
135/// rather than silently reading a stale value from a previous quartet. Results are
136/// therefore independent of evaluation order and of arena reuse (asserted by
137/// `tests/arena.rs`).
138#[derive(Debug, Default, Clone)]
139pub struct EriScratch {
140    /// VRR rolling window: 3 triangle-packed f-degree levels (slot = `k % 3`).
141    levels: Vec<f64>,
142    /// Contracted `[e0|f0]^(0)` accumulator, shape `[n_e·n_f]`.
143    c_ef: Vec<f64>,
144    /// HRR bra intermediate `(a b | f 0)`, shape `[na·nb·nf_range]`.
145    bra: Vec<f64>,
146    /// HRR bra rolling `b`-degree layers.
147    bra_prev: Vec<f64>,
148    bra_cur: Vec<f64>,
149    /// HRR ket rolling `d`-degree layers.
150    ket_prev: Vec<f64>,
151    ket_cur: Vec<f64>,
152    /// Precomputed bra/ket primitive-pair data (combined exponent, product
153    /// centre, `K` prefactor, coeff product) — built once per shell quartet so
154    /// the per-pair `exp`/centre work is not repeated inside the deep primitive
155    /// loop. Reused across quartets like the other buffers.
156    bra_pairs: Vec<PrimPair>,
157    ket_pairs: Vec<PrimPair>,
158    /// Shared Cartesian-triple table `tri_all[d] = cart_components(d)` for every
159    /// degree the engine needs (`0..=2·MAX_L`). Built once and sliced for the VRR
160    /// `e`/`f` lists and the HRR triples, instead of re-`collect`ing per quartet.
161    tri_all: Vec<Vec<[usize; 3]>>,
162    /// Flat VRR offset table: `eoff[k·n_e + ae]` is `e`'s packed offset in level
163    /// `k`. Rebuilt per quartet into this reused buffer (no per-quartet `Vec<Vec>`
164    /// allocation); `slab[k]` is one `f`-component's packed size for level `k`.
165    eoff: Vec<usize>,
166    slab: Vec<usize>,
167}
168
169/// One primitive pair's reusable geometry/prefactor, shared by every primitive
170/// quartet that uses it: the combined exponent `zeta = α+β`, the Gaussian product
171/// centre, the overlap prefactor `K = exp(−αβ/ζ·|A−B|²)`, and the two contraction
172/// coefficients (`c1` outer, `c2` inner).
173///
174/// The coefficients are kept **un-multiplied** so the contracted `scale` can be
175/// formed in exactly the former evaluation order `((c_a·c_b)·c_c)·c_d` — making the
176/// engine's output bit-identical to the pre-precomputation code (the per-element
177/// and 8-fold-symmetry tests are floorless-relative and sensitive to the last ULP
178/// on near-cancellation elements).
179#[derive(Debug, Clone, Copy, Default)]
180struct PrimPair {
181    zeta: f64,
182    center: Vec3,
183    kappa: f64,
184    c1: f64,
185    c2: f64,
186    /// `1/(2ζ)` — the VRR `inv_2p`(bra)/`inv_2q`(ket) coefficient. Pure per-pair.
187    inv_2zeta: f64,
188    /// `centre − s1.centre` — the VRR `P−A`(bra)/`Q−C`(ket) shift. Pure per-pair.
189    r1: Vec3,
190}
191
192/// Fill `out` with the [`PrimPair`] data for every primitive combination of two
193/// shells (outer `s1`, inner `s2`), matching the former in-loop computation
194/// element-for-element. The inter-centre distance is hoisted out of the pair loop.
195/// `inv_2zeta` and `r1` are the pair-only VRR coefficients (`1/2ζ` and the `P−A` /
196/// `Q−C` shift) that the kernel formerly recomputed on every primitive quartet.
197fn build_pairs(out: &mut Vec<PrimPair>, s1: ShellRef<'_>, s2: ShellRef<'_>) {
198    out.clear();
199    out.reserve(s1.exps.len() * s2.exps.len());
200    let d2 = dist2(s1.center, s2.center);
201    for (&e1, &c1) in s1.exps.iter().zip(s1.coeffs.iter()) {
202        for (&e2, &c2) in s2.exps.iter().zip(s2.coeffs.iter()) {
203            let zeta = e1 + e2;
204            let center = combine(e1, s1.center, e2, s2.center, zeta);
205            out.push(PrimPair {
206                zeta,
207                center,
208                kappa: (-(e1 * e2 / zeta) * d2).exp(),
209                c1,
210                c2,
211                inv_2zeta: 0.5 / zeta,
212                r1: sub(center, s1.center),
213            });
214        }
215    }
216}
217
218impl EriScratch {
219    /// A fresh, empty arena; it grows to fit on first use.
220    #[must_use]
221    pub fn new() -> Self {
222        Self::default()
223    }
224
225    /// Total `f64` elements currently held across all buffers — the resident
226    /// working set, for memory reporting/tests.
227    #[must_use]
228    pub fn resident_f64(&self) -> usize {
229        self.levels.len()
230            + self.c_ef.len()
231            + self.bra.len()
232            + self.bra_prev.len()
233            + self.bra_cur.len()
234            + self.ket_prev.len()
235            + self.ket_cur.len()
236    }
237
238    /// Largest single buffer (`f64` elements). Demonstrates the former ~41 MB
239    /// monolithic VRR `[e0|f0]^(m)` table is no longer resident: the m-marching
240    /// window keeps only `3·max_k[n_cart(k)·slab_k]`.
241    #[must_use]
242    pub fn largest_buffer_f64(&self) -> usize {
243        [
244            self.levels.len(),
245            self.c_ef.len(),
246            self.bra.len(),
247            self.bra_prev.len(),
248            self.bra_cur.len(),
249            self.ket_prev.len(),
250            self.ket_cur.len(),
251        ]
252        .into_iter()
253        .max()
254        .unwrap_or(0)
255    }
256}
257
258thread_local! {
259    /// Per-thread default arena backing [`coulomb_shell_into`].
260    static ERI_SCRATCH: RefCell<EriScratch> = RefCell::new(EriScratch::new());
261}
262
263/// Grow `v` to at least `n` elements (reusing existing capacity); never shrinks.
264#[inline]
265fn ensure_len(v: &mut Vec<f64>, n: usize) {
266    if v.len() < n {
267        v.resize(n, 0.0);
268    }
269}
270
271/// `ensure_len` for the `usize` offset/slab buffers.
272#[inline]
273fn ensure_usize(v: &mut Vec<usize>, n: usize) {
274    if v.len() < n {
275        v.resize(n, 0);
276    }
277}
278
279/// Accumulate the contracted Coulomb block `(ab|cd)` for four shells into the
280/// row-major `out` block (shape `[n_cart(la)·n_cart(lb)·n_cart(lc)·n_cart(ld)]`,
281/// the same `(a,b,c,d)` layout as [`crate::rys::coulomb_into`]).
282///
283/// The engine owns the primitive-quartet loop (the HGP contraction happens between
284/// VRR and HRR), so unlike the Rys engine the driver calls this **once per shell
285/// quartet** with all primitives. This uses a **thread-local** [`EriScratch`]; for
286/// explicit control (e.g. one arena per worker thread) call
287/// [`coulomb_shell_into_scratch`].
288///
289/// # Panics
290/// Panics (in debug builds) if any `l > MAX_L` or `out` is too short.
291pub fn coulomb_shell_into(
292    a: ShellRef<'_>,
293    b: ShellRef<'_>,
294    c: ShellRef<'_>,
295    d: ShellRef<'_>,
296    out: &mut [f64],
297) {
298    ERI_SCRATCH.with(|s| coulomb_shell_into_scratch(&mut s.borrow_mut(), a, b, c, d, out));
299}
300
301/// Like [`coulomb_shell_into`] but evaluates into the caller-provided
302/// [`EriScratch`], reused across quartets to avoid per-quartet heap allocation. Use
303/// **one instance per thread** (sharing a `&mut EriScratch` across threads is a
304/// compile error); the result is bit-identical regardless of which arena is used or
305/// what it last held.
306///
307/// # Panics
308/// Panics (in debug builds) if any `l > MAX_L` or `out` is too short.
309pub fn coulomb_shell_into_scratch(
310    scratch: &mut EriScratch,
311    a: ShellRef<'_>,
312    b: ShellRef<'_>,
313    c: ShellRef<'_>,
314    d: ShellRef<'_>,
315    out: &mut [f64],
316) {
317    let (la, lb, lc, ld) = (a.l, b.l, c.l, d.l);
318    debug_assert!(
319        la <= MAX_L && lb <= MAX_L && lc <= MAX_L && ld <= MAX_L,
320        "angular momentum exceeds MAX_L"
321    );
322    let (na, nb, nc, nd) = (n_cart(la), n_cart(lb), n_cart(lc), n_cart(ld));
323    debug_assert!(out.len() >= na * nb * nc * nd, "ERI output block too short");
324
325    let ne = la + lb; // max bra (A-side) degree built by VRR
326    let nf = lc + ld; // max ket (C-side) degree built by VRR
327    let l_total = ne + nf;
328    let n_e = n_addr(ne);
329    let n_f = n_addr(nf);
330
331    let EriScratch {
332        levels,
333        c_ef,
334        bra,
335        bra_prev,
336        bra_cur,
337        ket_prev,
338        ket_cur,
339        bra_pairs,
340        ket_pairs,
341        tri_all,
342        eoff,
343        slab,
344    } = scratch;
345
346    // Fast path: (ss|ss). With no angular momentum to raise, the whole quartet is a
347    // single contracted [00|00]^(0). Skip the VRR/HRR machinery (the dead raise
348    // coefficients, the `levels` buffer round-trip, the per-primitive call) and sum
349    // straight into a register. Bit-identical to the general path: same `pref`/`t`
350    // expressions, same `((c_a·c_b)·c_c)·c_d` scale, same `scale·(pref·F_0)` term,
351    // same bra-outer/ket-inner order — just without the scaffolding.
352    if l_total == 0 {
353        build_pairs(bra_pairs, a, b);
354        build_pairs(ket_pairs, c, d);
355        let two_pi_2_5 =
356            2.0 * std::f64::consts::PI * std::f64::consts::PI * std::f64::consts::PI.sqrt();
357        let mut acc = 0.0;
358        let mut f0 = [0.0f64; 1];
359        for bra in bra_pairs.iter() {
360            let bc = bra.c1 * bra.c2;
361            for ket in ket_pairs.iter() {
362                let (p, q) = (bra.zeta, ket.zeta);
363                let pq = p + q;
364                let t = (p * q / pq) * dist2(bra.center, ket.center);
365                let pref = two_pi_2_5 / (p * q * pq.sqrt()) * bra.kappa * ket.kappa;
366                boys_array(0, t, &mut f0);
367                acc += (bc * ket.c1) * ket.c2 * (pref * f0[0]);
368            }
369        }
370        out[0] += acc;
371        return;
372    }
373
374    // Contracted [e0|f0]^(0): zeroed per quartet (it is a `+=` accumulator).
375    ensure_len(c_ef, n_e * n_f);
376    c_ef[..n_e * n_f].fill(0.0);
377
378    // Shared Cartesian-triple table, built once (degrees `0..=2·MAX_L` cover every
379    // `ne`/`nf`/`max(ne,nf)` the engine reaches) and sliced below — no per-quartet
380    // `cart_components` allocation.
381    if tri_all.len() <= 2 * MAX_L {
382        *tri_all = (0..=2 * MAX_L).map(cart_components).collect();
383    }
384
385    // Precompute the bra/ket primitive-pair data once (the `exp` prefactor and
386    // product centre depend only on the pair, not the quartet), then loop over the
387    // pair lists. The previous code recomputed the ket pair's `exp`/centre on every
388    // bra primitive — `na·nb` times too many. Bra-outer / ket-inner preserves the
389    // exact accumulation order into `c_ef`.
390    build_pairs(bra_pairs, a, b);
391    build_pairs(ket_pairs, c, d);
392
393    // VRR. The small classes (`ne, nf ≤ 2` — l_total ≤ 4, which carries ~87% of a
394    // cc-pVDZ-style build) dispatch to the monomorphized kernels, where the whole
395    // structural walk (triples, addresses, branches, m-ranges) is resolved at
396    // compile time; everything else runs the general m-marching `vrr_primitive`.
397    // `(0,0)` already returned through the (ss|ss) fast path above.
398    match (ne, nf) {
399        (0, 1) => contract_class::<0, 1>(bra_pairs, ket_pairs, c_ef),
400        (1, 0) => contract_class::<1, 0>(bra_pairs, ket_pairs, c_ef),
401        (1, 1) => contract_class::<1, 1>(bra_pairs, ket_pairs, c_ef),
402        (0, 2) => contract_class::<0, 2>(bra_pairs, ket_pairs, c_ef),
403        (2, 0) => contract_class::<2, 0>(bra_pairs, ket_pairs, c_ef),
404        (1, 2) => contract_class::<1, 2>(bra_pairs, ket_pairs, c_ef),
405        (2, 1) => contract_class::<2, 1>(bra_pairs, ket_pairs, c_ef),
406        (2, 2) => contract_class::<2, 2>(bra_pairs, ket_pairs, c_ef),
407        _ => {
408            // m-marching VRR scratch. Instead of the full `n_e·n_f·nm`
409            // table (~41 MB at `(ii|ii)`), keep only a rolling window of **3 consecutive
410            // f-degree levels**, each **triangle-packed** in the Boys index `m`: for an
411            // `f`-degree `k`, an `e` of degree `de` stores only `m ∈ 0..=(l_total−de−k)`.
412            // Level `k` lives in slot `k % 3` of `levels`; `slab[k]` is one `f`-component's
413            // packed size and `eoff[k·n_e + ae]` is `e`'s offset within it (flat, reused).
414            ensure_usize(slab, nf + 1);
415            ensure_usize(eoff, (nf + 1) * n_e);
416            // `k` indexes both `slab` and the flat `eoff` block base `k·n_e`, so a range
417            // loop is the clear form here.
418            #[allow(clippy::needless_range_loop)]
419            for k in 0..=nf {
420                let base = k * n_e;
421                let mut run = 0usize;
422                let mut ae = 0usize;
423                for de in 0..=ne {
424                    let mlen = l_total - de - k + 1; // ≥ 1: de + k ≤ ne + nf = l_total
425                    for _ in 0..n_cart(de) {
426                        eoff[base + ae] = run;
427                        ae += 1;
428                        run += mlen;
429                    }
430                }
431                slab[k] = run;
432            }
433            let maxlevel = (0..=nf).map(|k| n_cart(k) * slab[k]).max().unwrap_or(1);
434            ensure_len(levels, 3 * maxlevel);
435            // Debug guard: poison the reused VRR window so any out-of-triangle / stale read
436            // surfaces as a NaN in the output (caught by the golden + cross-engine tests),
437            // instead of silently using a previous quartet's value.
438            #[cfg(debug_assertions)]
439            levels[..3 * maxlevel].fill(f64::NAN);
440
441            // Reborrow the offset/triple buffers as shared slices for the read-only kernel.
442            let (eoff_s, slab_s, tri_s) = (&eoff[..], &slab[..], &tri_all[..]);
443            for bra in bra_pairs.iter() {
444                let bra_coef = bra.c1 * bra.c2; // = c_a·c_b, hoisted out of the ket loop
445                for ket in ket_pairs.iter() {
446                    // ((c_a·c_b)·c_c)·c_d — the former left-to-right product, bit for bit.
447                    let scale = (bra_coef * ket.c1) * ket.c2;
448                    vrr_primitive(
449                        bra.zeta,
450                        ket.zeta,
451                        bra.center,
452                        ket.center,
453                        bra.kappa,
454                        ket.kappa,
455                        bra.r1,
456                        ket.r1,
457                        bra.inv_2zeta,
458                        ket.inv_2zeta,
459                        ne,
460                        nf,
461                        l_total,
462                        n_e,
463                        n_f,
464                        maxlevel,
465                        eoff_s,
466                        slab_s,
467                        tri_s,
468                        levels,
469                        scale,
470                        c_ef,
471                    );
472                }
473            }
474        }
475    }
476
477    // HRR in contracted space, then scatter the block.
478    let ab = sub(a.center, b.center); // A − B
479    let cd = sub(c.center, d.center); // C − D
480    hrr_and_scatter(
481        la,
482        lb,
483        lc,
484        ld,
485        n_f,
486        c_ef,
487        ab,
488        cd,
489        out,
490        bra,
491        bra_prev,
492        bra_cur,
493        ket_prev,
494        ket_cur,
495        &tri_all[..],
496    );
497}
498
499/// Cartesian triples of degree 1 and 2 in [`cart_components`] order, as `const`
500/// tables so the monomorphized small-class VRR kernels fully unroll over them
501/// (the structural walk — axes, addresses, `has2`/cross branches — then resolves
502/// at compile time instead of being re-derived on every primitive quartet).
503const TRI1: [[usize; 3]; 3] = [[1, 0, 0], [0, 1, 0], [0, 0, 1]];
504const TRI2: [[usize; 3]; 6] = [
505    [2, 0, 0],
506    [1, 1, 0],
507    [1, 0, 1],
508    [0, 2, 0],
509    [0, 1, 1],
510    [0, 0, 2],
511];
512/// The e-components `0..n_addr(2)` in [`addr`] order with their degree —
513/// the bra index set of the small-class kernels (`NE ≤ 2`).
514const E_COMP: [([usize; 3], usize); 10] = [
515    ([0, 0, 0], 0),
516    ([1, 0, 0], 1),
517    ([0, 1, 0], 1),
518    ([0, 0, 1], 1),
519    ([2, 0, 0], 2),
520    ([1, 1, 0], 2),
521    ([1, 0, 1], 2),
522    ([0, 2, 0], 2),
523    ([0, 1, 1], 2),
524    ([0, 0, 2], 2),
525];
526
527/// Contract a whole shell quartet of VRR shape `(NE, NF)` (with `NE, NF ≤ 2`)
528/// into `c_ef` using the monomorphized small-class kernel [`vrr_small`].
529///
530/// Same bra-outer / ket-inner primitive loop and the same left-to-right
531/// `((c_a·c_b)·c_c)·c_d` scale as the general path, so the accumulation into
532/// `c_ef` is bit-identical. The VRR working tables live here (overwritten per
533/// primitive quartet in the region read, like the general path's `levels`
534/// arena) so their one-time zero-init is per shell quartet, not per primitive.
535fn contract_class<const NE: usize, const NF: usize>(
536    bra_pairs: &[PrimPair],
537    ket_pairs: &[PrimPair],
538    c_ef: &mut [f64],
539) {
540    // Worst-case (NE,NF)=(2,2) sizes: e-table 10 components × m ∈ 0..=4;
541    // f-degree-1 level 10×3 × m ∈ 0..=3; f-degree-2 level 10×6 × m ∈ 0..=2.
542    let mut ea = [0.0f64; 50];
543    let mut eb1 = [0.0f64; 120];
544    let mut eb2 = [0.0f64; 180];
545    for bra in bra_pairs {
546        let bra_coef = bra.c1 * bra.c2;
547        for ket in ket_pairs {
548            let scale = (bra_coef * ket.c1) * ket.c2;
549            vrr_small::<NE, NF>(bra, ket, scale, &mut ea, &mut eb1, &mut eb2, c_ef);
550        }
551    }
552}
553
554/// One primitive quartet of the OS VRR for the small classes `NE, NF ≤ 2`,
555/// monomorphized per `(NE, NF)` — the per-L-class "generated kernel" idea of
556/// libcint/libint expressed through const generics: every loop bound, Cartesian
557/// triple, address, and `has2`/cross branch below is a compile-time constant in
558/// each instantiation, so the emitted code is a straight unrolled FMA chain.
559///
560/// **Bit-identical to [`vrr_primitive`] by construction**: the same recurrence
561/// expressions with the same term order and association — base
562/// `pref·F_m`, raise `pa_i·s1[m] + wp_i·s1[m+1]`, then `+= coef2·(s2[m] −
563/// (q/pq)·s2[m+1])`, then `+= cross·s3[m+1]` — the same m-ranges
564/// (`m ≤ l_total − de − k`), and the same `c_ef[ae·n_f + af] += scale·v`
565/// extraction. Only the storage differs (fixed per-degree tables instead of the
566/// triangle-packed rolling window), which does not touch the arithmetic.
567/// Guarded by the full-tensor XOR fingerprint and the golden/cross-engine tests.
568#[inline(always)]
569#[allow(clippy::too_many_arguments)]
570fn vrr_small<const NE: usize, const NF: usize>(
571    bra: &PrimPair,
572    ket: &PrimPair,
573    scale: f64,
574    ea: &mut [f64; 50],
575    eb1: &mut [f64; 120],
576    eb2: &mut [f64; 180],
577    c_ef: &mut [f64],
578) {
579    let lt = NE + NF;
580    let n_e = n_addr(NE);
581    let n_f = n_addr(NF);
582    let (p, q) = (bra.zeta, ket.zeta);
583    let (pc, qc) = (bra.center, ket.center);
584    let pa = bra.r1; // P − A
585    let qcen = ket.r1; // Q − C
586    let (inv_2p, inv_2q) = (bra.inv_2zeta, ket.inv_2zeta);
587
588    // Scalar header — identical expressions to `vrr_primitive`.
589    let pq = p + q;
590    let rho = p * q / pq;
591    let pq_vec = sub(pc, qc); // P − Q
592    let t = rho * norm2(pq_vec);
593    let w = [
594        (p * pc[0] + q * qc[0]) / pq,
595        (p * pc[1] + q * qc[1]) / pq,
596        (p * pc[2] + q * qc[2]) / pq,
597    ];
598    let wp = sub(w, pc); // W − P
599    let wq = sub(w, qc); // W − Q
600    use std::f64::consts::PI;
601    let pref = 2.0 * PI * PI * PI.sqrt() / (p * q * pq.sqrt()) * bra.kappa * ket.kappa;
602    let mut fm = [0.0f64; 5];
603    boys_array(lt, t, &mut fm[..=lt]);
604    for m in 0..=lt {
605        ea[m] = pref * fm[m];
606    }
607
608    let inv_2pq = 0.5 / pq;
609    let q_over_pq = q / pq;
610    let p_over_pq = p / pq;
611
612    // Phase A — bra ladder: [e0|00]^(m), e-degrees 1..=NE, m ≤ lt − de.
613    if NE >= 1 {
614        for (iw, te) in TRI1.iter().enumerate() {
615            let i = lower_axis(*te);
616            // Source is [0,0,0] at addr 0; no has2 term at degree 1.
617            for m in 0..=(lt - 1) {
618                ea[(1 + iw) * 5 + m] = pa[i] * ea[m] + wp[i] * ea[m + 1];
619            }
620        }
621    }
622    if NE >= 2 {
623        for (iw, te) in TRI2.iter().enumerate() {
624            let i = lower_axis(*te);
625            let s1 = addr(dec(*te, i));
626            let has2 = te[i] >= 2;
627            let coef2 = ((te[i] - 1) as f64) * inv_2p;
628            let s2 = if has2 { addr(dec(dec(*te, i), i)) } else { 0 };
629            for m in 0..=(lt - 2) {
630                let mut v = pa[i] * ea[s1 * 5 + m] + wp[i] * ea[s1 * 5 + m + 1];
631                if has2 {
632                    v += coef2 * (ea[s2 * 5 + m] - q_over_pq * ea[s2 * 5 + m + 1]);
633                }
634                ea[(4 + iw) * 5 + m] = v;
635            }
636        }
637    }
638    // Extract f-degree 0: contract m = 0 of every e-component.
639    for ae in 0..n_e {
640        c_ef[ae * n_f] += scale * ea[ae * 5];
641    }
642
643    // Phase B — ket raise, f-degree 1: m ≤ lt − de − 1.
644    if NF >= 1 {
645        for (fw, tf) in TRI1.iter().enumerate() {
646            let j = lower_axis(*tf);
647            // Source f-component is [0,0,0] (level 0 = the phase-A table).
648            for ae in 0..n_e {
649                let (te, de) = E_COMP[ae];
650                let has_cross = te[j] >= 1;
651                let cross_coef = te[j] as f64 * inv_2pq;
652                let cs = if has_cross { addr(dec(te, j)) } else { 0 };
653                for m in 0..=(lt - de - 1) {
654                    let mut v = qcen[j] * ea[ae * 5 + m] + wq[j] * ea[ae * 5 + m + 1];
655                    if has_cross {
656                        v += cross_coef * ea[cs * 5 + m + 1];
657                    }
658                    eb1[(ae * 3 + fw) * 4 + m] = v;
659                }
660            }
661        }
662        for ae in 0..n_e {
663            for fw in 0..3 {
664                c_ef[ae * n_f + 1 + fw] += scale * eb1[(ae * 3 + fw) * 4];
665            }
666        }
667    }
668
669    // Phase B — ket raise, f-degree 2: m ≤ lt − de − 2.
670    if NF >= 2 {
671        for (fw, tf) in TRI2.iter().enumerate() {
672            let j = lower_axis(*tf);
673            let f1 = dec(*tf, j);
674            let lf1 = cart_index(f1); // f-component index in the degree-1 level
675            let has2 = tf[j] >= 2; // its k−2 source is then [0,0,0] (phase A)
676            let coef2 = ((tf[j] - 1) as f64) * inv_2q;
677            for ae in 0..n_e {
678                let (te, de) = E_COMP[ae];
679                let has_cross = te[j] >= 1;
680                let cross_coef = te[j] as f64 * inv_2pq;
681                let cs = if has_cross { addr(dec(te, j)) } else { 0 };
682                for m in 0..=(lt - de - 2) {
683                    let mut v = qcen[j] * eb1[(ae * 3 + lf1) * 4 + m]
684                        + wq[j] * eb1[(ae * 3 + lf1) * 4 + m + 1];
685                    if has2 {
686                        v += coef2 * (ea[ae * 5 + m] - p_over_pq * ea[ae * 5 + m + 1]);
687                    }
688                    if has_cross {
689                        v += cross_coef * eb1[(cs * 3 + lf1) * 4 + m + 1];
690                    }
691                    eb2[(ae * 6 + fw) * 3 + m] = v;
692                }
693            }
694        }
695        for ae in 0..n_e {
696            for fw in 0..6 {
697                c_ef[ae * n_f + 4 + fw] += scale * eb2[(ae * 6 + fw) * 3];
698            }
699        }
700    }
701}
702
703/// Build `[e0|f0]^(m)` for one primitive quartet by **m-marching** over the `f`
704/// degree and accumulate `scale · [e0|f0]^(0)` into the contracted `c_ef` table.
705///
706/// Algebraically identical to the former full-table VRR (same OS recurrences, same
707/// term order) — only the storage changes, so it is bit-identical
708/// (B0 golden snapshot). `levels` holds 3 rolling `f`-degree levels in slots
709/// `k % 3`, each `maxlevel` long; within slot `k`, element `[e0|f0]^(m)` for the
710/// `f`-component with local index `lf` (= `cart_index(f)`) lives at
711/// `(k%3)·maxlevel + lf·slab[k] + eoff[k][addr(e)] + m`. The `m`-rows are
712/// triangle-packed (`m ∈ 0..=(l_total−de−k)`), so the full `[e0|f0]^(m)` table is
713/// never resident. Each level's `m=0` slice is extracted into `c_ef` before its
714/// buffer is recycled.
715#[allow(clippy::too_many_arguments)]
716fn vrr_primitive(
717    p: f64,
718    q: f64,
719    pc: Vec3,
720    qc: Vec3,
721    kab: f64,
722    kcd: f64,
723    pa: Vec3,    // P − A  (bra pair, precomputed)
724    qcen: Vec3,  // Q − C  (ket pair, precomputed)
725    inv_2p: f64, // 1/(2p) (bra pair, precomputed)
726    inv_2q: f64, // 1/(2q) (ket pair, precomputed)
727    ne: usize,
728    nf: usize,
729    l_total: usize,
730    n_e: usize,
731    n_f: usize,
732    maxlevel: usize,
733    eoff: &[usize],
734    slab: &[usize],
735    tri: &[Vec<[usize; 3]>],
736    levels: &mut [f64],
737    scale: f64,
738    c_ef: &mut [f64],
739) {
740    let pq = p + q;
741    let rho = p * q / pq;
742    let pq_vec = sub(pc, qc); // P − Q
743    let t = rho * norm2(pq_vec);
744    let w = [
745        (p * pc[0] + q * qc[0]) / pq,
746        (p * pc[1] + q * qc[1]) / pq,
747        (p * pc[2] + q * qc[2]) / pq,
748    ];
749    // `pa` (P−A) and `qcen` (Q−C) are now passed in (pure per-pair).
750    let wp = sub(w, pc); // W − P
751    let wq = sub(w, qc); // W − Q
752
753    // Index of `[e0|f0]^(m)` (f-component local index `lf`, e at `addr(e)=ae`) in
754    // the rolling buffer for f-degree `k`. Reads/writes `levels` separately (f64 is
755    // Copy, so indexing never holds a borrow), so the same `levels` slice serves as
756    // source and destination across the distinct slots `k`, `k−1`, `k−2`.
757    let off = |k: usize, lf: usize, ae: usize, m: usize| -> usize {
758        (k % 3) * maxlevel + lf * slab[k] + eoff[k * n_e + ae] + m
759    };
760
761    // Base [00|00]^(m) = pref · F_m(T). The Boys index m reaches
762    // l_total = la+lb+lc+ld ≤ 4·MAX_L (NOT 2·MAX_L — that is the per-electron
763    // bound; an ERI couples both electrons).
764    use std::f64::consts::PI;
765    let pref = 2.0 * PI * PI * PI.sqrt() / (p * q * pq.sqrt()) * kab * kcd;
766    let mut fm = [0.0f64; 4 * MAX_L + 1];
767    boys_array(l_total, t, &mut fm[..=l_total]);
768    for m in 0..=l_total {
769        levels[off(0, 0, 0, m)] = pref * fm[m];
770    }
771
772    // `inv_2p` / `inv_2q` are now passed in (pure per-pair).
773    let inv_2pq = 0.5 / pq;
774    let q_over_pq = q / pq;
775    let p_over_pq = p / pq;
776
777    // Phase A — bra ladder (f-degree 0, level 0): build [e0|00]^(m) by raising e.
778    // Level 0 lives in slot 0, f-component 0, so the buffer offset of `[e0|00]^(m)`
779    // is `eoff0[addr(e)] + m`; the `e`-offsets are hoisted out of the hot m-loop.
780    let eoff0 = &eoff[..n_e];
781    for (na, te_list) in tri.iter().enumerate().take(ne + 1).skip(1) {
782        for &te in te_list {
783            let i = lower_axis(te);
784            let s1a = addr(dec(te, i));
785            let mmax = l_total - na;
786            let coef2 = ((te[i] - 1) as f64) * inv_2p; // (e_i of source)/2p
787            let has2 = te[i] >= 2;
788            let s1_0 = eoff0[s1a];
789            let s2_0 = if has2 {
790                eoff0[addr(dec(dec(te, i), i))]
791            } else {
792                0
793            };
794            let dst_0 = eoff0[addr(te)];
795            for m in 0..=mmax {
796                let mut v = pa[i] * levels[s1_0 + m] + wp[i] * levels[s1_0 + m + 1];
797                if has2 {
798                    v += coef2 * (levels[s2_0 + m] - q_over_pq * levels[s2_0 + m + 1]);
799                }
800                levels[dst_0 + m] = v;
801            }
802        }
803    }
804    // Extract level 0 (f = [0,0,0], local 0, addr 0): contract m = 0.
805    for ae in 0..n_e {
806        c_ef[ae * n_f] += scale * levels[off(0, 0, ae, 0)];
807    }
808
809    // Phase B — ket raise: march f-degree k = 1..=nf, keeping levels {k, k−1, k−2}.
810    // Slot/slab/eoff for the three active levels are hoisted out of the hot loops; the
811    // m-loop touches only `levels[base + m]` (base computed once per (tf, te)), matching
812    // the former full-table inner-loop cost. The k−2 level is read only when the axis
813    // power `tf[j] ≥ 2` (which forces k ≥ 2), so its `k−2` indices never underflow.
814    for k in 1..=nf {
815        let slot_k = (k % 3) * maxlevel;
816        let slot_k1 = ((k - 1) % 3) * maxlevel;
817        let (slab_k, slab_k1) = (slab[k], slab[k - 1]);
818        let eoff_k = &eoff[k * n_e..];
819        let eoff_k1 = &eoff[(k - 1) * n_e..];
820        let (slot_k2, slab_k2, eoff_k2): (usize, usize, &[usize]) = if k >= 2 {
821            (
822                ((k - 2) % 3) * maxlevel,
823                slab[k - 2],
824                &eoff[(k - 2) * n_e..],
825            )
826        } else {
827            (0, 0, &[])
828        };
829        for &tf in &tri[k] {
830            let j = lower_axis(tf);
831            let f1 = dec(tf, j);
832            let lf1 = cart_index(f1); // local index in level k−1
833            let coef2 = ((tf[j] - 1) as f64) * inv_2q;
834            let has2 = tf[j] >= 2;
835            let lf2 = if has2 { cart_index(dec(f1, j)) } else { 0 }; // level k−2
836            let lf = cart_index(tf); // local index in level k
837            let sk = slot_k + lf * slab_k;
838            let sk1 = slot_k1 + lf1 * slab_k1;
839            let sk2 = slot_k2 + lf2 * slab_k2;
840            for (nadeg, te_list) in tri.iter().enumerate().take(ne + 1) {
841                for &te in te_list {
842                    let ea = addr(te);
843                    // cross term e_j/2(p+q) · [e−1_j,0|f−1_j,0]^(m+1)
844                    let has_cross = te[j] >= 1;
845                    let (cross_coef, cross_0) = if has_cross {
846                        (te[j] as f64 * inv_2pq, sk1 + eoff_k1[addr(dec(te, j))])
847                    } else {
848                        (0.0, 0)
849                    };
850                    let mmax = l_total - nadeg - k;
851                    let dst_0 = sk + eoff_k[ea];
852                    let src1_0 = sk1 + eoff_k1[ea];
853                    let src2_0 = if has2 { sk2 + eoff_k2[ea] } else { 0 };
854                    for m in 0..=mmax {
855                        let mut v = qcen[j] * levels[src1_0 + m] + wq[j] * levels[src1_0 + m + 1];
856                        if has2 {
857                            v += coef2 * (levels[src2_0 + m] - p_over_pq * levels[src2_0 + m + 1]);
858                        }
859                        if has_cross {
860                            v += cross_coef * levels[cross_0 + m + 1];
861                        }
862                        levels[dst_0 + m] = v;
863                    }
864                }
865            }
866        }
867        // Extract level k: contract each f-component's m = 0 slice into c_ef.
868        for &tf in &tri[k] {
869            let lf = cart_index(tf);
870            let af = addr(tf);
871            for ae in 0..n_e {
872                c_ef[ae * n_f + af] += scale * levels[off(k, lf, ae, 0)];
873            }
874        }
875    }
876}
877
878/// HRR in contracted space (bra `A→B`, then ket `C→D`) followed by scatter into
879/// the row-major output block.
880///
881/// Flat-array HGP horizontal recurrence, replacing the earlier
882/// HashMap memoisation. The recurrence math is unchanged — same `lower_axis`
883/// choice, same `(A−B)/(C−D)` geometric shifts, same `[raised] + shift·[same]`
884/// term order — so it is **bit-identical** to the recursive version (guarded by
885/// the B0 golden snapshot). Only the storage changes: contiguous arrays indexed
886/// by `addr` / [`cart_index`] instead of hashed `(triple,…)` keys.
887///
888/// The bra recurrence `(a,b|f) = (a+1_i,b−1_i|f) + (A−B)_i (a,b−1_i|f)` (axis
889/// `i = lower_axis(b)`) is built by ascending `b`-degree with two rolling layers,
890/// independently per ket index `f` (a spectator), into the `bra` intermediate.
891/// The ket recurrence `(ab|c,d) = (ab|c+1_j,d−1_j) + (C−D)_j (ab|c,d−1_j)` is the
892/// symmetric pass over `c,d`, run per `(a,b)` output pair and scattered into `out`.
893/// Resident scratch is `O(na·nb·nf_range)` for the bra intermediate plus two small
894/// rolling layers — never the dense `(a,b,f)`/`(a,b,c,d)` key space.
895#[allow(clippy::too_many_arguments)]
896fn hrr_and_scatter<'a>(
897    la: usize,
898    lb: usize,
899    lc: usize,
900    ld: usize,
901    n_f: usize,
902    c_ef: &[f64],
903    ab: Vec3,
904    cd: Vec3,
905    out: &mut [f64],
906    bra: &mut Vec<f64>,
907    mut prev: &'a mut Vec<f64>,
908    mut cur: &'a mut Vec<f64>,
909    mut kprev: &'a mut Vec<f64>,
910    mut kcur: &'a mut Vec<f64>,
911    // Shared Cartesian-triple table (degrees `0..=2·MAX_L`); HRR indexes it up to
912    // `max(ne, nf)`. Passed in so it is not re-`collect`ed per quartet.
913    tri: &[Vec<[usize; 3]>],
914) {
915    let (na, nb, nc, nd) = (n_cart(la), n_cart(lb), n_cart(lc), n_cart(ld));
916    let ne = la + lb; // max bra (A-side) degree present in c_ef
917    let nf = lc + ld; // max ket (C-side) degree present in c_ef
918    let n_e = n_addr(ne);
919
920    // Ket index range actually used: f-degrees [lc..=nf]. The bra intermediate and
921    // the ket base are keyed by the global `addr(f)` offset by `f_base`.
922    let f_base = tri_below(lc);
923    let nf_range = n_f - f_base;
924
925    // --- Bra HRR: bra[(ia·nb+ib)·nf_range + (addr(f)−f_base)] = (a_ia b_ib | f 0).
926    // Reused arena buffers; bra is fully overwritten below, the rolling
927    // layers in the region read — debug NaN-fill bra so a missed write surfaces.
928    let bra_len = na * nb * nf_range;
929    ensure_len(bra, bra_len);
930    #[cfg(debug_assertions)]
931    bra[..bra_len].fill(f64::NAN);
932    // Two rolling b-degree layers, indexed [cart_index(b)·n_e + addr(a)].
933    let layer_len = n_cart(lb) * n_e;
934    ensure_len(prev, layer_len);
935    ensure_len(cur, layer_len);
936    for f_global in f_base..n_f {
937        let jf = f_global - f_base;
938        // Base b-degree 0 (one component, within-index 0): (a,0|f) = c_ef[a][f].
939        for &a in tri[la..=ne].iter().flatten() {
940            let ae = addr(a);
941            prev[ae] = c_ef[ae * n_f + f_global];
942        }
943        for kb in 1..=lb {
944            for (ibw, &b) in tri[kb].iter().enumerate() {
945                let i = lower_axis(b);
946                let b1w = cart_index(dec(b, i));
947                // a-degrees that can still reach (la, lb): [la..=ne−kb].
948                for &a in tri[la..=(ne - kb)].iter().flatten() {
949                    let ae = addr(a);
950                    let a1e = addr(inc(a, i));
951                    cur[ibw * n_e + ae] = prev[b1w * n_e + a1e] + ab[i] * prev[b1w * n_e + ae];
952                }
953            }
954            std::mem::swap(&mut prev, &mut cur);
955        }
956        // `prev` now holds b-degree lb (or the base when lb = 0): extract.
957        for (ib, &b) in tri[lb].iter().enumerate() {
958            let ibw = cart_index(b); // == ib (tri[lb] is in cart order)
959            for (ia, &a) in tri[la].iter().enumerate() {
960                bra[(ia * nb + ib) * nf_range + jf] = prev[ibw * n_e + addr(a)];
961            }
962        }
963    }
964
965    // --- Ket HRR: per (ia,ib), build (c,d) and scatter into out.
966    let klayer_len = n_cart(ld) * n_f;
967    ensure_len(kprev, klayer_len);
968    ensure_len(kcur, klayer_len);
969    for ia in 0..na {
970        for ib in 0..nb {
971            let brarow = (ia * nb + ib) * nf_range;
972            // Base d-degree 0: (ab|c,0) = bra[ia][ib][c], c-degrees [lc..=nf].
973            for &c in tri[lc..=nf].iter().flatten() {
974                let ce = addr(c);
975                kprev[ce] = bra[brarow + (ce - f_base)];
976            }
977            for kd in 1..=ld {
978                for (idw, &d) in tri[kd].iter().enumerate() {
979                    let j = lower_axis(d);
980                    let d1w = cart_index(dec(d, j));
981                    for &c in tri[lc..=(nf - kd)].iter().flatten() {
982                        let ce = addr(c);
983                        let c1e = addr(inc(c, j));
984                        kcur[idw * n_f + ce] =
985                            kprev[d1w * n_f + c1e] + cd[j] * kprev[d1w * n_f + ce];
986                    }
987                }
988                std::mem::swap(&mut kprev, &mut kcur);
989            }
990            // `kprev` now holds d-degree ld (or the base when ld = 0): scatter into out.
991            for (ic, &c) in tri[lc].iter().enumerate() {
992                let ce = addr(c);
993                for id in 0..nd {
994                    out[((ia * nb + ib) * nc + ic) * nd + id] += kprev[id * n_f + ce];
995                }
996            }
997        }
998    }
999}
1000
1001// --- small helpers ---
1002
1003/// First nonzero axis of a triple (the lowering direction).
1004#[inline]
1005fn lower_axis(t: [usize; 3]) -> usize {
1006    if t[0] > 0 {
1007        0
1008    } else if t[1] > 0 {
1009        1
1010    } else {
1011        2
1012    }
1013}
1014
1015#[inline]
1016fn dec(mut t: [usize; 3], i: usize) -> [usize; 3] {
1017    t[i] -= 1;
1018    t
1019}
1020
1021#[inline]
1022fn inc(mut t: [usize; 3], i: usize) -> [usize; 3] {
1023    t[i] += 1;
1024    t
1025}
1026
1027#[inline]
1028fn combine(a: f64, ca: Vec3, b: f64, cb: Vec3, p: f64) -> Vec3 {
1029    [
1030        (a * ca[0] + b * cb[0]) / p,
1031        (a * ca[1] + b * cb[1]) / p,
1032        (a * ca[2] + b * cb[2]) / p,
1033    ]
1034}
1035
1036#[inline]
1037fn sub(u: Vec3, v: Vec3) -> Vec3 {
1038    [u[0] - v[0], u[1] - v[1], u[2] - v[2]]
1039}
1040
1041#[inline]
1042fn dist2(u: Vec3, v: Vec3) -> f64 {
1043    norm2(sub(u, v))
1044}
1045
1046#[inline]
1047fn norm2(u: Vec3) -> f64 {
1048    u[0] * u[0] + u[1] * u[1] + u[2] * u[2]
1049}
1050
1051#[cfg(test)]
1052mod tests {
1053    use super::*;
1054
1055    /// Single-primitive `ShellRef` helper.
1056    fn s(center: Vec3, l: usize, exp: f64) -> (Vec3, usize, [f64; 1], [f64; 1]) {
1057        (center, l, [exp], [1.0])
1058    }
1059
1060    /// (ss|ss) over four unit s primitives must equal the closed form
1061    /// `2π^{5/2}/(p q √(p+q)) · K_ab · K_cd · F_0(T)` — the same check the Rys
1062    /// engine passes, so the two share a verified base case.
1063    #[test]
1064    fn ssss_matches_closed_form() {
1065        let (ac, al, ae, acf) = s([0.0, 0.0, 0.0], 0, 0.8);
1066        let (bc, bl, be, bcf) = s([0.0, 0.0, 0.7], 0, 1.3);
1067        let (cc, cl, ce, ccf) = s([0.4, 0.0, 0.0], 0, 0.5);
1068        let (dc, dl, de, dcf) = s([0.0, 0.6, 0.2], 0, 1.1);
1069        let mut out = [0.0; 1];
1070        coulomb_shell_into(
1071            ShellRef {
1072                center: ac,
1073                l: al,
1074                exps: &ae,
1075                coeffs: &acf,
1076            },
1077            ShellRef {
1078                center: bc,
1079                l: bl,
1080                exps: &be,
1081                coeffs: &bcf,
1082            },
1083            ShellRef {
1084                center: cc,
1085                l: cl,
1086                exps: &ce,
1087                coeffs: &ccf,
1088            },
1089            ShellRef {
1090                center: dc,
1091                l: dl,
1092                exps: &de,
1093                coeffs: &dcf,
1094            },
1095            &mut out,
1096        );
1097
1098        let p = 0.8 + 1.3;
1099        let q = 0.5 + 1.1;
1100        let pcen = combine(0.8, ac, 1.3, bc, p);
1101        let qcen = combine(0.5, cc, 1.1, dc, q);
1102        let kab = (-(0.8 * 1.3 / p) * dist2(ac, bc)).exp();
1103        let kcd = (-(0.5 * 1.1 / q) * dist2(cc, dc)).exp();
1104        let rho = p * q / (p + q);
1105        let t = rho * dist2(pcen, qcen);
1106        let mut fm = [0.0; 1];
1107        boys_array(0, t, &mut fm);
1108        use std::f64::consts::PI;
1109        let expect = 2.0 * PI * PI * PI.sqrt() / (p * q * (p + q).sqrt()) * kab * kcd * fm[0];
1110        assert!(
1111            (out[0] - expect).abs() < 1e-14 * expect.abs(),
1112            "ssss {} vs {}",
1113            out[0],
1114            expect
1115        );
1116    }
1117
1118    use crate::os::Prim;
1119    use crate::rys::coulomb_into;
1120
1121    /// Build a single-primitive OS/HGP block for a quartet of given `(l, exp,
1122    /// center)`, for cross-checking against the Rys engine.
1123    #[allow(clippy::too_many_arguments)]
1124    fn os_block(
1125        la: usize,
1126        ea: f64,
1127        ca: Vec3,
1128        lb: usize,
1129        eb: f64,
1130        cb: Vec3,
1131        lc: usize,
1132        recc: f64,
1133        ccc: Vec3,
1134        ld: usize,
1135        ed: f64,
1136        cdd: Vec3,
1137    ) -> Vec<f64> {
1138        let mut out = vec![0.0; n_cart(la) * n_cart(lb) * n_cart(lc) * n_cart(ld)];
1139        let (ea1, eb1, ec1, ed1) = ([ea], [eb], [recc], [ed]);
1140        let one = [1.0];
1141        coulomb_shell_into(
1142            ShellRef {
1143                center: ca,
1144                l: la,
1145                exps: &ea1,
1146                coeffs: &one,
1147            },
1148            ShellRef {
1149                center: cb,
1150                l: lb,
1151                exps: &eb1,
1152                coeffs: &one,
1153            },
1154            ShellRef {
1155                center: ccc,
1156                l: lc,
1157                exps: &ec1,
1158                coeffs: &one,
1159            },
1160            ShellRef {
1161                center: cdd,
1162                l: ld,
1163                exps: &ed1,
1164                coeffs: &one,
1165            },
1166            &mut out,
1167        );
1168        out
1169    }
1170
1171    /// OS/HGP must reproduce the Rys engine element-for-element on a single
1172    /// primitive quartet, across a sweep of angular momenta (including the
1173    /// bug-prone mixed-high-L and "all four different L on four centers" cases).
1174    #[test]
1175    fn matches_rys_engine_primitive_sweep() {
1176        let ca = [0.0, 0.0, 0.0];
1177        let cb = [0.5, -0.3, 0.2];
1178        let cc = [-0.4, 0.6, -0.1];
1179        let cd = [0.2, 0.4, 0.8];
1180        let (ea, eb, ec, ed) = (0.9, 1.3, 0.7, 1.1);
1181
1182        let quartets = [
1183            (0usize, 0usize, 0usize, 0usize),
1184            (1, 0, 0, 0),
1185            (0, 0, 1, 0),
1186            (1, 1, 0, 0),
1187            (1, 0, 1, 0),
1188            (1, 1, 1, 1),
1189            (2, 0, 0, 0),
1190            (2, 1, 0, 0),
1191            (2, 1, 2, 1),
1192            (0, 1, 2, 3), // four different L on four centers
1193            (2, 2, 3, 3), // (dd|ff) mixed high-L
1194            (3, 0, 0, 1),
1195            // l_total ≥ 13 guards: the Boys aux index m reaches la+lb+lc+ld, which
1196            // exceeds 2·MAX_L. These panicked the under-sized fm buffer (a real bug
1197            // a ≤(dd|ff) sweep missed) and are kept as permanent regression guards.
1198            (4, 4, 4, 1), // l_total = 13
1199            (6, 6, 1, 0), // l_total = 13, two i-shells
1200            (3, 3, 3, 3), // ffff: the cancellation-heavy mixed case
1201        ];
1202        for (la, lb, lc, ld) in quartets {
1203            let os = os_block(la, ea, ca, lb, eb, cb, lc, ec, cc, ld, ed, cd);
1204            let mut rys = vec![0.0; os.len()];
1205            coulomb_into(
1206                Prim::new(ea, ca, la),
1207                Prim::new(eb, cb, lb),
1208                Prim::new(ec, cc, lc),
1209                Prim::new(ed, cd, ld),
1210                1.0,
1211                &mut rys,
1212            );
1213            assert_cross_engine_close(&os, &rys, &format!("({la}{lb}|{lc}{ld})"));
1214        }
1215    }
1216
1217    /// Assert two ERI blocks (here OS/HGP vs Rys, two independent f64 recurrences)
1218    /// agree under `|o − r| ≤ atol + rtol·|r|` with `atol = 1e-11`, `rtol = 1e-10`.
1219    /// The atol floor absorbs benign near-cancellation on structurally tiny
1220    /// components (where the *relative* OS/Rys difference reaches ~1e-8 at high L
1221    /// even though both engines are correct — their dominant elements agree to
1222    /// ~1e-12); the rtol catches any real divergence.
1223    fn assert_cross_engine_close(os: &[f64], rys: &[f64], tag: &str) {
1224        const ATOL: f64 = 1e-11;
1225        const RTOL: f64 = 1e-10;
1226        for (o, r) in os.iter().zip(rys.iter()) {
1227            assert!(
1228                (o - r).abs() <= ATOL + RTOL * r.abs(),
1229                "{tag} OS vs Rys mismatch: {o} vs {r} (Δ={:e})",
1230                (o - r).abs()
1231            );
1232        }
1233    }
1234
1235    /// HGP early contraction (VRR per primitive, HRR once in contracted space)
1236    /// must equal the per-primitive Rys sum for a genuinely **contracted**
1237    /// quartet.
1238    ///
1239    /// Note (`DESIGN_NOTES.md` D6): doing the HRR after contraction is a *performance*
1240    /// choice, **not** a correctness fork. The HRR operator is linear with
1241    /// exponent-independent geometric coefficients (`A−B`, `C−D`), so
1242    /// `HRR(Σ_p w_p·[e0|f0]_p) = Σ_p w_p·HRR([e0|f0]_p)` identically — per-primitive
1243    /// and post-contraction HRR give the *same* answer. This remains a valuable
1244    /// end-to-end check (a broken VRR cross-term, a mis-applied contraction
1245    /// coefficient, or a wrong HRR sign would all fail it), but it does not
1246    /// distinguish HRR *ordering*, because the two orders are algebraically equal.
1247    #[test]
1248    fn contracted_quartet_matches_rys_sum() {
1249        // Two p shells and a d shell, each with multiple primitives, on distinct
1250        // centres (so HRR shift vectors A−B, C−D are all non-trivial).
1251        let ca = [0.0, 0.0, 0.0];
1252        let cb = [0.6, -0.2, 0.1];
1253        let cc = [-0.3, 0.5, -0.2];
1254        let cd = [0.2, 0.3, 0.7];
1255        let (la, lb, lc, ld) = (1usize, 1usize, 2usize, 0usize);
1256        let ax = [1.4, 0.45];
1257        let acf = [0.6, 0.5];
1258        let bx = [0.9, 0.3];
1259        let bcf = [0.55, 0.5];
1260        let cx = [1.1, 0.4];
1261        let ccf = [0.7, 0.4];
1262        let dx = [0.8];
1263        let dcf = [1.0];
1264
1265        let mut os = vec![0.0; n_cart(la) * n_cart(lb) * n_cart(lc) * n_cart(ld)];
1266        coulomb_shell_into(
1267            ShellRef {
1268                center: ca,
1269                l: la,
1270                exps: &ax,
1271                coeffs: &acf,
1272            },
1273            ShellRef {
1274                center: cb,
1275                l: lb,
1276                exps: &bx,
1277                coeffs: &bcf,
1278            },
1279            ShellRef {
1280                center: cc,
1281                l: lc,
1282                exps: &cx,
1283                coeffs: &ccf,
1284            },
1285            ShellRef {
1286                center: cd,
1287                l: ld,
1288                exps: &dx,
1289                coeffs: &dcf,
1290            },
1291            &mut os,
1292        );
1293
1294        // Reference: sum the Rys primitive engine over the quartet with the same
1295        // effective coefficients.
1296        let mut rys = vec![0.0; os.len()];
1297        for (&ea, &wa) in ax.iter().zip(acf.iter()) {
1298            for (&eb, &wb) in bx.iter().zip(bcf.iter()) {
1299                for (&ec, &wc) in cx.iter().zip(ccf.iter()) {
1300                    for (&ed, &wd) in dx.iter().zip(dcf.iter()) {
1301                        coulomb_into(
1302                            Prim::new(ea, ca, la),
1303                            Prim::new(eb, cb, lb),
1304                            Prim::new(ec, cc, lc),
1305                            Prim::new(ed, cd, ld),
1306                            wa * wb * wc * wd,
1307                            &mut rys,
1308                        );
1309                    }
1310                }
1311            }
1312        }
1313
1314        for (o, r) in os.iter().zip(rys.iter()) {
1315            assert!(
1316                (o - r).abs() <= 1e-10 * r.abs().max(1e-12),
1317                "contracted OS vs Rys mismatch: {o} vs {r}"
1318            );
1319        }
1320    }
1321
1322    /// The triple-address map must be a bijection onto `0..n_addr(L)`.
1323    #[test]
1324    fn addr_is_bijective() {
1325        for lmax in 0..=6 {
1326            let mut seen = vec![false; n_addr(lmax)];
1327            for n in 0..=lmax {
1328                for t in cart_components(n) {
1329                    let a = addr(t);
1330                    assert!(a < n_addr(lmax), "addr {a} out of range for lmax {lmax}");
1331                    assert!(!seen[a], "addr collision at {t:?}");
1332                    seen[a] = true;
1333                }
1334            }
1335            assert!(
1336                seen.iter().all(|&x| x),
1337                "addr not surjective at lmax {lmax}"
1338            );
1339        }
1340    }
1341}