Skip to main content

integral_core/
os_eri.rs

1//! Two-electron repulsion integrals (ERIs) by the Obara–Saika / Head-Gordon–Pople
2//! (OS/HGP) recurrence — the second ERI engine (see `ARCHITECTURE.md`, L1).
3//!
4//! A **vertical recurrence (VRR)** builds the
5//! intermediate `[e0|f0]^(m)` classes from `[00|00]^(m)` per primitive quartet,
6//! the primitives are **contracted** into AO space, and a **horizontal recurrence
7//! (HRR)** then shifts angular momentum `A→B` and `C→D` *in the contracted space*.
8//! Doing the HRR after contraction is the HGP "early contraction" trick: the
9//! geometry-only HRR runs **once per shell quartet** instead of once per
10//! primitive quartet, which is the win at high contraction degree.
11//!
12//! It is the engineering counterpart of the [`crate::rys`] engine: same Coulomb
13//! kernel, same row-major `(a,b,c,d)` block layout, validated element-for-element
14//! against it (and against an independent McMurchie–Davidson path).
15//!
16//! ## Method
17//!
18//! For a primitive quartet with exponents `α,β` on the bra centres `A,B` and
19//! `γ,δ` on the ket centres `C,D`, with `p=α+β`, `q=γ+δ`, `P,Q` the Gaussian
20//! product centres, `W=(pP+qQ)/(p+q)`, `ρ=pq/(p+q)`, and `T=ρ|P−Q|²`:
21//!
22//! ```text
23//!   [00|00]^(m) = 2π^{5/2}/(p q √(p+q)) · K_ab · K_cd · F_m(T)
24//! ```
25//!
26//! The VRR raises angular momentum on `A` (the bra, index `e`) and `C` (the ket,
27//! index `f`) using the standard OS ERI relations (Obara–Saika 1986; HGP 1988):
28//!
29//! ```text
30//!   [e+1_i,0|f0]^(m) = (P−A)_i[e0|f0]^(m) + (W−P)_i[e0|f0]^(m+1)
31//!       + e_i/2p ( [e−1_i,0|f0]^(m) − (q/(p+q))[e−1_i,0|f0]^(m+1) )
32//!       + f_i/2(p+q) [e0|f−1_i,0]^(m+1)
33//!   [e0|f+1_i,0]^(m) = (Q−C)_i[e0|f0]^(m) + (W−Q)_i[e0|f0]^(m+1)
34//!       + f_i/2q ( [e0|f−1_i,0]^(m) − (p/(p+q))[e0|f−1_i,0]^(m+1) )
35//!       + e_i/2(p+q) [e−1_i,0|f0]^(m+1)
36//! ```
37//!
38//! After contracting `[e0|f0]^(0)` over the primitive quartet, the HRR builds the
39//! Cartesian shell block:
40//!
41//! ```text
42//!   (a,b+1_i | f0) = (a+1_i,b | f0) + (A−B)_i (a,b | f0)   [bra, A→B]
43//!   (ab | c,d+1_i) = (ab | c+1_i,d) + (C−D)_i (ab | c,d)   [ket, C→D]
44//! ```
45//!
46//! ## Buffers (see `DESIGN_NOTES.md` D3, D12)
47//!
48//! The VRR table `[e0|f0]^(m)` is **3D and `W`-coupled** (the recurrence mixes
49//! Cartesian axes), so unlike the axis-separable 1D tables of the one-electron and
50//! Rys engines it cannot be a small fixed stack array — a `MAX_L` stack buffer
51//! would be tens of MB, and even a heap copy of the *full* table is ~41 MB at
52//! `(ii|ii)`. The current engine (D12) removes that:
53//!
54//! - **m-marching VRR** (`vrr_primitive`): the VRR never materialises the full
55//!   `e×f×m` table. It marches over the ket `f`-degree keeping only a rolling window
56//!   of **3 consecutive `f`-degree levels**, each **triangle-packed** in the Boys
57//!   index `m`. Resident VRR footprint is `3·max_k[n_cart(k)·slab_k]` (≈ 4.3 MB at
58//!   `(ii|ii)`, vs the old 41 MB single table), plus the `n_e·n_f` contracted table.
59//! - **flat-array HRR** (`hrr_and_scatter`): contiguous arrays indexed by `addr`
60//!   / [`cart_index`] with two rolling degree layers, replacing the former HashMap
61//!   memoisation.
62//! - **reusable arena** ([`EriScratch`]): all buffers are allocated once and reused
63//!   across quartets (thread-local by default), not re-allocated per quartet.
64//!
65//! All of this stays in safe Rust (`#![forbid(unsafe_code)]`) and reproduces the
66//! former full-table engine's values, cross-checked against the Rys engine and an
67//! independent McMurchie–Davidson path (`tests/eri_cross_algorithm.rs`).
68
69use std::cell::RefCell;
70
71use integral_math::am::{cart_components, cart_index, n_cart};
72use integral_math::boys::boys_array;
73
74use crate::os::{Vec3, MAX_L};
75
76/// A contracted Cartesian shell as seen by the HGP engine: its centre, angular
77/// momentum, and primitive `(exponent, effective-coefficient)` data. The
78/// coefficients are the driver's *effective* coefficients `d_i · N(α_i, l)` — the
79/// engine itself works on un-normalised monomials and only multiplies the four
80/// coefficients into the contracted accumulator.
81#[derive(Debug, Clone, Copy)]
82pub struct ShellRef<'a> {
83    /// Shell centre (bohr).
84    pub center: Vec3,
85    /// Angular momentum.
86    pub l: usize,
87    /// Primitive exponents.
88    pub exps: &'a [f64],
89    /// Effective contraction coefficients, aligned with `exps`.
90    pub coeffs: &'a [f64],
91}
92
93/// Number of Cartesian triples of degree `0..=lmax`: `(L+1)(L+2)(L+3)/6`.
94#[inline]
95fn n_addr(lmax: usize) -> usize {
96    (lmax + 1) * (lmax + 2) * (lmax + 3) / 6
97}
98
99/// Number of Cartesian triples of degree **strictly less than** `d`:
100/// `d(d+1)(d+2)/6`. This is the cumulative base of degree `d` in `addr`
101/// (`addr(t) = tri_below(deg) + cart_index(t)`) and `n_addr(L) = tri_below(L+1)`.
102#[inline]
103fn tri_below(d: usize) -> usize {
104    d * (d + 1) * (d + 2) / 6
105}
106
107/// Global address of a Cartesian triple within the cumulative `0..=` ordering,
108/// consistent with [`integral_math::am`]'s per-degree canonical order. Bijective onto
109/// `0..n_addr(deg)`.
110#[inline]
111fn addr(t: [usize; 3]) -> usize {
112    let n = t[0] + t[1] + t[2];
113    let base = n * (n + 1) * (n + 2) / 6;
114    // within-degree index: (n-lx)(n-lx+1)/2 + lz
115    let within = (n - t[0]) * (n - t[0] + 1) / 2 + t[2];
116    base + within
117}
118
119/// Reusable scratch arena for the OS/HGP ERI engine.
120///
121/// Holds the engine's working buffers so they are allocated **once and reused**
122/// across shell quartets, instead of a fresh heap allocation per quartet inside the
123/// O(n⁴) shell loop. All buffers are plain `Vec<f64>` grown on demand in safe Rust
124/// (`#![forbid(unsafe_code)]` holds); there is no shared mutable state, so each
125/// thread uses its own instance — the thread-local default behind
126/// [`coulomb_shell_into`], or one passed explicitly to
127/// [`coulomb_shell_into_scratch`] (the `&mut` makes cross-thread sharing a compile
128/// error).
129///
130/// **Memory-correctness.** `c_ef` is an accumulator and is **zeroed per quartet**.
131/// The VRR `levels` and HRR `bra`/layer buffers are fully overwritten in the region
132/// they are later read, so they need no functional zeroing — but in debug builds
133/// `levels` and `bra` are NaN-filled per quartet, so any accidental out-of-range
134/// read (e.g. outside the VRR `m`-triangle) poisons the output and trips the tests
135/// rather than silently reading a stale value from a previous quartet. Results are
136/// therefore independent of evaluation order and of arena reuse (asserted by
137/// `tests/arena.rs`).
138#[derive(Debug, Default, Clone)]
139pub struct EriScratch {
140    /// VRR rolling window: 3 triangle-packed f-degree levels (slot = `k % 3`).
141    levels: Vec<f64>,
142    /// Contracted `[e0|f0]^(0)` accumulator, shape `[n_e·n_f]`.
143    c_ef: Vec<f64>,
144    /// HRR bra intermediate `(a b | f 0)`, shape `[na·nb·nf_range]`.
145    bra: Vec<f64>,
146    /// HRR bra rolling `b`-degree layers.
147    bra_prev: Vec<f64>,
148    bra_cur: Vec<f64>,
149    /// HRR ket rolling `d`-degree layers.
150    ket_prev: Vec<f64>,
151    ket_cur: Vec<f64>,
152}
153
154impl EriScratch {
155    /// A fresh, empty arena; it grows to fit on first use.
156    #[must_use]
157    pub fn new() -> Self {
158        Self::default()
159    }
160
161    /// Total `f64` elements currently held across all buffers — the resident
162    /// working set, for memory reporting/tests.
163    #[must_use]
164    pub fn resident_f64(&self) -> usize {
165        self.levels.len()
166            + self.c_ef.len()
167            + self.bra.len()
168            + self.bra_prev.len()
169            + self.bra_cur.len()
170            + self.ket_prev.len()
171            + self.ket_cur.len()
172    }
173
174    /// Largest single buffer (`f64` elements). Demonstrates the former ~41 MB
175    /// monolithic VRR `[e0|f0]^(m)` table is no longer resident: the m-marching
176    /// window keeps only `3·max_k[n_cart(k)·slab_k]`.
177    #[must_use]
178    pub fn largest_buffer_f64(&self) -> usize {
179        [
180            self.levels.len(),
181            self.c_ef.len(),
182            self.bra.len(),
183            self.bra_prev.len(),
184            self.bra_cur.len(),
185            self.ket_prev.len(),
186            self.ket_cur.len(),
187        ]
188        .into_iter()
189        .max()
190        .unwrap_or(0)
191    }
192}
193
194thread_local! {
195    /// Per-thread default arena backing [`coulomb_shell_into`].
196    static ERI_SCRATCH: RefCell<EriScratch> = RefCell::new(EriScratch::new());
197}
198
199/// Grow `v` to at least `n` elements (reusing existing capacity); never shrinks.
200#[inline]
201fn ensure_len(v: &mut Vec<f64>, n: usize) {
202    if v.len() < n {
203        v.resize(n, 0.0);
204    }
205}
206
207/// Accumulate the contracted Coulomb block `(ab|cd)` for four shells into the
208/// row-major `out` block (shape `[n_cart(la)·n_cart(lb)·n_cart(lc)·n_cart(ld)]`,
209/// the same `(a,b,c,d)` layout as [`crate::rys::coulomb_into`]).
210///
211/// The engine owns the primitive-quartet loop (the HGP contraction happens between
212/// VRR and HRR), so unlike the Rys engine the driver calls this **once per shell
213/// quartet** with all primitives. This uses a **thread-local** [`EriScratch`]; for
214/// explicit control (e.g. one arena per worker thread) call
215/// [`coulomb_shell_into_scratch`].
216///
217/// # Panics
218/// Panics (in debug builds) if any `l > MAX_L` or `out` is too short.
219pub fn coulomb_shell_into(
220    a: ShellRef<'_>,
221    b: ShellRef<'_>,
222    c: ShellRef<'_>,
223    d: ShellRef<'_>,
224    out: &mut [f64],
225) {
226    ERI_SCRATCH.with(|s| coulomb_shell_into_scratch(&mut s.borrow_mut(), a, b, c, d, out));
227}
228
229/// Like [`coulomb_shell_into`] but evaluates into the caller-provided
230/// [`EriScratch`], reused across quartets to avoid per-quartet heap allocation. Use
231/// **one instance per thread** (sharing a `&mut EriScratch` across threads is a
232/// compile error); the result is bit-identical regardless of which arena is used or
233/// what it last held.
234///
235/// # Panics
236/// Panics (in debug builds) if any `l > MAX_L` or `out` is too short.
237pub fn coulomb_shell_into_scratch(
238    scratch: &mut EriScratch,
239    a: ShellRef<'_>,
240    b: ShellRef<'_>,
241    c: ShellRef<'_>,
242    d: ShellRef<'_>,
243    out: &mut [f64],
244) {
245    let (la, lb, lc, ld) = (a.l, b.l, c.l, d.l);
246    debug_assert!(
247        la <= MAX_L && lb <= MAX_L && lc <= MAX_L && ld <= MAX_L,
248        "angular momentum exceeds MAX_L"
249    );
250    let (na, nb, nc, nd) = (n_cart(la), n_cart(lb), n_cart(lc), n_cart(ld));
251    debug_assert!(out.len() >= na * nb * nc * nd, "ERI output block too short");
252
253    let ne = la + lb; // max bra (A-side) degree built by VRR
254    let nf = lc + ld; // max ket (C-side) degree built by VRR
255    let l_total = ne + nf;
256    let n_e = n_addr(ne);
257    let n_f = n_addr(nf);
258
259    let EriScratch {
260        levels,
261        c_ef,
262        bra,
263        bra_prev,
264        bra_cur,
265        ket_prev,
266        ket_cur,
267    } = scratch;
268
269    // Contracted [e0|f0]^(0): zeroed per quartet (it is a `+=` accumulator).
270    ensure_len(c_ef, n_e * n_f);
271    c_ef[..n_e * n_f].fill(0.0);
272
273    // Precompute the triples for each degree once (canonical order).
274    let tri_e: Vec<Vec<[usize; 3]>> = (0..=ne).map(cart_components).collect();
275    let tri_f: Vec<Vec<[usize; 3]>> = (0..=nf).map(cart_components).collect();
276
277    // m-marching VRR scratch. Instead of the full `n_e·n_f·nm`
278    // table (~41 MB at `(ii|ii)`), keep only a rolling window of **3 consecutive
279    // f-degree levels**, each **triangle-packed** in the Boys index `m`: for an
280    // `f`-degree `k`, an `e` of degree `de` stores only `m ∈ 0..=(l_total−de−k)`.
281    // Level `k` lives in slot `k % 3` of `levels`; `slab[k]` is one `f`-component's
282    // packed size and `eoff[k][ae]` is `e`'s offset within it (see `vrr_primitive`).
283    let mut eoff: Vec<Vec<usize>> = Vec::with_capacity(nf + 1);
284    let mut slab: Vec<usize> = Vec::with_capacity(nf + 1);
285    for k in 0..=nf {
286        let mut off = Vec::with_capacity(n_e);
287        let mut run = 0usize;
288        for de in 0..=ne {
289            let mlen = l_total - de - k + 1; // ≥ 1: de + k ≤ ne + nf = l_total
290            for _ in 0..n_cart(de) {
291                off.push(run);
292                run += mlen;
293            }
294        }
295        eoff.push(off);
296        slab.push(run);
297    }
298    let maxlevel = (0..=nf).map(|k| n_cart(k) * slab[k]).max().unwrap_or(1);
299    ensure_len(levels, 3 * maxlevel);
300    // Debug guard: poison the reused VRR window so any out-of-triangle / stale read
301    // surfaces as a NaN in the output (caught by the golden + cross-engine tests),
302    // instead of silently using a previous quartet's value.
303    #[cfg(debug_assertions)]
304    levels[..3 * maxlevel].fill(f64::NAN);
305
306    for (&ea, &ca) in a.exps.iter().zip(a.coeffs.iter()) {
307        for (&eb, &cb) in b.exps.iter().zip(b.coeffs.iter()) {
308            let p = ea + eb;
309            let pc = combine(ea, a.center, eb, b.center, p);
310            let kab = (-(ea * eb / p) * dist2(a.center, b.center)).exp();
311            for (&ec, &cc) in c.exps.iter().zip(c.coeffs.iter()) {
312                for (&ed, &cd) in d.exps.iter().zip(d.coeffs.iter()) {
313                    let q = ec + ed;
314                    let qc = combine(ec, c.center, ed, d.center, q);
315                    let kcd = (-(ec * ed / q) * dist2(c.center, d.center)).exp();
316                    let scale = ca * cb * cc * cd;
317                    vrr_primitive(
318                        p, q, pc, qc, kab, kcd, a.center, c.center, ne, nf, l_total, n_e, n_f,
319                        maxlevel, &eoff, &slab, &tri_e, &tri_f, levels, scale, c_ef,
320                    );
321                }
322            }
323        }
324    }
325
326    // HRR in contracted space, then scatter the block.
327    let ab = sub(a.center, b.center); // A − B
328    let cd = sub(c.center, d.center); // C − D
329    hrr_and_scatter(
330        la, lb, lc, ld, n_f, c_ef, ab, cd, out, bra, bra_prev, bra_cur, ket_prev, ket_cur,
331    );
332}
333
334/// Build `[e0|f0]^(m)` for one primitive quartet by **m-marching** over the `f`
335/// degree and accumulate `scale · [e0|f0]^(0)` into the contracted `c_ef` table.
336///
337/// Algebraically identical to the former full-table VRR (same OS recurrences, same
338/// term order) — only the storage changes, so it is bit-identical
339/// (B0 golden snapshot). `levels` holds 3 rolling `f`-degree levels in slots
340/// `k % 3`, each `maxlevel` long; within slot `k`, element `[e0|f0]^(m)` for the
341/// `f`-component with local index `lf` (= `cart_index(f)`) lives at
342/// `(k%3)·maxlevel + lf·slab[k] + eoff[k][addr(e)] + m`. The `m`-rows are
343/// triangle-packed (`m ∈ 0..=(l_total−de−k)`), so the full `[e0|f0]^(m)` table is
344/// never resident. Each level's `m=0` slice is extracted into `c_ef` before its
345/// buffer is recycled.
346#[allow(clippy::too_many_arguments)]
347fn vrr_primitive(
348    p: f64,
349    q: f64,
350    pc: Vec3,
351    qc: Vec3,
352    kab: f64,
353    kcd: f64,
354    a_center: Vec3,
355    c_center: Vec3,
356    ne: usize,
357    nf: usize,
358    l_total: usize,
359    n_e: usize,
360    n_f: usize,
361    maxlevel: usize,
362    eoff: &[Vec<usize>],
363    slab: &[usize],
364    tri_e: &[Vec<[usize; 3]>],
365    tri_f: &[Vec<[usize; 3]>],
366    levels: &mut [f64],
367    scale: f64,
368    c_ef: &mut [f64],
369) {
370    let pq = p + q;
371    let rho = p * q / pq;
372    let pq_vec = sub(pc, qc); // P − Q
373    let t = rho * norm2(pq_vec);
374    let w = [
375        (p * pc[0] + q * qc[0]) / pq,
376        (p * pc[1] + q * qc[1]) / pq,
377        (p * pc[2] + q * qc[2]) / pq,
378    ];
379    let pa = sub(pc, a_center); // P − A
380    let wp = sub(w, pc); // W − P
381    let qcen = sub(qc, c_center); // Q − C
382    let wq = sub(w, qc); // W − Q
383
384    // Index of `[e0|f0]^(m)` (f-component local index `lf`, e at `addr(e)=ae`) in
385    // the rolling buffer for f-degree `k`. Reads/writes `levels` separately (f64 is
386    // Copy, so indexing never holds a borrow), so the same `levels` slice serves as
387    // source and destination across the distinct slots `k`, `k−1`, `k−2`.
388    let off = |k: usize, lf: usize, ae: usize, m: usize| -> usize {
389        (k % 3) * maxlevel + lf * slab[k] + eoff[k][ae] + m
390    };
391
392    // Base [00|00]^(m) = pref · F_m(T). The Boys index m reaches
393    // l_total = la+lb+lc+ld ≤ 4·MAX_L (NOT 2·MAX_L — that is the per-electron
394    // bound; an ERI couples both electrons).
395    use std::f64::consts::PI;
396    let pref = 2.0 * PI * PI * PI.sqrt() / (p * q * pq.sqrt()) * kab * kcd;
397    let mut fm = [0.0f64; 4 * MAX_L + 1];
398    boys_array(l_total, t, &mut fm[..=l_total]);
399    for m in 0..=l_total {
400        levels[off(0, 0, 0, m)] = pref * fm[m];
401    }
402
403    let inv_2p = 0.5 / p;
404    let inv_2q = 0.5 / q;
405    let inv_2pq = 0.5 / pq;
406    let q_over_pq = q / pq;
407    let p_over_pq = p / pq;
408
409    // Phase A — bra ladder (f-degree 0, level 0): build [e0|00]^(m) by raising e.
410    // Level 0 lives in slot 0, f-component 0, so the buffer offset of `[e0|00]^(m)`
411    // is `eoff0[addr(e)] + m`; the `e`-offsets are hoisted out of the hot m-loop.
412    let eoff0 = &eoff[0];
413    for (na, te_list) in tri_e.iter().enumerate().take(ne + 1).skip(1) {
414        for &te in te_list {
415            let i = lower_axis(te);
416            let s1a = addr(dec(te, i));
417            let mmax = l_total - na;
418            let coef2 = ((te[i] - 1) as f64) * inv_2p; // (e_i of source)/2p
419            let has2 = te[i] >= 2;
420            let s1_0 = eoff0[s1a];
421            let s2_0 = if has2 {
422                eoff0[addr(dec(dec(te, i), i))]
423            } else {
424                0
425            };
426            let dst_0 = eoff0[addr(te)];
427            for m in 0..=mmax {
428                let mut v = pa[i] * levels[s1_0 + m] + wp[i] * levels[s1_0 + m + 1];
429                if has2 {
430                    v += coef2 * (levels[s2_0 + m] - q_over_pq * levels[s2_0 + m + 1]);
431                }
432                levels[dst_0 + m] = v;
433            }
434        }
435    }
436    // Extract level 0 (f = [0,0,0], local 0, addr 0): contract m = 0.
437    for ae in 0..n_e {
438        c_ef[ae * n_f] += scale * levels[off(0, 0, ae, 0)];
439    }
440
441    // Phase B — ket raise: march f-degree k = 1..=nf, keeping levels {k, k−1, k−2}.
442    // Slot/slab/eoff for the three active levels are hoisted out of the hot loops; the
443    // m-loop touches only `levels[base + m]` (base computed once per (tf, te)), matching
444    // the former full-table inner-loop cost. The k−2 level is read only when the axis
445    // power `tf[j] ≥ 2` (which forces k ≥ 2), so its `k−2` indices never underflow.
446    for k in 1..=nf {
447        let slot_k = (k % 3) * maxlevel;
448        let slot_k1 = ((k - 1) % 3) * maxlevel;
449        let (slab_k, slab_k1) = (slab[k], slab[k - 1]);
450        let eoff_k = &eoff[k];
451        let eoff_k1 = &eoff[k - 1];
452        let (slot_k2, slab_k2, eoff_k2): (usize, usize, &[usize]) = if k >= 2 {
453            (((k - 2) % 3) * maxlevel, slab[k - 2], &eoff[k - 2])
454        } else {
455            (0, 0, &[])
456        };
457        for &tf in &tri_f[k] {
458            let j = lower_axis(tf);
459            let f1 = dec(tf, j);
460            let lf1 = cart_index(f1); // local index in level k−1
461            let coef2 = ((tf[j] - 1) as f64) * inv_2q;
462            let has2 = tf[j] >= 2;
463            let lf2 = if has2 { cart_index(dec(f1, j)) } else { 0 }; // level k−2
464            let lf = cart_index(tf); // local index in level k
465            let sk = slot_k + lf * slab_k;
466            let sk1 = slot_k1 + lf1 * slab_k1;
467            let sk2 = slot_k2 + lf2 * slab_k2;
468            for (nadeg, te_list) in tri_e.iter().enumerate().take(ne + 1) {
469                for &te in te_list {
470                    let ea = addr(te);
471                    // cross term e_j/2(p+q) · [e−1_j,0|f−1_j,0]^(m+1)
472                    let has_cross = te[j] >= 1;
473                    let (cross_coef, cross_0) = if has_cross {
474                        (te[j] as f64 * inv_2pq, sk1 + eoff_k1[addr(dec(te, j))])
475                    } else {
476                        (0.0, 0)
477                    };
478                    let mmax = l_total - nadeg - k;
479                    let dst_0 = sk + eoff_k[ea];
480                    let src1_0 = sk1 + eoff_k1[ea];
481                    let src2_0 = if has2 { sk2 + eoff_k2[ea] } else { 0 };
482                    for m in 0..=mmax {
483                        let mut v = qcen[j] * levels[src1_0 + m] + wq[j] * levels[src1_0 + m + 1];
484                        if has2 {
485                            v += coef2 * (levels[src2_0 + m] - p_over_pq * levels[src2_0 + m + 1]);
486                        }
487                        if has_cross {
488                            v += cross_coef * levels[cross_0 + m + 1];
489                        }
490                        levels[dst_0 + m] = v;
491                    }
492                }
493            }
494        }
495        // Extract level k: contract each f-component's m = 0 slice into c_ef.
496        for &tf in &tri_f[k] {
497            let lf = cart_index(tf);
498            let af = addr(tf);
499            for ae in 0..n_e {
500                c_ef[ae * n_f + af] += scale * levels[off(k, lf, ae, 0)];
501            }
502        }
503    }
504}
505
506/// HRR in contracted space (bra `A→B`, then ket `C→D`) followed by scatter into
507/// the row-major output block.
508///
509/// Flat-array HGP horizontal recurrence, replacing the earlier
510/// HashMap memoisation. The recurrence math is unchanged — same `lower_axis`
511/// choice, same `(A−B)/(C−D)` geometric shifts, same `[raised] + shift·[same]`
512/// term order — so it is **bit-identical** to the recursive version (guarded by
513/// the B0 golden snapshot). Only the storage changes: contiguous arrays indexed
514/// by `addr` / [`cart_index`] instead of hashed `(triple,…)` keys.
515///
516/// The bra recurrence `(a,b|f) = (a+1_i,b−1_i|f) + (A−B)_i (a,b−1_i|f)` (axis
517/// `i = lower_axis(b)`) is built by ascending `b`-degree with two rolling layers,
518/// independently per ket index `f` (a spectator), into the `bra` intermediate.
519/// The ket recurrence `(ab|c,d) = (ab|c+1_j,d−1_j) + (C−D)_j (ab|c,d−1_j)` is the
520/// symmetric pass over `c,d`, run per `(a,b)` output pair and scattered into `out`.
521/// Resident scratch is `O(na·nb·nf_range)` for the bra intermediate plus two small
522/// rolling layers — never the dense `(a,b,f)`/`(a,b,c,d)` key space.
523#[allow(clippy::too_many_arguments)]
524fn hrr_and_scatter<'a>(
525    la: usize,
526    lb: usize,
527    lc: usize,
528    ld: usize,
529    n_f: usize,
530    c_ef: &[f64],
531    ab: Vec3,
532    cd: Vec3,
533    out: &mut [f64],
534    bra: &mut Vec<f64>,
535    mut prev: &'a mut Vec<f64>,
536    mut cur: &'a mut Vec<f64>,
537    mut kprev: &'a mut Vec<f64>,
538    mut kcur: &'a mut Vec<f64>,
539) {
540    let (na, nb, nc, nd) = (n_cart(la), n_cart(lb), n_cart(lc), n_cart(ld));
541    let ne = la + lb; // max bra (A-side) degree present in c_ef
542    let nf = lc + ld; // max ket (C-side) degree present in c_ef
543    let n_e = n_addr(ne);
544
545    // Cartesian triples by degree, reused across the recurrences.
546    let tri: Vec<Vec<[usize; 3]>> = (0..=ne.max(nf)).map(cart_components).collect();
547
548    // Ket index range actually used: f-degrees [lc..=nf]. The bra intermediate and
549    // the ket base are keyed by the global `addr(f)` offset by `f_base`.
550    let f_base = tri_below(lc);
551    let nf_range = n_f - f_base;
552
553    // --- Bra HRR: bra[(ia·nb+ib)·nf_range + (addr(f)−f_base)] = (a_ia b_ib | f 0).
554    // Reused arena buffers; bra is fully overwritten below, the rolling
555    // layers in the region read — debug NaN-fill bra so a missed write surfaces.
556    let bra_len = na * nb * nf_range;
557    ensure_len(bra, bra_len);
558    #[cfg(debug_assertions)]
559    bra[..bra_len].fill(f64::NAN);
560    // Two rolling b-degree layers, indexed [cart_index(b)·n_e + addr(a)].
561    let layer_len = n_cart(lb) * n_e;
562    ensure_len(prev, layer_len);
563    ensure_len(cur, layer_len);
564    for f_global in f_base..n_f {
565        let jf = f_global - f_base;
566        // Base b-degree 0 (one component, within-index 0): (a,0|f) = c_ef[a][f].
567        for &a in tri[la..=ne].iter().flatten() {
568            let ae = addr(a);
569            prev[ae] = c_ef[ae * n_f + f_global];
570        }
571        for kb in 1..=lb {
572            for (ibw, &b) in tri[kb].iter().enumerate() {
573                let i = lower_axis(b);
574                let b1w = cart_index(dec(b, i));
575                // a-degrees that can still reach (la, lb): [la..=ne−kb].
576                for &a in tri[la..=(ne - kb)].iter().flatten() {
577                    let ae = addr(a);
578                    let a1e = addr(inc(a, i));
579                    cur[ibw * n_e + ae] = prev[b1w * n_e + a1e] + ab[i] * prev[b1w * n_e + ae];
580                }
581            }
582            std::mem::swap(&mut prev, &mut cur);
583        }
584        // `prev` now holds b-degree lb (or the base when lb = 0): extract.
585        for (ib, &b) in tri[lb].iter().enumerate() {
586            let ibw = cart_index(b); // == ib (tri[lb] is in cart order)
587            for (ia, &a) in tri[la].iter().enumerate() {
588                bra[(ia * nb + ib) * nf_range + jf] = prev[ibw * n_e + addr(a)];
589            }
590        }
591    }
592
593    // --- Ket HRR: per (ia,ib), build (c,d) and scatter into out.
594    let klayer_len = n_cart(ld) * n_f;
595    ensure_len(kprev, klayer_len);
596    ensure_len(kcur, klayer_len);
597    for ia in 0..na {
598        for ib in 0..nb {
599            let brarow = (ia * nb + ib) * nf_range;
600            // Base d-degree 0: (ab|c,0) = bra[ia][ib][c], c-degrees [lc..=nf].
601            for &c in tri[lc..=nf].iter().flatten() {
602                let ce = addr(c);
603                kprev[ce] = bra[brarow + (ce - f_base)];
604            }
605            for kd in 1..=ld {
606                for (idw, &d) in tri[kd].iter().enumerate() {
607                    let j = lower_axis(d);
608                    let d1w = cart_index(dec(d, j));
609                    for &c in tri[lc..=(nf - kd)].iter().flatten() {
610                        let ce = addr(c);
611                        let c1e = addr(inc(c, j));
612                        kcur[idw * n_f + ce] =
613                            kprev[d1w * n_f + c1e] + cd[j] * kprev[d1w * n_f + ce];
614                    }
615                }
616                std::mem::swap(&mut kprev, &mut kcur);
617            }
618            // `kprev` now holds d-degree ld (or the base when ld = 0): scatter into out.
619            for (ic, &c) in tri[lc].iter().enumerate() {
620                let ce = addr(c);
621                for id in 0..nd {
622                    out[((ia * nb + ib) * nc + ic) * nd + id] += kprev[id * n_f + ce];
623                }
624            }
625        }
626    }
627}
628
629// --- small helpers ---
630
631/// First nonzero axis of a triple (the lowering direction).
632#[inline]
633fn lower_axis(t: [usize; 3]) -> usize {
634    if t[0] > 0 {
635        0
636    } else if t[1] > 0 {
637        1
638    } else {
639        2
640    }
641}
642
643#[inline]
644fn dec(mut t: [usize; 3], i: usize) -> [usize; 3] {
645    t[i] -= 1;
646    t
647}
648
649#[inline]
650fn inc(mut t: [usize; 3], i: usize) -> [usize; 3] {
651    t[i] += 1;
652    t
653}
654
655#[inline]
656fn combine(a: f64, ca: Vec3, b: f64, cb: Vec3, p: f64) -> Vec3 {
657    [
658        (a * ca[0] + b * cb[0]) / p,
659        (a * ca[1] + b * cb[1]) / p,
660        (a * ca[2] + b * cb[2]) / p,
661    ]
662}
663
664#[inline]
665fn sub(u: Vec3, v: Vec3) -> Vec3 {
666    [u[0] - v[0], u[1] - v[1], u[2] - v[2]]
667}
668
669#[inline]
670fn dist2(u: Vec3, v: Vec3) -> f64 {
671    norm2(sub(u, v))
672}
673
674#[inline]
675fn norm2(u: Vec3) -> f64 {
676    u[0] * u[0] + u[1] * u[1] + u[2] * u[2]
677}
678
679#[cfg(test)]
680mod tests {
681    use super::*;
682
683    /// Single-primitive `ShellRef` helper.
684    fn s(center: Vec3, l: usize, exp: f64) -> (Vec3, usize, [f64; 1], [f64; 1]) {
685        (center, l, [exp], [1.0])
686    }
687
688    /// (ss|ss) over four unit s primitives must equal the closed form
689    /// `2π^{5/2}/(p q √(p+q)) · K_ab · K_cd · F_0(T)` — the same check the Rys
690    /// engine passes, so the two share a verified base case.
691    #[test]
692    fn ssss_matches_closed_form() {
693        let (ac, al, ae, acf) = s([0.0, 0.0, 0.0], 0, 0.8);
694        let (bc, bl, be, bcf) = s([0.0, 0.0, 0.7], 0, 1.3);
695        let (cc, cl, ce, ccf) = s([0.4, 0.0, 0.0], 0, 0.5);
696        let (dc, dl, de, dcf) = s([0.0, 0.6, 0.2], 0, 1.1);
697        let mut out = [0.0; 1];
698        coulomb_shell_into(
699            ShellRef {
700                center: ac,
701                l: al,
702                exps: &ae,
703                coeffs: &acf,
704            },
705            ShellRef {
706                center: bc,
707                l: bl,
708                exps: &be,
709                coeffs: &bcf,
710            },
711            ShellRef {
712                center: cc,
713                l: cl,
714                exps: &ce,
715                coeffs: &ccf,
716            },
717            ShellRef {
718                center: dc,
719                l: dl,
720                exps: &de,
721                coeffs: &dcf,
722            },
723            &mut out,
724        );
725
726        let p = 0.8 + 1.3;
727        let q = 0.5 + 1.1;
728        let pcen = combine(0.8, ac, 1.3, bc, p);
729        let qcen = combine(0.5, cc, 1.1, dc, q);
730        let kab = (-(0.8 * 1.3 / p) * dist2(ac, bc)).exp();
731        let kcd = (-(0.5 * 1.1 / q) * dist2(cc, dc)).exp();
732        let rho = p * q / (p + q);
733        let t = rho * dist2(pcen, qcen);
734        let mut fm = [0.0; 1];
735        boys_array(0, t, &mut fm);
736        use std::f64::consts::PI;
737        let expect = 2.0 * PI * PI * PI.sqrt() / (p * q * (p + q).sqrt()) * kab * kcd * fm[0];
738        assert!(
739            (out[0] - expect).abs() < 1e-14 * expect.abs(),
740            "ssss {} vs {}",
741            out[0],
742            expect
743        );
744    }
745
746    use crate::os::Prim;
747    use crate::rys::coulomb_into;
748
749    /// Build a single-primitive OS/HGP block for a quartet of given `(l, exp,
750    /// center)`, for cross-checking against the Rys engine.
751    #[allow(clippy::too_many_arguments)]
752    fn os_block(
753        la: usize,
754        ea: f64,
755        ca: Vec3,
756        lb: usize,
757        eb: f64,
758        cb: Vec3,
759        lc: usize,
760        recc: f64,
761        ccc: Vec3,
762        ld: usize,
763        ed: f64,
764        cdd: Vec3,
765    ) -> Vec<f64> {
766        let mut out = vec![0.0; n_cart(la) * n_cart(lb) * n_cart(lc) * n_cart(ld)];
767        let (ea1, eb1, ec1, ed1) = ([ea], [eb], [recc], [ed]);
768        let one = [1.0];
769        coulomb_shell_into(
770            ShellRef {
771                center: ca,
772                l: la,
773                exps: &ea1,
774                coeffs: &one,
775            },
776            ShellRef {
777                center: cb,
778                l: lb,
779                exps: &eb1,
780                coeffs: &one,
781            },
782            ShellRef {
783                center: ccc,
784                l: lc,
785                exps: &ec1,
786                coeffs: &one,
787            },
788            ShellRef {
789                center: cdd,
790                l: ld,
791                exps: &ed1,
792                coeffs: &one,
793            },
794            &mut out,
795        );
796        out
797    }
798
799    /// OS/HGP must reproduce the Rys engine element-for-element on a single
800    /// primitive quartet, across a sweep of angular momenta (including the
801    /// bug-prone mixed-high-L and "all four different L on four centers" cases).
802    #[test]
803    fn matches_rys_engine_primitive_sweep() {
804        let ca = [0.0, 0.0, 0.0];
805        let cb = [0.5, -0.3, 0.2];
806        let cc = [-0.4, 0.6, -0.1];
807        let cd = [0.2, 0.4, 0.8];
808        let (ea, eb, ec, ed) = (0.9, 1.3, 0.7, 1.1);
809
810        let quartets = [
811            (0usize, 0usize, 0usize, 0usize),
812            (1, 0, 0, 0),
813            (0, 0, 1, 0),
814            (1, 1, 0, 0),
815            (1, 0, 1, 0),
816            (1, 1, 1, 1),
817            (2, 0, 0, 0),
818            (2, 1, 0, 0),
819            (2, 1, 2, 1),
820            (0, 1, 2, 3), // four different L on four centers
821            (2, 2, 3, 3), // (dd|ff) mixed high-L
822            (3, 0, 0, 1),
823            // l_total ≥ 13 guards: the Boys aux index m reaches la+lb+lc+ld, which
824            // exceeds 2·MAX_L. These panicked the under-sized fm buffer (a real bug
825            // a ≤(dd|ff) sweep missed) and are kept as permanent regression guards.
826            (4, 4, 4, 1), // l_total = 13
827            (6, 6, 1, 0), // l_total = 13, two i-shells
828            (3, 3, 3, 3), // ffff: the cancellation-heavy mixed case
829        ];
830        for (la, lb, lc, ld) in quartets {
831            let os = os_block(la, ea, ca, lb, eb, cb, lc, ec, cc, ld, ed, cd);
832            let mut rys = vec![0.0; os.len()];
833            coulomb_into(
834                Prim::new(ea, ca, la),
835                Prim::new(eb, cb, lb),
836                Prim::new(ec, cc, lc),
837                Prim::new(ed, cd, ld),
838                1.0,
839                &mut rys,
840            );
841            assert_cross_engine_close(&os, &rys, &format!("({la}{lb}|{lc}{ld})"));
842        }
843    }
844
845    /// Assert two ERI blocks (here OS/HGP vs Rys, two independent f64 recurrences)
846    /// agree under `|o − r| ≤ atol + rtol·|r|` with `atol = 1e-11`, `rtol = 1e-10`.
847    /// The atol floor absorbs benign near-cancellation on structurally tiny
848    /// components (where the *relative* OS/Rys difference reaches ~1e-8 at high L
849    /// even though both engines are correct — their dominant elements agree to
850    /// ~1e-12); the rtol catches any real divergence.
851    fn assert_cross_engine_close(os: &[f64], rys: &[f64], tag: &str) {
852        const ATOL: f64 = 1e-11;
853        const RTOL: f64 = 1e-10;
854        for (o, r) in os.iter().zip(rys.iter()) {
855            assert!(
856                (o - r).abs() <= ATOL + RTOL * r.abs(),
857                "{tag} OS vs Rys mismatch: {o} vs {r} (Δ={:e})",
858                (o - r).abs()
859            );
860        }
861    }
862
863    /// HGP early contraction (VRR per primitive, HRR once in contracted space)
864    /// must equal the per-primitive Rys sum for a genuinely **contracted**
865    /// quartet.
866    ///
867    /// Note (`DESIGN_NOTES.md` D6): doing the HRR after contraction is a *performance*
868    /// choice, **not** a correctness fork. The HRR operator is linear with
869    /// exponent-independent geometric coefficients (`A−B`, `C−D`), so
870    /// `HRR(Σ_p w_p·[e0|f0]_p) = Σ_p w_p·HRR([e0|f0]_p)` identically — per-primitive
871    /// and post-contraction HRR give the *same* answer. This remains a valuable
872    /// end-to-end check (a broken VRR cross-term, a mis-applied contraction
873    /// coefficient, or a wrong HRR sign would all fail it), but it does not
874    /// distinguish HRR *ordering*, because the two orders are algebraically equal.
875    #[test]
876    fn contracted_quartet_matches_rys_sum() {
877        // Two p shells and a d shell, each with multiple primitives, on distinct
878        // centres (so HRR shift vectors A−B, C−D are all non-trivial).
879        let ca = [0.0, 0.0, 0.0];
880        let cb = [0.6, -0.2, 0.1];
881        let cc = [-0.3, 0.5, -0.2];
882        let cd = [0.2, 0.3, 0.7];
883        let (la, lb, lc, ld) = (1usize, 1usize, 2usize, 0usize);
884        let ax = [1.4, 0.45];
885        let acf = [0.6, 0.5];
886        let bx = [0.9, 0.3];
887        let bcf = [0.55, 0.5];
888        let cx = [1.1, 0.4];
889        let ccf = [0.7, 0.4];
890        let dx = [0.8];
891        let dcf = [1.0];
892
893        let mut os = vec![0.0; n_cart(la) * n_cart(lb) * n_cart(lc) * n_cart(ld)];
894        coulomb_shell_into(
895            ShellRef {
896                center: ca,
897                l: la,
898                exps: &ax,
899                coeffs: &acf,
900            },
901            ShellRef {
902                center: cb,
903                l: lb,
904                exps: &bx,
905                coeffs: &bcf,
906            },
907            ShellRef {
908                center: cc,
909                l: lc,
910                exps: &cx,
911                coeffs: &ccf,
912            },
913            ShellRef {
914                center: cd,
915                l: ld,
916                exps: &dx,
917                coeffs: &dcf,
918            },
919            &mut os,
920        );
921
922        // Reference: sum the Rys primitive engine over the quartet with the same
923        // effective coefficients.
924        let mut rys = vec![0.0; os.len()];
925        for (&ea, &wa) in ax.iter().zip(acf.iter()) {
926            for (&eb, &wb) in bx.iter().zip(bcf.iter()) {
927                for (&ec, &wc) in cx.iter().zip(ccf.iter()) {
928                    for (&ed, &wd) in dx.iter().zip(dcf.iter()) {
929                        coulomb_into(
930                            Prim::new(ea, ca, la),
931                            Prim::new(eb, cb, lb),
932                            Prim::new(ec, cc, lc),
933                            Prim::new(ed, cd, ld),
934                            wa * wb * wc * wd,
935                            &mut rys,
936                        );
937                    }
938                }
939            }
940        }
941
942        for (o, r) in os.iter().zip(rys.iter()) {
943            assert!(
944                (o - r).abs() <= 1e-10 * r.abs().max(1e-12),
945                "contracted OS vs Rys mismatch: {o} vs {r}"
946            );
947        }
948    }
949
950    /// The triple-address map must be a bijection onto `0..n_addr(L)`.
951    #[test]
952    fn addr_is_bijective() {
953        for lmax in 0..=6 {
954            let mut seen = vec![false; n_addr(lmax)];
955            for n in 0..=lmax {
956                for t in cart_components(n) {
957                    let a = addr(t);
958                    assert!(a < n_addr(lmax), "addr {a} out of range for lmax {lmax}");
959                    assert!(!seen[a], "addr collision at {t:?}");
960                    seen[a] = true;
961                }
962            }
963            assert!(
964                seen.iter().all(|&x| x),
965                "addr not surjective at lmax {lmax}"
966            );
967        }
968    }
969}