integral_core/os_eri.rs
1//! Two-electron repulsion integrals (ERIs) by the Obara–Saika / Head-Gordon–Pople
2//! (OS/HGP) recurrence — the second ERI engine (see `ARCHITECTURE.md`, L1).
3//!
4//! A **vertical recurrence (VRR)** builds the
5//! intermediate `[e0|f0]^(m)` classes from `[00|00]^(m)` per primitive quartet,
6//! the primitives are **contracted** into AO space, and a **horizontal recurrence
7//! (HRR)** then shifts angular momentum `A→B` and `C→D` *in the contracted space*.
8//! Doing the HRR after contraction is the HGP "early contraction" trick: the
9//! geometry-only HRR runs **once per shell quartet** instead of once per
10//! primitive quartet, which is the win at high contraction degree.
11//!
12//! It is the engineering counterpart of the [`crate::rys`] engine: same Coulomb
13//! kernel, same row-major `(a,b,c,d)` block layout, validated element-for-element
14//! against it (and against an independent McMurchie–Davidson path).
15//!
16//! ## Method
17//!
18//! For a primitive quartet with exponents `α,β` on the bra centres `A,B` and
19//! `γ,δ` on the ket centres `C,D`, with `p=α+β`, `q=γ+δ`, `P,Q` the Gaussian
20//! product centres, `W=(pP+qQ)/(p+q)`, `ρ=pq/(p+q)`, and `T=ρ|P−Q|²`:
21//!
22//! ```text
23//! [00|00]^(m) = 2π^{5/2}/(p q √(p+q)) · K_ab · K_cd · F_m(T)
24//! ```
25//!
26//! The VRR raises angular momentum on `A` (the bra, index `e`) and `C` (the ket,
27//! index `f`) using the standard OS ERI relations (Obara–Saika 1986; HGP 1988):
28//!
29//! ```text
30//! [e+1_i,0|f0]^(m) = (P−A)_i[e0|f0]^(m) + (W−P)_i[e0|f0]^(m+1)
31//! + e_i/2p ( [e−1_i,0|f0]^(m) − (q/(p+q))[e−1_i,0|f0]^(m+1) )
32//! + f_i/2(p+q) [e0|f−1_i,0]^(m+1)
33//! [e0|f+1_i,0]^(m) = (Q−C)_i[e0|f0]^(m) + (W−Q)_i[e0|f0]^(m+1)
34//! + f_i/2q ( [e0|f−1_i,0]^(m) − (p/(p+q))[e0|f−1_i,0]^(m+1) )
35//! + e_i/2(p+q) [e−1_i,0|f0]^(m+1)
36//! ```
37//!
38//! After contracting `[e0|f0]^(0)` over the primitive quartet, the HRR builds the
39//! Cartesian shell block:
40//!
41//! ```text
42//! (a,b+1_i | f0) = (a+1_i,b | f0) + (A−B)_i (a,b | f0) [bra, A→B]
43//! (ab | c,d+1_i) = (ab | c+1_i,d) + (C−D)_i (ab | c,d) [ket, C→D]
44//! ```
45//!
46//! ## Buffers
47//!
48//! The VRR table `[e0|f0]^(m)` is **3D and `W`-coupled** (the recurrence mixes
49//! Cartesian axes), so unlike the axis-separable 1D tables of the one-electron and
50//! Rys engines it cannot be a small fixed stack array — a `MAX_L` stack buffer
51//! would be tens of MB, and even a heap copy of the *full* table is ~41 MB at
52//! `(ii|ii)`. The current engine removes that:
53//!
54//! - **m-marching VRR** (`vrr_primitive`): the VRR never materialises the full
55//! `e×f×m` table. It marches over the ket `f`-degree keeping only a rolling window
56//! of **3 consecutive `f`-degree levels**, each **triangle-packed** in the Boys
57//! index `m`. Resident VRR footprint is `3·max_k[n_cart(k)·slab_k]` (≈ 4.3 MB at
58//! `(ii|ii)`, vs the old 41 MB single table), plus the `n_e·n_f` contracted table.
59//! - **flat-array HRR** (`hrr_and_scatter`): contiguous arrays indexed by `addr`
60//! / [`cart_index`] with two rolling degree layers, replacing the former HashMap
61//! memoisation.
62//! - **reusable arena** ([`EriScratch`]): all buffers are allocated once and reused
63//! across quartets (thread-local by default), not re-allocated per quartet.
64//!
65//! All of this stays in safe Rust (`#![forbid(unsafe_code)]`) and reproduces the
66//! former full-table engine's values, cross-checked against the Rys engine and an
67//! independent McMurchie–Davidson path (`tests/eri_cross_algorithm.rs`).
68
69use std::cell::RefCell;
70
71use integral_math::am::{cart_components, cart_index, n_cart};
72use integral_math::boys::boys_array;
73
74use crate::os::{Vec3, MAX_L};
75
76/// A contracted Cartesian shell as seen by the HGP engine: its centre, angular
77/// momentum, and primitive `(exponent, effective-coefficient)` data. The
78/// coefficients are the driver's *effective* coefficients `d_i · N(α_i, l)` — the
79/// engine itself works on un-normalised monomials and only multiplies the four
80/// coefficients into the contracted accumulator.
81#[derive(Debug, Clone, Copy)]
82pub struct ShellRef<'a> {
83 /// Shell centre (bohr).
84 pub center: Vec3,
85 /// Angular momentum.
86 pub l: usize,
87 /// Primitive exponents.
88 pub exps: &'a [f64],
89 /// Effective contraction coefficients, aligned with `exps`.
90 pub coeffs: &'a [f64],
91}
92
93/// Number of Cartesian triples of degree `0..=lmax`: `(L+1)(L+2)(L+3)/6`.
94#[inline]
95fn n_addr(lmax: usize) -> usize {
96 (lmax + 1) * (lmax + 2) * (lmax + 3) / 6
97}
98
99/// Number of Cartesian triples of degree **strictly less than** `d`:
100/// `d(d+1)(d+2)/6`. This is the cumulative base of degree `d` in `addr`
101/// (`addr(t) = tri_below(deg) + cart_index(t)`) and `n_addr(L) = tri_below(L+1)`.
102#[inline]
103fn tri_below(d: usize) -> usize {
104 d * (d + 1) * (d + 2) / 6
105}
106
107/// Global address of a Cartesian triple within the cumulative `0..=` ordering,
108/// consistent with [`integral_math::am`]'s per-degree canonical order. Bijective onto
109/// `0..n_addr(deg)`.
110#[inline]
111fn addr(t: [usize; 3]) -> usize {
112 let n = t[0] + t[1] + t[2];
113 let base = n * (n + 1) * (n + 2) / 6;
114 // within-degree index: (n-lx)(n-lx+1)/2 + lz
115 let within = (n - t[0]) * (n - t[0] + 1) / 2 + t[2];
116 base + within
117}
118
119/// Reusable scratch arena for the OS/HGP ERI engine.
120///
121/// Holds the engine's working buffers so they are allocated **once and reused**
122/// across shell quartets, instead of a fresh heap allocation per quartet inside the
123/// O(n⁴) shell loop. All buffers are plain `Vec<f64>` grown on demand in safe Rust
124/// (`#![forbid(unsafe_code)]` holds); there is no shared mutable state, so each
125/// thread uses its own instance — the thread-local default behind
126/// [`coulomb_shell_into`], or one passed explicitly to
127/// [`coulomb_shell_into_scratch`] (the `&mut` makes cross-thread sharing a compile
128/// error).
129///
130/// **Memory-correctness.** `c_ef` is an accumulator and is **zeroed per quartet**.
131/// The VRR `levels` and HRR `bra`/layer buffers are fully overwritten in the region
132/// they are later read, so they need no functional zeroing — but in debug builds
133/// `levels` and `bra` are NaN-filled per quartet, so any accidental out-of-range
134/// read (e.g. outside the VRR `m`-triangle) poisons the output and trips the tests
135/// rather than silently reading a stale value from a previous quartet. Results are
136/// therefore independent of evaluation order and of arena reuse (asserted by
137/// `tests/arena.rs`).
138#[derive(Debug, Default, Clone)]
139pub struct EriScratch {
140 /// VRR rolling window: 3 triangle-packed f-degree levels (slot = `k % 3`).
141 levels: Vec<f64>,
142 /// Contracted `[e0|f0]^(0)` accumulator, shape `[n_e·n_f]`.
143 c_ef: Vec<f64>,
144 /// HRR bra intermediate `(a b | f 0)`, shape `[na·nb·nf_range]`.
145 bra: Vec<f64>,
146 /// HRR bra rolling `b`-degree layers.
147 bra_prev: Vec<f64>,
148 bra_cur: Vec<f64>,
149 /// HRR ket rolling `d`-degree layers.
150 ket_prev: Vec<f64>,
151 ket_cur: Vec<f64>,
152 /// Precomputed bra/ket primitive-pair data (combined exponent, product
153 /// centre, `K` prefactor, coeff product) — built once per shell quartet so
154 /// the per-pair `exp`/centre work is not repeated inside the deep primitive
155 /// loop. Reused across quartets like the other buffers.
156 bra_pairs: Vec<PrimPair>,
157 ket_pairs: Vec<PrimPair>,
158 /// Shared Cartesian-triple table `tri_all[d] = cart_components(d)` for every
159 /// degree the engine needs (`0..=2·MAX_L`). Built once and sliced for the VRR
160 /// `e`/`f` lists and the HRR triples, instead of re-`collect`ing per quartet.
161 tri_all: Vec<Vec<[usize; 3]>>,
162 /// Flat VRR offset table: `eoff[k·n_e + ae]` is `e`'s packed offset in level
163 /// `k`. Rebuilt per quartet into this reused buffer (no per-quartet `Vec<Vec>`
164 /// allocation); `slab[k]` is one `f`-component's packed size for level `k`.
165 eoff: Vec<usize>,
166 slab: Vec<usize>,
167}
168
169/// One primitive pair's reusable geometry/prefactor, shared by every primitive
170/// quartet that uses it: the combined exponent `zeta = α+β`, the Gaussian product
171/// centre, the overlap prefactor `K = exp(−αβ/ζ·|A−B|²)`, and the two contraction
172/// coefficients (`c1` outer, `c2` inner).
173///
174/// The coefficients are kept **un-multiplied** so the contracted `scale` can be
175/// formed in exactly the former evaluation order `((c_a·c_b)·c_c)·c_d` — making the
176/// engine's output bit-identical to the pre-precomputation code (the per-element
177/// and 8-fold-symmetry tests are floorless-relative and sensitive to the last ULP
178/// on near-cancellation elements).
179#[derive(Debug, Clone, Copy, Default)]
180struct PrimPair {
181 zeta: f64,
182 center: Vec3,
183 kappa: f64,
184 c1: f64,
185 c2: f64,
186 /// `1/(2ζ)` — the VRR `inv_2p`(bra)/`inv_2q`(ket) coefficient. Pure per-pair.
187 inv_2zeta: f64,
188 /// `centre − s1.centre` — the VRR `P−A`(bra)/`Q−C`(ket) shift. Pure per-pair.
189 r1: Vec3,
190}
191
192/// A primitive pair whose weighted overlap prefactor `K·|c₁·c₂|` falls below
193/// this bound contributes at most ~`1e-24` (absolutely) to any tensor element —
194/// roughly twelve orders below the engine's `1e-12` accuracy bar — so it is
195/// dropped from the pair list before the quartet loop. Every `[e0|f0]^(m)` term
196/// the pair could produce is bounded by `K·|c₁·c₂|` times the other pair's
197/// weight (≤ its own bound), the `2π^{5/2}/(pq√(p+q))` prefactor, `F_m ≤ 1`,
198/// and the VRR's geometric raise factors; the product stays ≤ ~1e-24 for any
199/// chemically sensible exponents/geometry. The deeply contracted cross-centre
200/// pairs this removes have `K` underflowing to `0.0` outright in typical bases
201/// (tight×tight pairs at bonding distances reach `exp(-10⁴)`), where skipping
202/// is *exactly* lossless.
203const PAIR_NEGLIGIBLE: f64 = 1e-32;
204
205/// Fill `out` with the [`PrimPair`] data for every primitive combination of two
206/// shells (outer `s1`, inner `s2`), matching the former in-loop computation
207/// element-for-element. The inter-centre distance is hoisted out of the pair loop.
208/// `inv_2zeta` and `r1` are the pair-only VRR coefficients (`1/2ζ` and the `P−A` /
209/// `Q−C` shift) that the kernel formerly recomputed on every primitive quartet.
210/// Pairs below [`PAIR_NEGLIGIBLE`] are omitted; the surviving pairs keep their
211/// order, so the contraction accumulates the remaining terms in the same
212/// sequence as before.
213fn build_pairs(out: &mut Vec<PrimPair>, s1: ShellRef<'_>, s2: ShellRef<'_>) {
214 out.clear();
215 out.reserve(s1.exps.len() * s2.exps.len());
216 let d2 = dist2(s1.center, s2.center);
217 for (&e1, &c1) in s1.exps.iter().zip(s1.coeffs.iter()) {
218 for (&e2, &c2) in s2.exps.iter().zip(s2.coeffs.iter()) {
219 let zeta = e1 + e2;
220 let kappa = (-(e1 * e2 / zeta) * d2).exp();
221 if kappa * (c1 * c2).abs() < PAIR_NEGLIGIBLE {
222 continue;
223 }
224 let center = combine(e1, s1.center, e2, s2.center, zeta);
225 out.push(PrimPair {
226 zeta,
227 center,
228 kappa,
229 c1,
230 c2,
231 inv_2zeta: 0.5 / zeta,
232 r1: sub(center, s1.center),
233 });
234 }
235 }
236}
237
238impl EriScratch {
239 /// A fresh, empty arena; it grows to fit on first use.
240 #[must_use]
241 pub fn new() -> Self {
242 Self::default()
243 }
244
245 /// Total `f64` elements currently held across all buffers — the resident
246 /// working set, for memory reporting/tests.
247 #[must_use]
248 pub fn resident_f64(&self) -> usize {
249 self.levels.len()
250 + self.c_ef.len()
251 + self.bra.len()
252 + self.bra_prev.len()
253 + self.bra_cur.len()
254 + self.ket_prev.len()
255 + self.ket_cur.len()
256 }
257
258 /// Largest single buffer (`f64` elements). Demonstrates the former ~41 MB
259 /// monolithic VRR `[e0|f0]^(m)` table is no longer resident: the m-marching
260 /// window keeps only `3·max_k[n_cart(k)·slab_k]`.
261 #[must_use]
262 pub fn largest_buffer_f64(&self) -> usize {
263 [
264 self.levels.len(),
265 self.c_ef.len(),
266 self.bra.len(),
267 self.bra_prev.len(),
268 self.bra_cur.len(),
269 self.ket_prev.len(),
270 self.ket_cur.len(),
271 ]
272 .into_iter()
273 .max()
274 .unwrap_or(0)
275 }
276}
277
278thread_local! {
279 /// Per-thread default arena backing [`coulomb_shell_into`].
280 static ERI_SCRATCH: RefCell<EriScratch> = RefCell::new(EriScratch::new());
281}
282
283/// Grow `v` to at least `n` elements (reusing existing capacity); never shrinks.
284#[inline]
285fn ensure_len(v: &mut Vec<f64>, n: usize) {
286 if v.len() < n {
287 v.resize(n, 0.0);
288 }
289}
290
291/// `ensure_len` for the `usize` offset/slab buffers.
292#[inline]
293fn ensure_usize(v: &mut Vec<usize>, n: usize) {
294 if v.len() < n {
295 v.resize(n, 0);
296 }
297}
298
299/// Accumulate the contracted Coulomb block `(ab|cd)` for four shells into the
300/// row-major `out` block (shape `[n_cart(la)·n_cart(lb)·n_cart(lc)·n_cart(ld)]`,
301/// the same `(a,b,c,d)` layout as [`crate::rys::coulomb_into`]).
302///
303/// The engine owns the primitive-quartet loop (the HGP contraction happens between
304/// VRR and HRR), so unlike the Rys engine the driver calls this **once per shell
305/// quartet** with all primitives. This uses a **thread-local** [`EriScratch`]; for
306/// explicit control (e.g. one arena per worker thread) call
307/// [`coulomb_shell_into_scratch`].
308///
309/// # Panics
310/// Panics (in debug builds) if any `l > MAX_L` or `out` is too short.
311pub fn coulomb_shell_into(
312 a: ShellRef<'_>,
313 b: ShellRef<'_>,
314 c: ShellRef<'_>,
315 d: ShellRef<'_>,
316 out: &mut [f64],
317) {
318 ERI_SCRATCH.with(|s| coulomb_shell_into_scratch(&mut s.borrow_mut(), a, b, c, d, out));
319}
320
321/// Like [`coulomb_shell_into`] but evaluates into the caller-provided
322/// [`EriScratch`], reused across quartets to avoid per-quartet heap allocation. Use
323/// **one instance per thread** (sharing a `&mut EriScratch` across threads is a
324/// compile error); the result is bit-identical regardless of which arena is used or
325/// what it last held.
326///
327/// # Panics
328/// Panics (in debug builds) if any `l > MAX_L` or `out` is too short.
329pub fn coulomb_shell_into_scratch(
330 scratch: &mut EriScratch,
331 a: ShellRef<'_>,
332 b: ShellRef<'_>,
333 c: ShellRef<'_>,
334 d: ShellRef<'_>,
335 out: &mut [f64],
336) {
337 let (la, lb, lc, ld) = (a.l, b.l, c.l, d.l);
338 debug_assert!(
339 la <= MAX_L && lb <= MAX_L && lc <= MAX_L && ld <= MAX_L,
340 "angular momentum exceeds MAX_L"
341 );
342 let (na, nb, nc, nd) = (n_cart(la), n_cart(lb), n_cart(lc), n_cart(ld));
343 debug_assert!(out.len() >= na * nb * nc * nd, "ERI output block too short");
344
345 let ne = la + lb; // max bra (A-side) degree built by VRR
346 let nf = lc + ld; // max ket (C-side) degree built by VRR
347 let l_total = ne + nf;
348 let n_e = n_addr(ne);
349 let n_f = n_addr(nf);
350
351 let EriScratch {
352 levels,
353 c_ef,
354 bra,
355 bra_prev,
356 bra_cur,
357 ket_prev,
358 ket_cur,
359 bra_pairs,
360 ket_pairs,
361 tri_all,
362 eoff,
363 slab,
364 } = scratch;
365
366 // Fast path: (ss|ss). With no angular momentum to raise, the whole quartet is a
367 // single contracted [00|00]^(0). Skip the VRR/HRR machinery (the dead raise
368 // coefficients, the `levels` buffer round-trip, the per-primitive call) and sum
369 // straight into a register. Bit-identical to the general path: same `pref`/`t`
370 // expressions, same `((c_a·c_b)·c_c)·c_d` scale, same `scale·(pref·F_0)` term,
371 // same bra-outer/ket-inner order — just without the scaffolding.
372 if l_total == 0 {
373 build_pairs(bra_pairs, a, b);
374 build_pairs(ket_pairs, c, d);
375 let two_pi_2_5 =
376 2.0 * std::f64::consts::PI * std::f64::consts::PI * std::f64::consts::PI.sqrt();
377 let mut acc = 0.0;
378 let mut f0 = [0.0f64; 1];
379 for bra in bra_pairs.iter() {
380 let bc = bra.c1 * bra.c2;
381 for ket in ket_pairs.iter() {
382 let (p, q) = (bra.zeta, ket.zeta);
383 let pq = p + q;
384 let t = (p * q / pq) * dist2(bra.center, ket.center);
385 let pref = two_pi_2_5 / (p * q * pq.sqrt()) * bra.kappa * ket.kappa;
386 boys_array(0, t, &mut f0);
387 acc += (bc * ket.c1) * ket.c2 * (pref * f0[0]);
388 }
389 }
390 out[0] += acc;
391 return;
392 }
393
394 // Contracted [e0|f0]^(0): zeroed per quartet (it is a `+=` accumulator).
395 ensure_len(c_ef, n_e * n_f);
396 c_ef[..n_e * n_f].fill(0.0);
397
398 // Shared Cartesian-triple table, built once (degrees `0..=2·MAX_L` cover every
399 // `ne`/`nf`/`max(ne,nf)` the engine reaches) and sliced below — no per-quartet
400 // `cart_components` allocation.
401 if tri_all.len() <= 2 * MAX_L {
402 *tri_all = (0..=2 * MAX_L).map(cart_components).collect();
403 }
404
405 // Precompute the bra/ket primitive-pair data once (the `exp` prefactor and
406 // product centre depend only on the pair, not the quartet), then loop over the
407 // pair lists. The previous code recomputed the ket pair's `exp`/centre on every
408 // bra primitive — `na·nb` times too many. Bra-outer / ket-inner preserves the
409 // exact accumulation order into `c_ef`.
410 build_pairs(bra_pairs, a, b);
411 build_pairs(ket_pairs, c, d);
412
413 // VRR. The small classes (`ne, nf ≤ 3` — every quartet shape of an
414 // spd basis except the d,d-bra/ket pairs, ~96% of a cc-pVDZ-style build)
415 // dispatch to the monomorphized kernels, where the whole structural walk
416 // (triples, addresses, branches, m-ranges) is resolved at compile time;
417 // everything else runs the general m-marching `vrr_primitive`.
418 // `(0,0)` already returned through the (ss|ss) fast path above.
419 match (ne, nf) {
420 (0, 1) => contract_class::<0, 1>(bra_pairs, ket_pairs, c_ef),
421 (1, 0) => contract_class::<1, 0>(bra_pairs, ket_pairs, c_ef),
422 (1, 1) => contract_class::<1, 1>(bra_pairs, ket_pairs, c_ef),
423 (0, 2) => contract_class::<0, 2>(bra_pairs, ket_pairs, c_ef),
424 (2, 0) => contract_class::<2, 0>(bra_pairs, ket_pairs, c_ef),
425 (1, 2) => contract_class::<1, 2>(bra_pairs, ket_pairs, c_ef),
426 (2, 1) => contract_class::<2, 1>(bra_pairs, ket_pairs, c_ef),
427 (2, 2) => contract_class::<2, 2>(bra_pairs, ket_pairs, c_ef),
428 (0, 3) => contract_class::<0, 3>(bra_pairs, ket_pairs, c_ef),
429 (3, 0) => contract_class::<3, 0>(bra_pairs, ket_pairs, c_ef),
430 (1, 3) => contract_class::<1, 3>(bra_pairs, ket_pairs, c_ef),
431 (3, 1) => contract_class::<3, 1>(bra_pairs, ket_pairs, c_ef),
432 (2, 3) => contract_class::<2, 3>(bra_pairs, ket_pairs, c_ef),
433 (3, 2) => contract_class::<3, 2>(bra_pairs, ket_pairs, c_ef),
434 (3, 3) => contract_class::<3, 3>(bra_pairs, ket_pairs, c_ef),
435 _ => {
436 // m-marching VRR scratch. Instead of the full `n_e·n_f·nm`
437 // table (~41 MB at `(ii|ii)`), keep only a rolling window of **3 consecutive
438 // f-degree levels**, each **triangle-packed** in the Boys index `m`: for an
439 // `f`-degree `k`, an `e` of degree `de` stores only `m ∈ 0..=(l_total−de−k)`.
440 // Level `k` lives in slot `k % 3` of `levels`; `slab[k]` is one `f`-component's
441 // packed size and `eoff[k·n_e + ae]` is `e`'s offset within it (flat, reused).
442 ensure_usize(slab, nf + 1);
443 ensure_usize(eoff, (nf + 1) * n_e);
444 // `k` indexes both `slab` and the flat `eoff` block base `k·n_e`, so a range
445 // loop is the clear form here.
446 #[allow(clippy::needless_range_loop)]
447 for k in 0..=nf {
448 let base = k * n_e;
449 let mut run = 0usize;
450 let mut ae = 0usize;
451 for de in 0..=ne {
452 let mlen = l_total - de - k + 1; // ≥ 1: de + k ≤ ne + nf = l_total
453 for _ in 0..n_cart(de) {
454 eoff[base + ae] = run;
455 ae += 1;
456 run += mlen;
457 }
458 }
459 slab[k] = run;
460 }
461 let maxlevel = (0..=nf).map(|k| n_cart(k) * slab[k]).max().unwrap_or(1);
462 ensure_len(levels, 3 * maxlevel);
463 // Debug guard: poison the reused VRR window so any out-of-triangle / stale read
464 // surfaces as a NaN in the output (caught by the golden + cross-engine tests),
465 // instead of silently using a previous quartet's value.
466 #[cfg(debug_assertions)]
467 levels[..3 * maxlevel].fill(f64::NAN);
468
469 // Reborrow the offset/triple buffers as shared slices for the read-only kernel.
470 let (eoff_s, slab_s, tri_s) = (&eoff[..], &slab[..], &tri_all[..]);
471 for bra in bra_pairs.iter() {
472 let bra_coef = bra.c1 * bra.c2; // = c_a·c_b, hoisted out of the ket loop
473 for ket in ket_pairs.iter() {
474 // ((c_a·c_b)·c_c)·c_d — the former left-to-right product, bit for bit.
475 let scale = (bra_coef * ket.c1) * ket.c2;
476 vrr_primitive(
477 bra.zeta,
478 ket.zeta,
479 bra.center,
480 ket.center,
481 bra.kappa,
482 ket.kappa,
483 bra.r1,
484 ket.r1,
485 bra.inv_2zeta,
486 ket.inv_2zeta,
487 ne,
488 nf,
489 l_total,
490 n_e,
491 n_f,
492 maxlevel,
493 eoff_s,
494 slab_s,
495 tri_s,
496 levels,
497 scale,
498 c_ef,
499 );
500 }
501 }
502 }
503 }
504
505 // HRR in contracted space, then scatter the block.
506 let ab = sub(a.center, b.center); // A − B
507 let cd = sub(c.center, d.center); // C − D
508 hrr_and_scatter(
509 la,
510 lb,
511 lc,
512 ld,
513 n_f,
514 c_ef,
515 ab,
516 cd,
517 out,
518 bra,
519 bra_prev,
520 bra_cur,
521 ket_prev,
522 ket_cur,
523 &tri_all[..],
524 );
525}
526
527/// Cartesian triples of degree 1, 2 and 3 in [`cart_components`] order, as `const`
528/// tables so the monomorphized small-class VRR kernels fully unroll over them
529/// (the structural walk — axes, addresses, `has2`/cross branches — then resolves
530/// at compile time instead of being re-derived on every primitive quartet).
531const TRI1: [[usize; 3]; 3] = [[1, 0, 0], [0, 1, 0], [0, 0, 1]];
532const TRI2: [[usize; 3]; 6] = [
533 [2, 0, 0],
534 [1, 1, 0],
535 [1, 0, 1],
536 [0, 2, 0],
537 [0, 1, 1],
538 [0, 0, 2],
539];
540const TRI3: [[usize; 3]; 10] = [
541 [3, 0, 0],
542 [2, 1, 0],
543 [2, 0, 1],
544 [1, 2, 0],
545 [1, 1, 1],
546 [1, 0, 2],
547 [0, 3, 0],
548 [0, 2, 1],
549 [0, 1, 2],
550 [0, 0, 3],
551];
552/// The e-components `0..n_addr(3)` in [`addr`] order with their degree —
553/// the bra index set of the small-class kernels (`NE ≤ 3`).
554const E_COMP: [([usize; 3], usize); 20] = [
555 ([0, 0, 0], 0),
556 ([1, 0, 0], 1),
557 ([0, 1, 0], 1),
558 ([0, 0, 1], 1),
559 ([2, 0, 0], 2),
560 ([1, 1, 0], 2),
561 ([1, 0, 1], 2),
562 ([0, 2, 0], 2),
563 ([0, 1, 1], 2),
564 ([0, 0, 2], 2),
565 ([3, 0, 0], 3),
566 ([2, 1, 0], 3),
567 ([2, 0, 1], 3),
568 ([1, 2, 0], 3),
569 ([1, 1, 1], 3),
570 ([1, 0, 2], 3),
571 ([0, 3, 0], 3),
572 ([0, 2, 1], 3),
573 ([0, 1, 2], 3),
574 ([0, 0, 3], 3),
575];
576
577/// Contract a whole shell quartet of VRR shape `(NE, NF)` (with `NE, NF ≤ 2`)
578/// into `c_ef` using the monomorphized small-class kernel [`vrr_small`].
579///
580/// Same bra-outer / ket-inner primitive loop and the same left-to-right
581/// `((c_a·c_b)·c_c)·c_d` scale as the general path, so the accumulation into
582/// `c_ef` is bit-identical. The VRR working tables live here (overwritten per
583/// primitive quartet in the region read, like the general path's `levels`
584/// arena) so their one-time zero-init is per shell quartet, not per primitive.
585fn contract_class<const NE: usize, const NF: usize>(
586 bra_pairs: &[PrimPair],
587 ket_pairs: &[PrimPair],
588 c_ef: &mut [f64],
589) {
590 // Worst-case (NE,NF)=(3,3) sizes: e-table 20 components × m ∈ 0..=6;
591 // f-degree-1 level 20×3 × m ∈ 0..=5; f-degree-2 level 20×6 × m ∈ 0..=4;
592 // f-degree-3 level 20×10 × m ∈ 0..=3.
593 let mut ea = [0.0f64; 140];
594 let mut eb1 = [0.0f64; 360];
595 let mut eb2 = [0.0f64; 600];
596 let mut eb3 = [0.0f64; 800];
597 for bra in bra_pairs {
598 let bra_coef = bra.c1 * bra.c2;
599 for ket in ket_pairs {
600 let scale = (bra_coef * ket.c1) * ket.c2;
601 vrr_small::<NE, NF>(bra, ket, scale, &mut ea, &mut eb1, &mut eb2, &mut eb3, c_ef);
602 }
603 }
604}
605
606/// One primitive quartet of the OS VRR for the small classes `NE, NF ≤ 3`,
607/// monomorphized per `(NE, NF)` — a per-L-class specialized kernel expressed
608/// through const generics: every loop bound, Cartesian triple, address, stride,
609/// and `has2`/cross branch below is a compile-time constant in each
610/// instantiation, so the emitted code is a straight unrolled FMA chain.
611///
612/// **Bit-identical to [`vrr_primitive`] by construction**: the same recurrence
613/// expressions with the same term order and association — base
614/// `pref·F_m`, raise `pa_i·s1[m] + wp_i·s1[m+1]`, then `+= coef2·(s2[m] −
615/// (q/pq)·s2[m+1])`, then `+= cross·s3[m+1]` — the same m-ranges
616/// (`m ≤ l_total − de − k`), and the same `c_ef[ae·n_f + af] += scale·v`
617/// extraction. Only the storage differs (fixed per-degree tables instead of the
618/// triangle-packed rolling window), which does not touch the arithmetic.
619/// Guarded by the full-tensor XOR fingerprint and the golden/cross-engine tests.
620///
621/// Per-degree row strides: the m-rows of `ea` hold `m ∈ 0..=lt` (`sa = lt+1`),
622/// of `eb1` `m ∈ 0..=lt−1` (`s1 = lt`), of `eb2` `m ∈ 0..=lt−2`, of `eb3`
623/// `m ∈ 0..=lt−3` — each level's longest row (its degree-0 e-component).
624#[inline(always)]
625#[allow(clippy::too_many_arguments)]
626fn vrr_small<const NE: usize, const NF: usize>(
627 bra: &PrimPair,
628 ket: &PrimPair,
629 scale: f64,
630 ea: &mut [f64; 140],
631 eb1: &mut [f64; 360],
632 eb2: &mut [f64; 600],
633 eb3: &mut [f64; 800],
634 c_ef: &mut [f64],
635) {
636 let lt = NE + NF;
637 let n_e = n_addr(NE);
638 let n_f = n_addr(NF);
639 // Row strides (compile-time constants per instantiation).
640 let sa = lt + 1;
641 let s1 = lt;
642 let s2 = lt.saturating_sub(1);
643 let s3 = lt.saturating_sub(2);
644 let (p, q) = (bra.zeta, ket.zeta);
645 let (pc, qc) = (bra.center, ket.center);
646 let pa = bra.r1; // P − A
647 let qcen = ket.r1; // Q − C
648 let (inv_2p, inv_2q) = (bra.inv_2zeta, ket.inv_2zeta);
649
650 // Scalar header — identical expressions to `vrr_primitive`.
651 let pq = p + q;
652 let rho = p * q / pq;
653 let pq_vec = sub(pc, qc); // P − Q
654 let t = rho * norm2(pq_vec);
655 let w = [
656 (p * pc[0] + q * qc[0]) / pq,
657 (p * pc[1] + q * qc[1]) / pq,
658 (p * pc[2] + q * qc[2]) / pq,
659 ];
660 let wp = sub(w, pc); // W − P
661 let wq = sub(w, qc); // W − Q
662 use std::f64::consts::PI;
663 let pref = 2.0 * PI * PI * PI.sqrt() / (p * q * pq.sqrt()) * bra.kappa * ket.kappa;
664 let mut fm = [0.0f64; 7];
665 boys_array(lt, t, &mut fm[..=lt]);
666 for m in 0..=lt {
667 ea[m] = pref * fm[m];
668 }
669
670 let inv_2pq = 0.5 / pq;
671 let q_over_pq = q / pq;
672 let p_over_pq = p / pq;
673
674 // Phase A — bra ladder: [e0|00]^(m), e-degrees 1..=NE, m ≤ lt − de.
675 if NE >= 1 {
676 for (iw, te) in TRI1.iter().enumerate() {
677 let i = lower_axis(*te);
678 // Source is [0,0,0] at addr 0; no has2 term at degree 1.
679 for m in 0..=(lt - 1) {
680 ea[(1 + iw) * sa + m] = pa[i] * ea[m] + wp[i] * ea[m + 1];
681 }
682 }
683 }
684 if NE >= 2 {
685 for (iw, te) in TRI2.iter().enumerate() {
686 let i = lower_axis(*te);
687 let s1a = addr(dec(*te, i));
688 let has2 = te[i] >= 2;
689 let coef2 = ((te[i] - 1) as f64) * inv_2p;
690 let s2a = if has2 { addr(dec(dec(*te, i), i)) } else { 0 };
691 for m in 0..=(lt - 2) {
692 let mut v = pa[i] * ea[s1a * sa + m] + wp[i] * ea[s1a * sa + m + 1];
693 if has2 {
694 v += coef2 * (ea[s2a * sa + m] - q_over_pq * ea[s2a * sa + m + 1]);
695 }
696 ea[(4 + iw) * sa + m] = v;
697 }
698 }
699 }
700 if NE >= 3 {
701 for (iw, te) in TRI3.iter().enumerate() {
702 let i = lower_axis(*te);
703 let s1a = addr(dec(*te, i));
704 let has2 = te[i] >= 2;
705 let coef2 = ((te[i] - 1) as f64) * inv_2p;
706 let s2a = if has2 { addr(dec(dec(*te, i), i)) } else { 0 };
707 for m in 0..=(lt - 3) {
708 let mut v = pa[i] * ea[s1a * sa + m] + wp[i] * ea[s1a * sa + m + 1];
709 if has2 {
710 v += coef2 * (ea[s2a * sa + m] - q_over_pq * ea[s2a * sa + m + 1]);
711 }
712 ea[(10 + iw) * sa + m] = v;
713 }
714 }
715 }
716 // Extract f-degree 0: contract m = 0 of every e-component.
717 for ae in 0..n_e {
718 c_ef[ae * n_f] += scale * ea[ae * sa];
719 }
720
721 // Phase B — ket raise, f-degree 1: m ≤ lt − de − 1.
722 if NF >= 1 {
723 for (fw, tf) in TRI1.iter().enumerate() {
724 let j = lower_axis(*tf);
725 // Source f-component is [0,0,0] (level 0 = the phase-A table).
726 for ae in 0..n_e {
727 let (te, de) = E_COMP[ae];
728 let has_cross = te[j] >= 1;
729 let cross_coef = te[j] as f64 * inv_2pq;
730 let cs = if has_cross { addr(dec(te, j)) } else { 0 };
731 for m in 0..=(lt - de - 1) {
732 let mut v = qcen[j] * ea[ae * sa + m] + wq[j] * ea[ae * sa + m + 1];
733 if has_cross {
734 v += cross_coef * ea[cs * sa + m + 1];
735 }
736 eb1[(ae * 3 + fw) * s1 + m] = v;
737 }
738 }
739 }
740 for ae in 0..n_e {
741 for fw in 0..3 {
742 c_ef[ae * n_f + 1 + fw] += scale * eb1[(ae * 3 + fw) * s1];
743 }
744 }
745 }
746
747 // Phase B — ket raise, f-degree 2: m ≤ lt − de − 2.
748 if NF >= 2 {
749 for (fw, tf) in TRI2.iter().enumerate() {
750 let j = lower_axis(*tf);
751 let f1 = dec(*tf, j);
752 let lf1 = cart_index(f1); // f-component index in the degree-1 level
753 let has2 = tf[j] >= 2; // its k−2 source is then [0,0,0] (phase A)
754 let coef2 = ((tf[j] - 1) as f64) * inv_2q;
755 for ae in 0..n_e {
756 let (te, de) = E_COMP[ae];
757 let has_cross = te[j] >= 1;
758 let cross_coef = te[j] as f64 * inv_2pq;
759 let cs = if has_cross { addr(dec(te, j)) } else { 0 };
760 for m in 0..=(lt - de - 2) {
761 let mut v = qcen[j] * eb1[(ae * 3 + lf1) * s1 + m]
762 + wq[j] * eb1[(ae * 3 + lf1) * s1 + m + 1];
763 if has2 {
764 v += coef2 * (ea[ae * sa + m] - p_over_pq * ea[ae * sa + m + 1]);
765 }
766 if has_cross {
767 v += cross_coef * eb1[(cs * 3 + lf1) * s1 + m + 1];
768 }
769 eb2[(ae * 6 + fw) * s2 + m] = v;
770 }
771 }
772 }
773 for ae in 0..n_e {
774 for fw in 0..6 {
775 c_ef[ae * n_f + 4 + fw] += scale * eb2[(ae * 6 + fw) * s2];
776 }
777 }
778 }
779
780 // Phase B — ket raise, f-degree 3: m ≤ lt − de − 3.
781 if NF >= 3 {
782 for (fw, tf) in TRI3.iter().enumerate() {
783 let j = lower_axis(*tf);
784 let f1 = dec(*tf, j);
785 let lf1 = cart_index(f1); // f-component index in the degree-2 level
786 let has2 = tf[j] >= 2; // its k−2 source is f1 lowered again (degree 1)
787 let coef2 = ((tf[j] - 1) as f64) * inv_2q;
788 let lf2 = if has2 { cart_index(dec(f1, j)) } else { 0 };
789 for ae in 0..n_e {
790 let (te, de) = E_COMP[ae];
791 let has_cross = te[j] >= 1;
792 let cross_coef = te[j] as f64 * inv_2pq;
793 let cs = if has_cross { addr(dec(te, j)) } else { 0 };
794 for m in 0..=(lt - de - 3) {
795 let mut v = qcen[j] * eb2[(ae * 6 + lf1) * s2 + m]
796 + wq[j] * eb2[(ae * 6 + lf1) * s2 + m + 1];
797 if has2 {
798 v += coef2
799 * (eb1[(ae * 3 + lf2) * s1 + m]
800 - p_over_pq * eb1[(ae * 3 + lf2) * s1 + m + 1]);
801 }
802 if has_cross {
803 v += cross_coef * eb2[(cs * 6 + lf1) * s2 + m + 1];
804 }
805 eb3[(ae * 10 + fw) * s3 + m] = v;
806 }
807 }
808 }
809 for ae in 0..n_e {
810 for fw in 0..10 {
811 c_ef[ae * n_f + 10 + fw] += scale * eb3[(ae * 10 + fw) * s3];
812 }
813 }
814 }
815}
816
817/// Build `[e0|f0]^(m)` for one primitive quartet by **m-marching** over the `f`
818/// degree and accumulate `scale · [e0|f0]^(0)` into the contracted `c_ef` table.
819///
820/// Algebraically identical to the former full-table VRR (same OS recurrences, same
821/// term order) — only the storage changes, so it is bit-identical
822/// (B0 golden snapshot). `levels` holds 3 rolling `f`-degree levels in slots
823/// `k % 3`, each `maxlevel` long; within slot `k`, element `[e0|f0]^(m)` for the
824/// `f`-component with local index `lf` (= `cart_index(f)`) lives at
825/// `(k%3)·maxlevel + lf·slab[k] + eoff[k][addr(e)] + m`. The `m`-rows are
826/// triangle-packed (`m ∈ 0..=(l_total−de−k)`), so the full `[e0|f0]^(m)` table is
827/// never resident. Each level's `m=0` slice is extracted into `c_ef` before its
828/// buffer is recycled.
829#[allow(clippy::too_many_arguments)]
830fn vrr_primitive(
831 p: f64,
832 q: f64,
833 pc: Vec3,
834 qc: Vec3,
835 kab: f64,
836 kcd: f64,
837 pa: Vec3, // P − A (bra pair, precomputed)
838 qcen: Vec3, // Q − C (ket pair, precomputed)
839 inv_2p: f64, // 1/(2p) (bra pair, precomputed)
840 inv_2q: f64, // 1/(2q) (ket pair, precomputed)
841 ne: usize,
842 nf: usize,
843 l_total: usize,
844 n_e: usize,
845 n_f: usize,
846 maxlevel: usize,
847 eoff: &[usize],
848 slab: &[usize],
849 tri: &[Vec<[usize; 3]>],
850 levels: &mut [f64],
851 scale: f64,
852 c_ef: &mut [f64],
853) {
854 let pq = p + q;
855 let rho = p * q / pq;
856 let pq_vec = sub(pc, qc); // P − Q
857 let t = rho * norm2(pq_vec);
858 let w = [
859 (p * pc[0] + q * qc[0]) / pq,
860 (p * pc[1] + q * qc[1]) / pq,
861 (p * pc[2] + q * qc[2]) / pq,
862 ];
863 // `pa` (P−A) and `qcen` (Q−C) are now passed in (pure per-pair).
864 let wp = sub(w, pc); // W − P
865 let wq = sub(w, qc); // W − Q
866
867 // Index of `[e0|f0]^(m)` (f-component local index `lf`, e at `addr(e)=ae`) in
868 // the rolling buffer for f-degree `k`. Reads/writes `levels` separately (f64 is
869 // Copy, so indexing never holds a borrow), so the same `levels` slice serves as
870 // source and destination across the distinct slots `k`, `k−1`, `k−2`.
871 let off = |k: usize, lf: usize, ae: usize, m: usize| -> usize {
872 (k % 3) * maxlevel + lf * slab[k] + eoff[k * n_e + ae] + m
873 };
874
875 // Base [00|00]^(m) = pref · F_m(T). The Boys index m reaches
876 // l_total = la+lb+lc+ld ≤ 4·MAX_L (NOT 2·MAX_L — that is the per-electron
877 // bound; an ERI couples both electrons).
878 use std::f64::consts::PI;
879 let pref = 2.0 * PI * PI * PI.sqrt() / (p * q * pq.sqrt()) * kab * kcd;
880 let mut fm = [0.0f64; 4 * MAX_L + 1];
881 boys_array(l_total, t, &mut fm[..=l_total]);
882 for m in 0..=l_total {
883 levels[off(0, 0, 0, m)] = pref * fm[m];
884 }
885
886 // `inv_2p` / `inv_2q` are now passed in (pure per-pair).
887 let inv_2pq = 0.5 / pq;
888 let q_over_pq = q / pq;
889 let p_over_pq = p / pq;
890
891 // Phase A — bra ladder (f-degree 0, level 0): build [e0|00]^(m) by raising e.
892 // Level 0 lives in slot 0, f-component 0, so the buffer offset of `[e0|00]^(m)`
893 // is `eoff0[addr(e)] + m`; the `e`-offsets are hoisted out of the hot m-loop.
894 let eoff0 = &eoff[..n_e];
895 for (na, te_list) in tri.iter().enumerate().take(ne + 1).skip(1) {
896 for &te in te_list {
897 let i = lower_axis(te);
898 let s1a = addr(dec(te, i));
899 let mmax = l_total - na;
900 let coef2 = ((te[i] - 1) as f64) * inv_2p; // (e_i of source)/2p
901 let has2 = te[i] >= 2;
902 let s1_0 = eoff0[s1a];
903 let s2_0 = if has2 {
904 eoff0[addr(dec(dec(te, i), i))]
905 } else {
906 0
907 };
908 let dst_0 = eoff0[addr(te)];
909 for m in 0..=mmax {
910 let mut v = pa[i] * levels[s1_0 + m] + wp[i] * levels[s1_0 + m + 1];
911 if has2 {
912 v += coef2 * (levels[s2_0 + m] - q_over_pq * levels[s2_0 + m + 1]);
913 }
914 levels[dst_0 + m] = v;
915 }
916 }
917 }
918 // Extract level 0 (f = [0,0,0], local 0, addr 0): contract m = 0.
919 for ae in 0..n_e {
920 c_ef[ae * n_f] += scale * levels[off(0, 0, ae, 0)];
921 }
922
923 // Phase B — ket raise: march f-degree k = 1..=nf, keeping levels {k, k−1, k−2}.
924 // Slot/slab/eoff for the three active levels are hoisted out of the hot loops; the
925 // m-loop touches only `levels[base + m]` (base computed once per (tf, te)), matching
926 // the former full-table inner-loop cost. The k−2 level is read only when the axis
927 // power `tf[j] ≥ 2` (which forces k ≥ 2), so its `k−2` indices never underflow.
928 for k in 1..=nf {
929 let slot_k = (k % 3) * maxlevel;
930 let slot_k1 = ((k - 1) % 3) * maxlevel;
931 let (slab_k, slab_k1) = (slab[k], slab[k - 1]);
932 let eoff_k = &eoff[k * n_e..];
933 let eoff_k1 = &eoff[(k - 1) * n_e..];
934 let (slot_k2, slab_k2, eoff_k2): (usize, usize, &[usize]) = if k >= 2 {
935 (
936 ((k - 2) % 3) * maxlevel,
937 slab[k - 2],
938 &eoff[(k - 2) * n_e..],
939 )
940 } else {
941 (0, 0, &[])
942 };
943 for &tf in &tri[k] {
944 let j = lower_axis(tf);
945 let f1 = dec(tf, j);
946 let lf1 = cart_index(f1); // local index in level k−1
947 let coef2 = ((tf[j] - 1) as f64) * inv_2q;
948 let has2 = tf[j] >= 2;
949 let lf2 = if has2 { cart_index(dec(f1, j)) } else { 0 }; // level k−2
950 let lf = cart_index(tf); // local index in level k
951 let sk = slot_k + lf * slab_k;
952 let sk1 = slot_k1 + lf1 * slab_k1;
953 let sk2 = slot_k2 + lf2 * slab_k2;
954 for (nadeg, te_list) in tri.iter().enumerate().take(ne + 1) {
955 for &te in te_list {
956 let ea = addr(te);
957 // cross term e_j/2(p+q) · [e−1_j,0|f−1_j,0]^(m+1)
958 let has_cross = te[j] >= 1;
959 let (cross_coef, cross_0) = if has_cross {
960 (te[j] as f64 * inv_2pq, sk1 + eoff_k1[addr(dec(te, j))])
961 } else {
962 (0.0, 0)
963 };
964 let mmax = l_total - nadeg - k;
965 let dst_0 = sk + eoff_k[ea];
966 let src1_0 = sk1 + eoff_k1[ea];
967 let src2_0 = if has2 { sk2 + eoff_k2[ea] } else { 0 };
968 for m in 0..=mmax {
969 let mut v = qcen[j] * levels[src1_0 + m] + wq[j] * levels[src1_0 + m + 1];
970 if has2 {
971 v += coef2 * (levels[src2_0 + m] - p_over_pq * levels[src2_0 + m + 1]);
972 }
973 if has_cross {
974 v += cross_coef * levels[cross_0 + m + 1];
975 }
976 levels[dst_0 + m] = v;
977 }
978 }
979 }
980 }
981 // Extract level k: contract each f-component's m = 0 slice into c_ef.
982 for &tf in &tri[k] {
983 let lf = cart_index(tf);
984 let af = addr(tf);
985 for ae in 0..n_e {
986 c_ef[ae * n_f + af] += scale * levels[off(k, lf, ae, 0)];
987 }
988 }
989 }
990}
991
992/// HRR in contracted space (bra `A→B`, then ket `C→D`) followed by scatter into
993/// the row-major output block.
994///
995/// Flat-array HGP horizontal recurrence, replacing the earlier
996/// HashMap memoisation. The recurrence math is unchanged — same `lower_axis`
997/// choice, same `(A−B)/(C−D)` geometric shifts, same `[raised] + shift·[same]`
998/// term order — so it is **bit-identical** to the recursive version (guarded by
999/// the B0 golden snapshot). Only the storage changes: contiguous arrays indexed
1000/// by `addr` / [`cart_index`] instead of hashed `(triple,…)` keys.
1001///
1002/// The bra recurrence `(a,b|f) = (a+1_i,b−1_i|f) + (A−B)_i (a,b−1_i|f)` (axis
1003/// `i = lower_axis(b)`) is built by ascending `b`-degree with two rolling layers,
1004/// independently per ket index `f` (a spectator), into the `bra` intermediate.
1005/// The ket recurrence `(ab|c,d) = (ab|c+1_j,d−1_j) + (C−D)_j (ab|c,d−1_j)` is the
1006/// symmetric pass over `c,d`, run per `(a,b)` output pair and scattered into `out`.
1007/// Resident scratch is `O(na·nb·nf_range)` for the bra intermediate plus two small
1008/// rolling layers — never the dense `(a,b,f)`/`(a,b,c,d)` key space.
1009#[allow(clippy::too_many_arguments)]
1010fn hrr_and_scatter<'a>(
1011 la: usize,
1012 lb: usize,
1013 lc: usize,
1014 ld: usize,
1015 n_f: usize,
1016 c_ef: &[f64],
1017 ab: Vec3,
1018 cd: Vec3,
1019 out: &mut [f64],
1020 bra: &mut Vec<f64>,
1021 mut prev: &'a mut Vec<f64>,
1022 mut cur: &'a mut Vec<f64>,
1023 mut kprev: &'a mut Vec<f64>,
1024 mut kcur: &'a mut Vec<f64>,
1025 // Shared Cartesian-triple table (degrees `0..=2·MAX_L`); HRR indexes it up to
1026 // `max(ne, nf)`. Passed in so it is not re-`collect`ed per quartet.
1027 tri: &[Vec<[usize; 3]>],
1028) {
1029 let (na, nb, nc, nd) = (n_cart(la), n_cart(lb), n_cart(lc), n_cart(ld));
1030 let ne = la + lb; // max bra (A-side) degree present in c_ef
1031 let nf = lc + ld; // max ket (C-side) degree present in c_ef
1032 let n_e = n_addr(ne);
1033
1034 // Ket index range actually used: f-degrees [lc..=nf]. The bra intermediate and
1035 // the ket base are keyed by the global `addr(f)` offset by `f_base`.
1036 let f_base = tri_below(lc);
1037 let nf_range = n_f - f_base;
1038
1039 // --- Bra HRR: bra[(ia·nb+ib)·nf_range + (addr(f)−f_base)] = (a_ia b_ib | f 0).
1040 // Reused arena buffers; bra is fully overwritten below, the rolling
1041 // layers in the region read — debug NaN-fill bra so a missed write surfaces.
1042 let bra_len = na * nb * nf_range;
1043 ensure_len(bra, bra_len);
1044 #[cfg(debug_assertions)]
1045 bra[..bra_len].fill(f64::NAN);
1046 // Two rolling b-degree layers, indexed [cart_index(b)·n_e + addr(a)].
1047 let layer_len = n_cart(lb) * n_e;
1048 ensure_len(prev, layer_len);
1049 ensure_len(cur, layer_len);
1050 for f_global in f_base..n_f {
1051 let jf = f_global - f_base;
1052 // Base b-degree 0 (one component, within-index 0): (a,0|f) = c_ef[a][f].
1053 for &a in tri[la..=ne].iter().flatten() {
1054 let ae = addr(a);
1055 prev[ae] = c_ef[ae * n_f + f_global];
1056 }
1057 for kb in 1..=lb {
1058 for (ibw, &b) in tri[kb].iter().enumerate() {
1059 let i = lower_axis(b);
1060 let b1w = cart_index(dec(b, i));
1061 // a-degrees that can still reach (la, lb): [la..=ne−kb].
1062 for &a in tri[la..=(ne - kb)].iter().flatten() {
1063 let ae = addr(a);
1064 let a1e = addr(inc(a, i));
1065 cur[ibw * n_e + ae] = prev[b1w * n_e + a1e] + ab[i] * prev[b1w * n_e + ae];
1066 }
1067 }
1068 std::mem::swap(&mut prev, &mut cur);
1069 }
1070 // `prev` now holds b-degree lb (or the base when lb = 0): extract.
1071 for (ib, &b) in tri[lb].iter().enumerate() {
1072 let ibw = cart_index(b); // == ib (tri[lb] is in cart order)
1073 for (ia, &a) in tri[la].iter().enumerate() {
1074 bra[(ia * nb + ib) * nf_range + jf] = prev[ibw * n_e + addr(a)];
1075 }
1076 }
1077 }
1078
1079 // --- Ket HRR: per (ia,ib), build (c,d) and scatter into out.
1080 let klayer_len = n_cart(ld) * n_f;
1081 ensure_len(kprev, klayer_len);
1082 ensure_len(kcur, klayer_len);
1083 for ia in 0..na {
1084 for ib in 0..nb {
1085 let brarow = (ia * nb + ib) * nf_range;
1086 // Base d-degree 0: (ab|c,0) = bra[ia][ib][c], c-degrees [lc..=nf].
1087 for &c in tri[lc..=nf].iter().flatten() {
1088 let ce = addr(c);
1089 kprev[ce] = bra[brarow + (ce - f_base)];
1090 }
1091 for kd in 1..=ld {
1092 for (idw, &d) in tri[kd].iter().enumerate() {
1093 let j = lower_axis(d);
1094 let d1w = cart_index(dec(d, j));
1095 for &c in tri[lc..=(nf - kd)].iter().flatten() {
1096 let ce = addr(c);
1097 let c1e = addr(inc(c, j));
1098 kcur[idw * n_f + ce] =
1099 kprev[d1w * n_f + c1e] + cd[j] * kprev[d1w * n_f + ce];
1100 }
1101 }
1102 std::mem::swap(&mut kprev, &mut kcur);
1103 }
1104 // `kprev` now holds d-degree ld (or the base when ld = 0): scatter into out.
1105 for (ic, &c) in tri[lc].iter().enumerate() {
1106 let ce = addr(c);
1107 for id in 0..nd {
1108 out[((ia * nb + ib) * nc + ic) * nd + id] += kprev[id * n_f + ce];
1109 }
1110 }
1111 }
1112 }
1113}
1114
1115// --- small helpers ---
1116
1117/// First nonzero axis of a triple (the lowering direction).
1118#[inline]
1119fn lower_axis(t: [usize; 3]) -> usize {
1120 if t[0] > 0 {
1121 0
1122 } else if t[1] > 0 {
1123 1
1124 } else {
1125 2
1126 }
1127}
1128
1129#[inline]
1130fn dec(mut t: [usize; 3], i: usize) -> [usize; 3] {
1131 t[i] -= 1;
1132 t
1133}
1134
1135#[inline]
1136fn inc(mut t: [usize; 3], i: usize) -> [usize; 3] {
1137 t[i] += 1;
1138 t
1139}
1140
1141#[inline]
1142fn combine(a: f64, ca: Vec3, b: f64, cb: Vec3, p: f64) -> Vec3 {
1143 [
1144 (a * ca[0] + b * cb[0]) / p,
1145 (a * ca[1] + b * cb[1]) / p,
1146 (a * ca[2] + b * cb[2]) / p,
1147 ]
1148}
1149
1150#[inline]
1151fn sub(u: Vec3, v: Vec3) -> Vec3 {
1152 [u[0] - v[0], u[1] - v[1], u[2] - v[2]]
1153}
1154
1155#[inline]
1156fn dist2(u: Vec3, v: Vec3) -> f64 {
1157 norm2(sub(u, v))
1158}
1159
1160#[inline]
1161fn norm2(u: Vec3) -> f64 {
1162 u[0] * u[0] + u[1] * u[1] + u[2] * u[2]
1163}
1164
1165#[cfg(test)]
1166mod tests {
1167 use super::*;
1168
1169 /// Single-primitive `ShellRef` helper.
1170 fn s(center: Vec3, l: usize, exp: f64) -> (Vec3, usize, [f64; 1], [f64; 1]) {
1171 (center, l, [exp], [1.0])
1172 }
1173
1174 /// (ss|ss) over four unit s primitives must equal the closed form
1175 /// `2π^{5/2}/(p q √(p+q)) · K_ab · K_cd · F_0(T)` — the same check the Rys
1176 /// engine passes, so the two share a verified base case.
1177 #[test]
1178 fn ssss_matches_closed_form() {
1179 let (ac, al, ae, acf) = s([0.0, 0.0, 0.0], 0, 0.8);
1180 let (bc, bl, be, bcf) = s([0.0, 0.0, 0.7], 0, 1.3);
1181 let (cc, cl, ce, ccf) = s([0.4, 0.0, 0.0], 0, 0.5);
1182 let (dc, dl, de, dcf) = s([0.0, 0.6, 0.2], 0, 1.1);
1183 let mut out = [0.0; 1];
1184 coulomb_shell_into(
1185 ShellRef {
1186 center: ac,
1187 l: al,
1188 exps: &ae,
1189 coeffs: &acf,
1190 },
1191 ShellRef {
1192 center: bc,
1193 l: bl,
1194 exps: &be,
1195 coeffs: &bcf,
1196 },
1197 ShellRef {
1198 center: cc,
1199 l: cl,
1200 exps: &ce,
1201 coeffs: &ccf,
1202 },
1203 ShellRef {
1204 center: dc,
1205 l: dl,
1206 exps: &de,
1207 coeffs: &dcf,
1208 },
1209 &mut out,
1210 );
1211
1212 let p = 0.8 + 1.3;
1213 let q = 0.5 + 1.1;
1214 let pcen = combine(0.8, ac, 1.3, bc, p);
1215 let qcen = combine(0.5, cc, 1.1, dc, q);
1216 let kab = (-(0.8 * 1.3 / p) * dist2(ac, bc)).exp();
1217 let kcd = (-(0.5 * 1.1 / q) * dist2(cc, dc)).exp();
1218 let rho = p * q / (p + q);
1219 let t = rho * dist2(pcen, qcen);
1220 let mut fm = [0.0; 1];
1221 boys_array(0, t, &mut fm);
1222 use std::f64::consts::PI;
1223 let expect = 2.0 * PI * PI * PI.sqrt() / (p * q * (p + q).sqrt()) * kab * kcd * fm[0];
1224 assert!(
1225 (out[0] - expect).abs() < 1e-14 * expect.abs(),
1226 "ssss {} vs {}",
1227 out[0],
1228 expect
1229 );
1230 }
1231
1232 use crate::os::Prim;
1233 use crate::rys::coulomb_into;
1234
1235 /// Build a single-primitive OS/HGP block for a quartet of given `(l, exp,
1236 /// center)`, for cross-checking against the Rys engine.
1237 #[allow(clippy::too_many_arguments)]
1238 fn os_block(
1239 la: usize,
1240 ea: f64,
1241 ca: Vec3,
1242 lb: usize,
1243 eb: f64,
1244 cb: Vec3,
1245 lc: usize,
1246 recc: f64,
1247 ccc: Vec3,
1248 ld: usize,
1249 ed: f64,
1250 cdd: Vec3,
1251 ) -> Vec<f64> {
1252 let mut out = vec![0.0; n_cart(la) * n_cart(lb) * n_cart(lc) * n_cart(ld)];
1253 let (ea1, eb1, ec1, ed1) = ([ea], [eb], [recc], [ed]);
1254 let one = [1.0];
1255 coulomb_shell_into(
1256 ShellRef {
1257 center: ca,
1258 l: la,
1259 exps: &ea1,
1260 coeffs: &one,
1261 },
1262 ShellRef {
1263 center: cb,
1264 l: lb,
1265 exps: &eb1,
1266 coeffs: &one,
1267 },
1268 ShellRef {
1269 center: ccc,
1270 l: lc,
1271 exps: &ec1,
1272 coeffs: &one,
1273 },
1274 ShellRef {
1275 center: cdd,
1276 l: ld,
1277 exps: &ed1,
1278 coeffs: &one,
1279 },
1280 &mut out,
1281 );
1282 out
1283 }
1284
1285 /// OS/HGP must reproduce the Rys engine element-for-element on a single
1286 /// primitive quartet, across a sweep of angular momenta (including the
1287 /// bug-prone mixed-high-L and "all four different L on four centers" cases).
1288 #[test]
1289 fn matches_rys_engine_primitive_sweep() {
1290 let ca = [0.0, 0.0, 0.0];
1291 let cb = [0.5, -0.3, 0.2];
1292 let cc = [-0.4, 0.6, -0.1];
1293 let cd = [0.2, 0.4, 0.8];
1294 let (ea, eb, ec, ed) = (0.9, 1.3, 0.7, 1.1);
1295
1296 let quartets = [
1297 (0usize, 0usize, 0usize, 0usize),
1298 (1, 0, 0, 0),
1299 (0, 0, 1, 0),
1300 (1, 1, 0, 0),
1301 (1, 0, 1, 0),
1302 (1, 1, 1, 1),
1303 (2, 0, 0, 0),
1304 (2, 1, 0, 0),
1305 (2, 1, 2, 1),
1306 (0, 1, 2, 3), // four different L on four centers
1307 (2, 2, 3, 3), // (dd|ff) mixed high-L
1308 (3, 0, 0, 1),
1309 // l_total ≥ 13 guards: the Boys aux index m reaches la+lb+lc+ld, which
1310 // exceeds 2·MAX_L. These panicked the under-sized fm buffer (a real bug
1311 // a ≤(dd|ff) sweep missed) and are kept as permanent regression guards.
1312 (4, 4, 4, 1), // l_total = 13
1313 (6, 6, 1, 0), // l_total = 13, two i-shells
1314 (3, 3, 3, 3), // ffff: the cancellation-heavy mixed case
1315 ];
1316 for (la, lb, lc, ld) in quartets {
1317 let os = os_block(la, ea, ca, lb, eb, cb, lc, ec, cc, ld, ed, cd);
1318 let mut rys = vec![0.0; os.len()];
1319 coulomb_into(
1320 Prim::new(ea, ca, la),
1321 Prim::new(eb, cb, lb),
1322 Prim::new(ec, cc, lc),
1323 Prim::new(ed, cd, ld),
1324 1.0,
1325 &mut rys,
1326 );
1327 assert_cross_engine_close(&os, &rys, &format!("({la}{lb}|{lc}{ld})"));
1328 }
1329 }
1330
1331 /// Assert two ERI blocks (here OS/HGP vs Rys, two independent f64 recurrences)
1332 /// agree under `|o − r| ≤ atol + rtol·|r|` with `atol = 1e-11`, `rtol = 1e-10`.
1333 /// The atol floor absorbs benign near-cancellation on structurally tiny
1334 /// components (where the *relative* OS/Rys difference reaches ~1e-8 at high L
1335 /// even though both engines are correct — their dominant elements agree to
1336 /// ~1e-12); the rtol catches any real divergence.
1337 fn assert_cross_engine_close(os: &[f64], rys: &[f64], tag: &str) {
1338 const ATOL: f64 = 1e-11;
1339 const RTOL: f64 = 1e-10;
1340 for (o, r) in os.iter().zip(rys.iter()) {
1341 assert!(
1342 (o - r).abs() <= ATOL + RTOL * r.abs(),
1343 "{tag} OS vs Rys mismatch: {o} vs {r} (Δ={:e})",
1344 (o - r).abs()
1345 );
1346 }
1347 }
1348
1349 /// HGP early contraction (VRR per primitive, HRR once in contracted space)
1350 /// must equal the per-primitive Rys sum for a genuinely **contracted**
1351 /// quartet.
1352 ///
1353 /// Note: doing the HRR after contraction is a *performance*
1354 /// choice, **not** a correctness fork. The HRR operator is linear with
1355 /// exponent-independent geometric coefficients (`A−B`, `C−D`), so
1356 /// `HRR(Σ_p w_p·[e0|f0]_p) = Σ_p w_p·HRR([e0|f0]_p)` identically — per-primitive
1357 /// and post-contraction HRR give the *same* answer. This remains a valuable
1358 /// end-to-end check (a broken VRR cross-term, a mis-applied contraction
1359 /// coefficient, or a wrong HRR sign would all fail it), but it does not
1360 /// distinguish HRR *ordering*, because the two orders are algebraically equal.
1361 #[test]
1362 fn contracted_quartet_matches_rys_sum() {
1363 // Two p shells and a d shell, each with multiple primitives, on distinct
1364 // centres (so HRR shift vectors A−B, C−D are all non-trivial).
1365 let ca = [0.0, 0.0, 0.0];
1366 let cb = [0.6, -0.2, 0.1];
1367 let cc = [-0.3, 0.5, -0.2];
1368 let cd = [0.2, 0.3, 0.7];
1369 let (la, lb, lc, ld) = (1usize, 1usize, 2usize, 0usize);
1370 let ax = [1.4, 0.45];
1371 let acf = [0.6, 0.5];
1372 let bx = [0.9, 0.3];
1373 let bcf = [0.55, 0.5];
1374 let cx = [1.1, 0.4];
1375 let ccf = [0.7, 0.4];
1376 let dx = [0.8];
1377 let dcf = [1.0];
1378
1379 let mut os = vec![0.0; n_cart(la) * n_cart(lb) * n_cart(lc) * n_cart(ld)];
1380 coulomb_shell_into(
1381 ShellRef {
1382 center: ca,
1383 l: la,
1384 exps: &ax,
1385 coeffs: &acf,
1386 },
1387 ShellRef {
1388 center: cb,
1389 l: lb,
1390 exps: &bx,
1391 coeffs: &bcf,
1392 },
1393 ShellRef {
1394 center: cc,
1395 l: lc,
1396 exps: &cx,
1397 coeffs: &ccf,
1398 },
1399 ShellRef {
1400 center: cd,
1401 l: ld,
1402 exps: &dx,
1403 coeffs: &dcf,
1404 },
1405 &mut os,
1406 );
1407
1408 // Reference: sum the Rys primitive engine over the quartet with the same
1409 // effective coefficients.
1410 let mut rys = vec![0.0; os.len()];
1411 for (&ea, &wa) in ax.iter().zip(acf.iter()) {
1412 for (&eb, &wb) in bx.iter().zip(bcf.iter()) {
1413 for (&ec, &wc) in cx.iter().zip(ccf.iter()) {
1414 for (&ed, &wd) in dx.iter().zip(dcf.iter()) {
1415 coulomb_into(
1416 Prim::new(ea, ca, la),
1417 Prim::new(eb, cb, lb),
1418 Prim::new(ec, cc, lc),
1419 Prim::new(ed, cd, ld),
1420 wa * wb * wc * wd,
1421 &mut rys,
1422 );
1423 }
1424 }
1425 }
1426 }
1427
1428 for (o, r) in os.iter().zip(rys.iter()) {
1429 assert!(
1430 (o - r).abs() <= 1e-10 * r.abs().max(1e-12),
1431 "contracted OS vs Rys mismatch: {o} vs {r}"
1432 );
1433 }
1434 }
1435
1436 /// The triple-address map must be a bijection onto `0..n_addr(L)`.
1437 #[test]
1438 fn addr_is_bijective() {
1439 for lmax in 0..=6 {
1440 let mut seen = vec![false; n_addr(lmax)];
1441 for n in 0..=lmax {
1442 for t in cart_components(n) {
1443 let a = addr(t);
1444 assert!(a < n_addr(lmax), "addr {a} out of range for lmax {lmax}");
1445 assert!(!seen[a], "addr collision at {t:?}");
1446 seen[a] = true;
1447 }
1448 }
1449 assert!(
1450 seen.iter().all(|&x| x),
1451 "addr not surjective at lmax {lmax}"
1452 );
1453 }
1454 }
1455}