integral_core/os_eri.rs
1//! Two-electron repulsion integrals (ERIs) by the Obara–Saika / Head-Gordon–Pople
2//! (OS/HGP) recurrence — the second ERI engine (see `ARCHITECTURE.md`, L1).
3//!
4//! A **vertical recurrence (VRR)** builds the
5//! intermediate `[e0|f0]^(m)` classes from `[00|00]^(m)` per primitive quartet,
6//! the primitives are **contracted** into AO space, and a **horizontal recurrence
7//! (HRR)** then shifts angular momentum `A→B` and `C→D` *in the contracted space*.
8//! Doing the HRR after contraction is the HGP "early contraction" trick: the
9//! geometry-only HRR runs **once per shell quartet** instead of once per
10//! primitive quartet, which is the win at high contraction degree.
11//!
12//! It is the engineering counterpart of the [`crate::rys`] engine: same Coulomb
13//! kernel, same row-major `(a,b,c,d)` block layout, validated element-for-element
14//! against it (and against an independent McMurchie–Davidson path).
15//!
16//! ## Method
17//!
18//! For a primitive quartet with exponents `α,β` on the bra centres `A,B` and
19//! `γ,δ` on the ket centres `C,D`, with `p=α+β`, `q=γ+δ`, `P,Q` the Gaussian
20//! product centres, `W=(pP+qQ)/(p+q)`, `ρ=pq/(p+q)`, and `T=ρ|P−Q|²`:
21//!
22//! ```text
23//! [00|00]^(m) = 2π^{5/2}/(p q √(p+q)) · K_ab · K_cd · F_m(T)
24//! ```
25//!
26//! The VRR raises angular momentum on `A` (the bra, index `e`) and `C` (the ket,
27//! index `f`) using the standard OS ERI relations (Obara–Saika 1986; HGP 1988):
28//!
29//! ```text
30//! [e+1_i,0|f0]^(m) = (P−A)_i[e0|f0]^(m) + (W−P)_i[e0|f0]^(m+1)
31//! + e_i/2p ( [e−1_i,0|f0]^(m) − (q/(p+q))[e−1_i,0|f0]^(m+1) )
32//! + f_i/2(p+q) [e0|f−1_i,0]^(m+1)
33//! [e0|f+1_i,0]^(m) = (Q−C)_i[e0|f0]^(m) + (W−Q)_i[e0|f0]^(m+1)
34//! + f_i/2q ( [e0|f−1_i,0]^(m) − (p/(p+q))[e0|f−1_i,0]^(m+1) )
35//! + e_i/2(p+q) [e−1_i,0|f0]^(m+1)
36//! ```
37//!
38//! After contracting `[e0|f0]^(0)` over the primitive quartet, the HRR builds the
39//! Cartesian shell block:
40//!
41//! ```text
42//! (a,b+1_i | f0) = (a+1_i,b | f0) + (A−B)_i (a,b | f0) [bra, A→B]
43//! (ab | c,d+1_i) = (ab | c+1_i,d) + (C−D)_i (ab | c,d) [ket, C→D]
44//! ```
45//!
46//! ## Buffers
47//!
48//! The VRR table `[e0|f0]^(m)` is **3D and `W`-coupled** (the recurrence mixes
49//! Cartesian axes), so unlike the axis-separable 1D tables of the one-electron and
50//! Rys engines it cannot be a small fixed stack array — a `MAX_L` stack buffer
51//! would be tens of MB, and even a heap copy of the *full* table is ~41 MB at
52//! `(ii|ii)`. The current engine removes that:
53//!
54//! - **m-marching VRR** (`vrr_primitive`): the VRR never materialises the full
55//! `e×f×m` table. It marches over the ket `f`-degree keeping only a rolling window
56//! of **3 consecutive `f`-degree levels**, each **triangle-packed** in the Boys
57//! index `m`. Resident VRR footprint is `3·max_k[n_cart(k)·slab_k]` (≈ 4.3 MB at
58//! `(ii|ii)`, vs the old 41 MB single table), plus the `n_e·n_f` contracted table.
59//! - **flat-array HRR** (`hrr_and_scatter`): contiguous arrays indexed by `addr`
60//! / [`cart_index`] with two rolling degree layers, replacing the former HashMap
61//! memoisation.
62//! - **reusable arena** ([`EriScratch`]): all buffers are allocated once and reused
63//! across quartets (thread-local by default), not re-allocated per quartet.
64//!
65//! All of this stays in safe Rust (`#![forbid(unsafe_code)]`) and reproduces the
66//! former full-table engine's values, cross-checked against the Rys engine and an
67//! independent McMurchie–Davidson path (`tests/eri_cross_algorithm.rs`).
68
69use std::cell::RefCell;
70
71use integral_math::am::{cart_components, cart_index, n_cart};
72use integral_math::boys::{boys_array, boys_array2, boys_array4};
73
74use crate::os::{Vec3, MAX_L};
75
76/// A contracted Cartesian shell as seen by the HGP engine: its centre, angular
77/// momentum, and primitive `(exponent, effective-coefficient)` data. The
78/// coefficients are the driver's *effective* coefficients `d_i · N(α_i, l)` — the
79/// engine itself works on un-normalised monomials and only multiplies the four
80/// coefficients into the contracted accumulator.
81#[derive(Debug, Clone, Copy)]
82pub struct ShellRef<'a> {
83 /// Shell centre (bohr).
84 pub center: Vec3,
85 /// Angular momentum.
86 pub l: usize,
87 /// Primitive exponents.
88 pub exps: &'a [f64],
89 /// Effective contraction coefficients, aligned with `exps`.
90 pub coeffs: &'a [f64],
91}
92
93/// Number of Cartesian triples of degree `0..=lmax`: `(L+1)(L+2)(L+3)/6`.
94#[inline]
95fn n_addr(lmax: usize) -> usize {
96 (lmax + 1) * (lmax + 2) * (lmax + 3) / 6
97}
98
99/// Number of Cartesian triples of degree **strictly less than** `d`:
100/// `d(d+1)(d+2)/6`. This is the cumulative base of degree `d` in `addr`
101/// (`addr(t) = tri_below(deg) + cart_index(t)`) and `n_addr(L) = tri_below(L+1)`.
102#[inline]
103fn tri_below(d: usize) -> usize {
104 d * (d + 1) * (d + 2) / 6
105}
106
107/// Global address of a Cartesian triple within the cumulative `0..=` ordering,
108/// consistent with [`integral_math::am`]'s per-degree canonical order. Bijective onto
109/// `0..n_addr(deg)`.
110#[inline]
111fn addr(t: [usize; 3]) -> usize {
112 let n = t[0] + t[1] + t[2];
113 let base = n * (n + 1) * (n + 2) / 6;
114 // within-degree index: (n-lx)(n-lx+1)/2 + lz
115 let within = (n - t[0]) * (n - t[0] + 1) / 2 + t[2];
116 base + within
117}
118
119/// Reusable scratch arena for the OS/HGP ERI engine.
120///
121/// Holds the engine's working buffers so they are allocated **once and reused**
122/// across shell quartets, instead of a fresh heap allocation per quartet inside the
123/// O(n⁴) shell loop. All buffers are plain `Vec<f64>` grown on demand in safe Rust
124/// (`#![forbid(unsafe_code)]` holds); there is no shared mutable state, so each
125/// thread uses its own instance — the thread-local default behind
126/// [`coulomb_shell_into`], or one passed explicitly to
127/// [`coulomb_shell_into_scratch`] (the `&mut` makes cross-thread sharing a compile
128/// error).
129///
130/// **Memory-correctness.** `c_ef` is an accumulator and is **zeroed per quartet**.
131/// The VRR `levels` and HRR `bra`/layer buffers are fully overwritten in the region
132/// they are later read, so they need no functional zeroing — but in debug builds
133/// `levels` and `bra` are NaN-filled per quartet, so any accidental out-of-range
134/// read (e.g. outside the VRR `m`-triangle) poisons the output and trips the tests
135/// rather than silently reading a stale value from a previous quartet. Results are
136/// therefore independent of evaluation order and of arena reuse (asserted by
137/// `tests/arena.rs`).
138#[derive(Debug, Default, Clone)]
139pub struct EriScratch {
140 /// VRR rolling window: 3 triangle-packed f-degree levels (slot = `k % 3`).
141 levels: Vec<f64>,
142 /// Contracted `[e0|f0]^(0)` accumulator, shape `[n_e·n_f]`.
143 c_ef: Vec<f64>,
144 /// HRR bra intermediate `(a b | f 0)`, shape `[na·nb·nf_range]`.
145 bra: Vec<f64>,
146 /// HRR bra rolling `b`-degree layers.
147 bra_prev: Vec<f64>,
148 bra_cur: Vec<f64>,
149 /// HRR ket rolling `d`-degree layers.
150 ket_prev: Vec<f64>,
151 ket_cur: Vec<f64>,
152 /// Precomputed bra/ket primitive-pair data (combined exponent, product
153 /// centre, `K` prefactor, coeff product) — built once per shell quartet so
154 /// the per-pair `exp`/centre work is not repeated inside the deep primitive
155 /// loop. Reused across quartets like the other buffers.
156 bra_pairs: Vec<PrimPair>,
157 ket_pairs: Vec<PrimPair>,
158 /// Shared Cartesian-triple table `tri_all[d] = cart_components(d)` for every
159 /// degree the engine needs (`0..=2·MAX_L`). Built once and sliced for the VRR
160 /// `e`/`f` lists and the HRR triples, instead of re-`collect`ing per quartet.
161 tri_all: Vec<Vec<[usize; 3]>>,
162 /// Flat VRR offset table: `eoff[k·n_e + ae]` is `e`'s packed offset in level
163 /// `k`. Rebuilt per quartet into this reused buffer (no per-quartet `Vec<Vec>`
164 /// allocation); `slab[k]` is one `f`-component's packed size for level `k`.
165 eoff: Vec<usize>,
166 slab: Vec<usize>,
167}
168
169/// One primitive pair's reusable geometry/prefactor, shared by every primitive
170/// quartet that uses it: the combined exponent `zeta = α+β`, the Gaussian product
171/// centre, the overlap prefactor `K = exp(−αβ/ζ·|A−B|²)`, and the two contraction
172/// coefficients (`c1` outer, `c2` inner).
173///
174/// The coefficients are kept **un-multiplied** so the contracted `scale` can be
175/// formed in exactly the former evaluation order `((c_a·c_b)·c_c)·c_d` — making the
176/// engine's output bit-identical to the pre-precomputation code (the per-element
177/// and 8-fold-symmetry tests are floorless-relative and sensitive to the last ULP
178/// on near-cancellation elements).
179#[derive(Debug, Clone, Copy, Default)]
180struct PrimPair {
181 zeta: f64,
182 center: Vec3,
183 kappa: f64,
184 c1: f64,
185 c2: f64,
186 /// `1/(2ζ)` — the VRR `inv_2p`(bra)/`inv_2q`(ket) coefficient. Pure per-pair.
187 inv_2zeta: f64,
188 /// `centre − s1.centre` — the VRR `P−A`(bra)/`Q−C`(ket) shift. Pure per-pair.
189 r1: Vec3,
190}
191
192/// A primitive pair whose weighted overlap prefactor `K·|c₁·c₂|` falls below
193/// this bound contributes at most ~`1e-24` (absolutely) to any tensor element —
194/// roughly twelve orders below the engine's `1e-12` accuracy bar — so it is
195/// dropped from the pair list before the quartet loop. Every `[e0|f0]^(m)` term
196/// the pair could produce is bounded by `K·|c₁·c₂|` times the other pair's
197/// weight (≤ its own bound), the `2π^{5/2}/(pq√(p+q))` prefactor, `F_m ≤ 1`,
198/// and the VRR's geometric raise factors; the product stays ≤ ~1e-24 for any
199/// chemically sensible exponents/geometry. The deeply contracted cross-centre
200/// pairs this removes have `K` underflowing to `0.0` outright in typical bases
201/// (tight×tight pairs at bonding distances reach `exp(-10⁴)`), where skipping
202/// is *exactly* lossless.
203const PAIR_NEGLIGIBLE: f64 = 1e-32;
204
205/// Fill `out` with the [`PrimPair`] data for every primitive combination of two
206/// shells (outer `s1`, inner `s2`), matching the former in-loop computation
207/// element-for-element. The inter-centre distance is hoisted out of the pair loop.
208/// `inv_2zeta` and `r1` are the pair-only VRR coefficients (`1/2ζ` and the `P−A` /
209/// `Q−C` shift) that the kernel formerly recomputed on every primitive quartet.
210/// Pairs below [`PAIR_NEGLIGIBLE`] are omitted; the surviving pairs keep their
211/// order, so the contraction accumulates the remaining terms in the same
212/// sequence as before.
213fn build_pairs(out: &mut Vec<PrimPair>, s1: ShellRef<'_>, s2: ShellRef<'_>) {
214 out.clear();
215 out.reserve(s1.exps.len() * s2.exps.len());
216 let d2 = dist2(s1.center, s2.center);
217 for (&e1, &c1) in s1.exps.iter().zip(s1.coeffs.iter()) {
218 for (&e2, &c2) in s2.exps.iter().zip(s2.coeffs.iter()) {
219 let zeta = e1 + e2;
220 let kappa = (-(e1 * e2 / zeta) * d2).exp();
221 if kappa * (c1 * c2).abs() < PAIR_NEGLIGIBLE {
222 continue;
223 }
224 let center = combine(e1, s1.center, e2, s2.center, zeta);
225 out.push(PrimPair {
226 zeta,
227 center,
228 kappa,
229 c1,
230 c2,
231 inv_2zeta: 0.5 / zeta,
232 r1: sub(center, s1.center),
233 });
234 }
235 }
236}
237
238/// Precomputed, screened [`PrimPair`] list of one **ordered** shell pair —
239/// libcint's pair-data ("optimizer") precompute. A dense driver builds one per
240/// canonical shell pair once per build (`O(n_shells²)`, trivial memory) and
241/// passes borrowed lists to [`coulomb_shell_pairs_into_scratch`] /
242/// [`coulomb_shell_batch4_pairs_into_scratch`], instead of the engine rebuilding
243/// the same pairs on every quartet that shares the pair.
244///
245/// The contents and order are exactly what the self-building entries compute
246/// internally ([`build_pairs`]), so routing through borrowed lists is
247/// bit-identical. **Orientation matters**: `build_pairs(s1, s2)` is
248/// order-sensitive (`r1 = P − A` uses `s1`'s centre; `c1`/`c2` keep the
249/// `s1`-outer/`s2`-inner coefficient association), so a list built for `(i, j)`
250/// must only be used where the engine would have built `(i, j)` — the canonical
251/// drivers use the `i ≥ j` orientation for both bra and ket throughout.
252#[derive(Debug, Clone, Default)]
253pub struct ShellPairData {
254 pairs: Vec<PrimPair>,
255}
256
257impl ShellPairData {
258 /// Number of surviving primitive pairs (the [`PAIR_NEGLIGIBLE`] screen
259 /// applied) — what [`surviving_pair_count`] returns for the same pair.
260 #[must_use]
261 pub fn len(&self) -> usize {
262 self.pairs.len()
263 }
264
265 /// `true` when no primitive pair survives the negligibility screen.
266 #[must_use]
267 pub fn is_empty(&self) -> bool {
268 self.pairs.is_empty()
269 }
270}
271
272/// Build the screened pair list for the **ordered** shell pair `(s1, s2)` —
273/// element-for-element what the self-building engine entries compute per
274/// quartet (see [`ShellPairData`] for the orientation contract).
275#[must_use]
276pub fn shell_pair_data(s1: ShellRef<'_>, s2: ShellRef<'_>) -> ShellPairData {
277 let mut pairs = Vec::new();
278 build_pairs(&mut pairs, s1, s2);
279 ShellPairData { pairs }
280}
281
282impl EriScratch {
283 /// A fresh, empty arena; it grows to fit on first use.
284 #[must_use]
285 pub fn new() -> Self {
286 Self::default()
287 }
288
289 /// Total `f64` elements currently held across all buffers — the resident
290 /// working set, for memory reporting/tests.
291 #[must_use]
292 pub fn resident_f64(&self) -> usize {
293 self.levels.len()
294 + self.c_ef.len()
295 + self.bra.len()
296 + self.bra_prev.len()
297 + self.bra_cur.len()
298 + self.ket_prev.len()
299 + self.ket_cur.len()
300 }
301
302 /// Largest single buffer (`f64` elements). Demonstrates the former ~41 MB
303 /// monolithic VRR `[e0|f0]^(m)` table is no longer resident: the m-marching
304 /// window keeps only `3·max_k[n_cart(k)·slab_k]`.
305 #[must_use]
306 pub fn largest_buffer_f64(&self) -> usize {
307 [
308 self.levels.len(),
309 self.c_ef.len(),
310 self.bra.len(),
311 self.bra_prev.len(),
312 self.bra_cur.len(),
313 self.ket_prev.len(),
314 self.ket_cur.len(),
315 ]
316 .into_iter()
317 .max()
318 .unwrap_or(0)
319 }
320}
321
322thread_local! {
323 /// Per-thread default arena backing [`coulomb_shell_into`].
324 static ERI_SCRATCH: RefCell<EriScratch> = RefCell::new(EriScratch::new());
325}
326
327/// Grow `v` to at least `n` elements (reusing existing capacity); never shrinks.
328#[inline]
329fn ensure_len(v: &mut Vec<f64>, n: usize) {
330 if v.len() < n {
331 v.resize(n, 0.0);
332 }
333}
334
335/// `ensure_len` for the `usize` offset/slab buffers.
336#[inline]
337fn ensure_usize(v: &mut Vec<usize>, n: usize) {
338 if v.len() < n {
339 v.resize(n, 0);
340 }
341}
342
343/// Accumulate the contracted Coulomb block `(ab|cd)` for four shells into the
344/// row-major `out` block (shape `[n_cart(la)·n_cart(lb)·n_cart(lc)·n_cart(ld)]`,
345/// the same `(a,b,c,d)` layout as [`crate::rys::coulomb_into`]).
346///
347/// The engine owns the primitive-quartet loop (the HGP contraction happens between
348/// VRR and HRR), so unlike the Rys engine the driver calls this **once per shell
349/// quartet** with all primitives. This uses a **thread-local** [`EriScratch`]; for
350/// explicit control (e.g. one arena per worker thread) call
351/// [`coulomb_shell_into_scratch`].
352///
353/// # Panics
354/// Panics (in debug builds) if any `l > MAX_L` or `out` is too short.
355pub fn coulomb_shell_into(
356 a: ShellRef<'_>,
357 b: ShellRef<'_>,
358 c: ShellRef<'_>,
359 d: ShellRef<'_>,
360 out: &mut [f64],
361) {
362 ERI_SCRATCH.with(|s| coulomb_shell_into_scratch(&mut s.borrow_mut(), a, b, c, d, out));
363}
364
365/// Like [`coulomb_shell_into`] (thread-local arena) but with the bra/ket
366/// primitive-pair data supplied by the caller — the borrowed-pairs analogue,
367/// see [`ShellPairData`]. Bit-identical to [`coulomb_shell_into`].
368///
369/// # Panics
370/// Panics (in debug builds) if any `l > MAX_L` or `out` is too short.
371pub fn coulomb_shell_pairs_into(
372 a: ShellRef<'_>,
373 b: ShellRef<'_>,
374 c: ShellRef<'_>,
375 d: ShellRef<'_>,
376 bra_pairs: &ShellPairData,
377 ket_pairs: &ShellPairData,
378 out: &mut [f64],
379) {
380 ERI_SCRATCH.with(|s| {
381 coulomb_shell_pairs_into_scratch(
382 &mut s.borrow_mut(),
383 a,
384 b,
385 c,
386 d,
387 bra_pairs,
388 ket_pairs,
389 out,
390 );
391 });
392}
393
394/// Like [`coulomb_shell_into`] but evaluates into the caller-provided
395/// [`EriScratch`], reused across quartets to avoid per-quartet heap allocation. Use
396/// **one instance per thread** (sharing a `&mut EriScratch` across threads is a
397/// compile error); the result is bit-identical regardless of which arena is used or
398/// what it last held.
399///
400/// # Panics
401/// Panics (in debug builds) if any `l > MAX_L` or `out` is too short.
402pub fn coulomb_shell_into_scratch(
403 scratch: &mut EriScratch,
404 a: ShellRef<'_>,
405 b: ShellRef<'_>,
406 c: ShellRef<'_>,
407 d: ShellRef<'_>,
408 out: &mut [f64],
409) {
410 // Build the pair lists into the arena's reused buffers, then run the core
411 // on borrowed slices. The vecs are moved out for the duration of the call
412 // (a pointer swap) so the core can take `&mut` of the rest of the arena.
413 let mut bra_pairs = std::mem::take(&mut scratch.bra_pairs);
414 let mut ket_pairs = std::mem::take(&mut scratch.ket_pairs);
415 build_pairs(&mut bra_pairs, a, b);
416 build_pairs(&mut ket_pairs, c, d);
417 coulomb_shell_core(scratch, a, b, c, d, &bra_pairs, &ket_pairs, out);
418 scratch.bra_pairs = bra_pairs;
419 scratch.ket_pairs = ket_pairs;
420}
421
422/// Like [`coulomb_shell_into_scratch`] but with the bra/ket primitive-pair data
423/// supplied by the caller (precomputed once per shell pair across a dense
424/// build, see [`ShellPairData`]), instead of rebuilt per quartet. Bit-identical:
425/// the pair values and their order are exactly what the self-building entry
426/// computes — only *where* they are computed moves.
427///
428/// # Panics
429/// Panics (in debug builds) if any `l > MAX_L` or `out` is too short.
430#[allow(clippy::too_many_arguments)]
431pub fn coulomb_shell_pairs_into_scratch(
432 scratch: &mut EriScratch,
433 a: ShellRef<'_>,
434 b: ShellRef<'_>,
435 c: ShellRef<'_>,
436 d: ShellRef<'_>,
437 bra_pairs: &ShellPairData,
438 ket_pairs: &ShellPairData,
439 out: &mut [f64],
440) {
441 coulomb_shell_core(scratch, a, b, c, d, &bra_pairs.pairs, &ket_pairs.pairs, out);
442}
443
444/// The quartet kernel shared by the self-building and borrowed-pairs entries:
445/// fast path / VRR dispatch / HRR over caller-provided pair slices.
446#[allow(clippy::too_many_arguments)]
447fn coulomb_shell_core(
448 scratch: &mut EriScratch,
449 a: ShellRef<'_>,
450 b: ShellRef<'_>,
451 c: ShellRef<'_>,
452 d: ShellRef<'_>,
453 bra_pairs: &[PrimPair],
454 ket_pairs: &[PrimPair],
455 out: &mut [f64],
456) {
457 let (la, lb, lc, ld) = (a.l, b.l, c.l, d.l);
458 debug_assert!(
459 la <= MAX_L && lb <= MAX_L && lc <= MAX_L && ld <= MAX_L,
460 "angular momentum exceeds MAX_L"
461 );
462 let (na, nb, nc, nd) = (n_cart(la), n_cart(lb), n_cart(lc), n_cart(ld));
463 debug_assert!(out.len() >= na * nb * nc * nd, "ERI output block too short");
464
465 let ne = la + lb; // max bra (A-side) degree built by VRR
466 let nf = lc + ld; // max ket (C-side) degree built by VRR
467 let l_total = ne + nf;
468 let n_e = n_addr(ne);
469 let n_f = n_addr(nf);
470
471 let EriScratch {
472 levels,
473 c_ef,
474 bra,
475 bra_prev,
476 bra_cur,
477 ket_prev,
478 ket_cur,
479 tri_all,
480 eoff,
481 slab,
482 ..
483 } = scratch;
484
485 // Fast path: (ss|ss). With no angular momentum to raise, the whole quartet is a
486 // single contracted [00|00]^(0). Skip the VRR/HRR machinery (the dead raise
487 // coefficients, the `levels` buffer round-trip, the per-primitive call) and sum
488 // straight into a register. Bit-identical to the general path: same `pref`/`t`
489 // expressions, same `((c_a·c_b)·c_c)·c_d` scale, same `scale·(pref·F_0)` term,
490 // same bra-outer/ket-inner order — just without the scaffolding.
491 if l_total == 0 {
492 let two_pi_2_5 =
493 2.0 * std::f64::consts::PI * std::f64::consts::PI * std::f64::consts::PI.sqrt();
494 let mut acc = 0.0;
495 let mut f0 = [0.0f64; 1];
496 let mut f1 = [0.0f64; 1];
497 for bra in bra_pairs.iter() {
498 let bc = bra.c1 * bra.c2;
499 let p = bra.zeta;
500 // Kets in PAIRS through the interleaved `boys_array2` (per-lane
501 // arithmetic and the lane-0-then-lane-1 accumulation order are
502 // exactly the former sequential loop's, so this is bit-identical).
503 let mut kets = ket_pairs.chunks_exact(2);
504 for pair in kets.by_ref() {
505 let (k0, k1) = (&pair[0], &pair[1]);
506 let pq0 = p + k0.zeta;
507 let pq1 = p + k1.zeta;
508 let t0 = (p * k0.zeta / pq0) * dist2(bra.center, k0.center);
509 let t1 = (p * k1.zeta / pq1) * dist2(bra.center, k1.center);
510 let pref0 = two_pi_2_5 / (p * k0.zeta * pq0.sqrt()) * bra.kappa * k0.kappa;
511 let pref1 = two_pi_2_5 / (p * k1.zeta * pq1.sqrt()) * bra.kappa * k1.kappa;
512 boys_array2(0, t0, t1, &mut f0, &mut f1);
513 acc += (bc * k0.c1) * k0.c2 * (pref0 * f0[0]);
514 acc += (bc * k1.c1) * k1.c2 * (pref1 * f1[0]);
515 }
516 for ket in kets.remainder() {
517 let q = ket.zeta;
518 let pq = p + q;
519 let t = (p * q / pq) * dist2(bra.center, ket.center);
520 let pref = two_pi_2_5 / (p * q * pq.sqrt()) * bra.kappa * ket.kappa;
521 boys_array(0, t, &mut f0);
522 acc += (bc * ket.c1) * ket.c2 * (pref * f0[0]);
523 }
524 }
525 out[0] += acc;
526 return;
527 }
528
529 // Contracted [e0|f0]^(0): zeroed per quartet (it is a `+=` accumulator).
530 ensure_len(c_ef, n_e * n_f);
531 c_ef[..n_e * n_f].fill(0.0);
532
533 // Shared Cartesian-triple table, built once (degrees `0..=2·MAX_L` cover every
534 // `ne`/`nf`/`max(ne,nf)` the engine reaches) and sliced below — no per-quartet
535 // `cart_components` allocation.
536 if tri_all.len() <= 2 * MAX_L {
537 *tri_all = (0..=2 * MAX_L).map(cart_components).collect();
538 }
539
540 // VRR. The small classes (`ne, nf ≤ 3` — every quartet shape of an
541 // spd basis except the d,d-bra/ket pairs, ~96% of a cc-pVDZ-style build)
542 // dispatch to the monomorphized kernels, where the whole structural walk
543 // (triples, addresses, branches, m-ranges) is resolved at compile time;
544 // everything else runs the general m-marching `vrr_primitive`.
545 // `(0,0)` already returned through the (ss|ss) fast path above.
546 match (ne, nf) {
547 (0, 1) => contract_class::<0, 1>(bra_pairs, ket_pairs, c_ef),
548 (1, 0) => contract_class::<1, 0>(bra_pairs, ket_pairs, c_ef),
549 (1, 1) => contract_class::<1, 1>(bra_pairs, ket_pairs, c_ef),
550 (0, 2) => contract_class::<0, 2>(bra_pairs, ket_pairs, c_ef),
551 (2, 0) => contract_class::<2, 0>(bra_pairs, ket_pairs, c_ef),
552 (1, 2) => contract_class::<1, 2>(bra_pairs, ket_pairs, c_ef),
553 (2, 1) => contract_class::<2, 1>(bra_pairs, ket_pairs, c_ef),
554 (2, 2) => contract_class::<2, 2>(bra_pairs, ket_pairs, c_ef),
555 (0, 3) => contract_class::<0, 3>(bra_pairs, ket_pairs, c_ef),
556 (3, 0) => contract_class::<3, 0>(bra_pairs, ket_pairs, c_ef),
557 (1, 3) => contract_class::<1, 3>(bra_pairs, ket_pairs, c_ef),
558 (3, 1) => contract_class::<3, 1>(bra_pairs, ket_pairs, c_ef),
559 (2, 3) => contract_class::<2, 3>(bra_pairs, ket_pairs, c_ef),
560 (3, 2) => contract_class::<3, 2>(bra_pairs, ket_pairs, c_ef),
561 (3, 3) => contract_class::<3, 3>(bra_pairs, ket_pairs, c_ef),
562 _ => {
563 // m-marching VRR scratch. Instead of the full `n_e·n_f·nm`
564 // table (~41 MB at `(ii|ii)`), keep only a rolling window of **3 consecutive
565 // f-degree levels**, each **triangle-packed** in the Boys index `m`: for an
566 // `f`-degree `k`, an `e` of degree `de` stores only `m ∈ 0..=(l_total−de−k)`.
567 // Level `k` lives in slot `k % 3` of `levels`; `slab[k]` is one `f`-component's
568 // packed size and `eoff[k·n_e + ae]` is `e`'s offset within it (flat, reused).
569 ensure_usize(slab, nf + 1);
570 ensure_usize(eoff, (nf + 1) * n_e);
571 // `k` indexes both `slab` and the flat `eoff` block base `k·n_e`, so a range
572 // loop is the clear form here.
573 #[allow(clippy::needless_range_loop)]
574 for k in 0..=nf {
575 let base = k * n_e;
576 let mut run = 0usize;
577 let mut ae = 0usize;
578 for de in 0..=ne {
579 let mlen = l_total - de - k + 1; // ≥ 1: de + k ≤ ne + nf = l_total
580 for _ in 0..n_cart(de) {
581 eoff[base + ae] = run;
582 ae += 1;
583 run += mlen;
584 }
585 }
586 slab[k] = run;
587 }
588 let maxlevel = (0..=nf).map(|k| n_cart(k) * slab[k]).max().unwrap_or(1);
589 ensure_len(levels, 3 * maxlevel);
590 // Debug guard: poison the reused VRR window so any out-of-triangle / stale read
591 // surfaces as a NaN in the output (caught by the golden + cross-engine tests),
592 // instead of silently using a previous quartet's value.
593 #[cfg(debug_assertions)]
594 levels[..3 * maxlevel].fill(f64::NAN);
595
596 // Reborrow the offset/triple buffers as shared slices for the read-only kernel.
597 let (eoff_s, slab_s, tri_s) = (&eoff[..], &slab[..], &tri_all[..]);
598 for bra in bra_pairs.iter() {
599 let bra_coef = bra.c1 * bra.c2; // = c_a·c_b, hoisted out of the ket loop
600 for ket in ket_pairs.iter() {
601 // ((c_a·c_b)·c_c)·c_d — the former left-to-right product, bit for bit.
602 let scale = (bra_coef * ket.c1) * ket.c2;
603 vrr_primitive(
604 bra.zeta,
605 ket.zeta,
606 bra.center,
607 ket.center,
608 bra.kappa,
609 ket.kappa,
610 bra.r1,
611 ket.r1,
612 bra.inv_2zeta,
613 ket.inv_2zeta,
614 ne,
615 nf,
616 l_total,
617 n_e,
618 n_f,
619 maxlevel,
620 eoff_s,
621 slab_s,
622 tri_s,
623 levels,
624 scale,
625 c_ef,
626 );
627 }
628 }
629 }
630 }
631
632 // HRR in contracted space, then scatter the block.
633 let ab = sub(a.center, b.center); // A − B
634 let cd = sub(c.center, d.center); // C − D
635 hrr_and_scatter(
636 la,
637 lb,
638 lc,
639 ld,
640 n_f,
641 c_ef,
642 ab,
643 cd,
644 out,
645 bra,
646 bra_prev,
647 bra_cur,
648 ket_prev,
649 ket_cur,
650 &tri_all[..],
651 );
652}
653
654/// Cartesian triples of degree 1, 2 and 3 in [`cart_components`] order, as `const`
655/// tables so the monomorphized small-class VRR kernels fully unroll over them
656/// (the structural walk — axes, addresses, `has2`/cross branches — then resolves
657/// at compile time instead of being re-derived on every primitive quartet).
658const TRI1: [[usize; 3]; 3] = [[1, 0, 0], [0, 1, 0], [0, 0, 1]];
659const TRI2: [[usize; 3]; 6] = [
660 [2, 0, 0],
661 [1, 1, 0],
662 [1, 0, 1],
663 [0, 2, 0],
664 [0, 1, 1],
665 [0, 0, 2],
666];
667const TRI3: [[usize; 3]; 10] = [
668 [3, 0, 0],
669 [2, 1, 0],
670 [2, 0, 1],
671 [1, 2, 0],
672 [1, 1, 1],
673 [1, 0, 2],
674 [0, 3, 0],
675 [0, 2, 1],
676 [0, 1, 2],
677 [0, 0, 3],
678];
679/// The e-components `0..n_addr(3)` in [`addr`] order with their degree —
680/// the bra index set of the small-class kernels (`NE ≤ 3`).
681const E_COMP: [([usize; 3], usize); 20] = [
682 ([0, 0, 0], 0),
683 ([1, 0, 0], 1),
684 ([0, 1, 0], 1),
685 ([0, 0, 1], 1),
686 ([2, 0, 0], 2),
687 ([1, 1, 0], 2),
688 ([1, 0, 1], 2),
689 ([0, 2, 0], 2),
690 ([0, 1, 1], 2),
691 ([0, 0, 2], 2),
692 ([3, 0, 0], 3),
693 ([2, 1, 0], 3),
694 ([2, 0, 1], 3),
695 ([1, 2, 0], 3),
696 ([1, 1, 1], 3),
697 ([1, 0, 2], 3),
698 ([0, 3, 0], 3),
699 ([0, 2, 1], 3),
700 ([0, 1, 2], 3),
701 ([0, 0, 3], 3),
702];
703
704/// Per-primitive-quartet scalar header consumed by [`vrr_small`]: everything the
705/// kernel needs that depends on the *pair of pairs* (not on either pair alone).
706/// The expressions are the former `vrr_small` prologue, verbatim — only *where*
707/// they are evaluated moves (into the strip loop of [`contract_class`]).
708#[derive(Clone, Copy, Default)]
709struct QuartetHeader {
710 wp: [f64; 3], // W − P
711 wq: [f64; 3], // W − Q
712 pref: f64,
713 inv_2pq: f64,
714 q_over_pq: f64,
715 p_over_pq: f64,
716 scale: f64, // ((c_a·c_b)·c_c)·c_d
717 t: f64, // Boys argument ρ·|P−Q|²
718}
719
720/// Ket-strip width of the strip-mined primitive loop in [`contract_class`].
721const STRIP: usize = 4;
722
723/// Contract a whole shell quartet of VRR shape `(NE, NF)` (with `NE, NF ≤ 3`)
724/// into `c_ef` using the monomorphized small-class kernel [`vrr_small`].
725///
726/// Same bra-outer / ket-inner primitive order and the same left-to-right
727/// `((c_a·c_b)·c_c)·c_d` scale as the general path, so the accumulation into
728/// `c_ef` is bit-identical. The ket loop is **strip-mined**: for a strip of up
729/// to [`STRIP`] ket pairs, first compute every scalar header (the serial
730/// sqrt/divide chain) plus its Boys row in one lane loop, then run the VRR
731/// body per lane — breaking the per-quartet header→Boys→VRR dependency chain
732/// so a lane's Boys/header work overlaps another lane's VRR in the pipeline.
733/// (A measured note: splitting Boys into its own third pass *regressed* —
734/// the extra `t` store/reload and loop overhead cost more than the split
735/// bought; the two-pass form is what wins.) Each lane's arithmetic
736/// (expressions, evaluation order, accumulation order into `c_ef`) is
737/// unchanged, so the result stays bit-identical.
738/// The VRR working tables live here (overwritten per primitive
739/// quartet in the region read, like the general path's `levels` arena) so
740/// their one-time zero-init is per shell quartet, not per primitive.
741fn contract_class<const NE: usize, const NF: usize>(
742 bra_pairs: &[PrimPair],
743 ket_pairs: &[PrimPair],
744 c_ef: &mut [f64],
745) {
746 // Worst-case (NE,NF)=(3,3) sizes: e-table 20 components × m ∈ 0..=6;
747 // f-degree-1 level 20×3 × m ∈ 0..=5; f-degree-2 level 20×6 × m ∈ 0..=4;
748 // f-degree-3 level 20×10 × m ∈ 0..=3.
749 let mut ea = [0.0f64; 140];
750 let mut eb1 = [0.0f64; 360];
751 let mut eb2 = [0.0f64; 600];
752 let mut eb3 = [0.0f64; 800];
753 let lt = NE + NF;
754 let mut hdr = [QuartetHeader::default(); STRIP];
755 let mut fm = [[0.0f64; 7]; STRIP];
756 for bra in bra_pairs {
757 let bra_coef = bra.c1 * bra.c2;
758 let (p, pc) = (bra.zeta, bra.center);
759 let mut start = 0;
760 while start < ket_pairs.len() {
761 let strip = &ket_pairs[start..(start + STRIP).min(ket_pairs.len())];
762 // Phase 1 — headers + Boys rows: identical expressions to
763 // `vrr_primitive`, independent iterations. Lanes are processed in
764 // PAIRS so the two Boys ladders evaluate through the manually
765 // interleaved [`boys_array2`] (two overlapping Horner/recurrence
766 // chains); each lane's own arithmetic is unchanged.
767 let header = |h: &mut QuartetHeader, ket: &PrimPair| {
768 let (q, qc) = (ket.zeta, ket.center);
769 let pq = p + q;
770 let rho = p * q / pq;
771 let pq_vec = sub(pc, qc); // P − Q
772 h.t = rho * norm2(pq_vec);
773 let w = [
774 (p * pc[0] + q * qc[0]) / pq,
775 (p * pc[1] + q * qc[1]) / pq,
776 (p * pc[2] + q * qc[2]) / pq,
777 ];
778 h.wp = sub(w, pc);
779 h.wq = sub(w, qc);
780 use std::f64::consts::PI;
781 h.pref = 2.0 * PI * PI * PI.sqrt() / (p * q * pq.sqrt()) * bra.kappa * ket.kappa;
782 h.inv_2pq = 0.5 / pq;
783 h.q_over_pq = q / pq;
784 h.p_over_pq = p / pq;
785 h.scale = (bra_coef * ket.c1) * ket.c2;
786 };
787 let len = strip.len();
788 let mut k = 0;
789 while k + 2 <= len {
790 header(&mut hdr[k], &strip[k]);
791 header(&mut hdr[k + 1], &strip[k + 1]);
792 let (f0, f1) = fm.split_at_mut(k + 1);
793 boys_array2(
794 lt,
795 hdr[k].t,
796 hdr[k + 1].t,
797 &mut f0[k][..=lt],
798 &mut f1[0][..=lt],
799 );
800 k += 2;
801 }
802 if k < len {
803 header(&mut hdr[k], &strip[k]);
804 boys_array(lt, hdr[k].t, &mut fm[k][..=lt]);
805 }
806 // Phase 2 — VRR body per lane, fed from the precomputed header.
807 for (k, ket) in strip.iter().enumerate() {
808 vrr_small::<NE, NF>(
809 bra, ket, &hdr[k], &fm[k], &mut ea, &mut eb1, &mut eb2, &mut eb3, c_ef,
810 );
811 }
812 start += STRIP;
813 }
814 }
815}
816
817/// One primitive quartet of the OS VRR for the small classes `NE, NF ≤ 3`,
818/// monomorphized per `(NE, NF)` — a per-L-class specialized kernel expressed
819/// through const generics: every loop bound, Cartesian triple, address, stride,
820/// and `has2`/cross branch below is a compile-time constant in each
821/// instantiation, so the emitted code is a straight unrolled FMA chain.
822///
823/// **Bit-identical to [`vrr_primitive`] by construction**: the same recurrence
824/// expressions with the same term order and association — base
825/// `pref·F_m`, raise `pa_i·s1[m] + wp_i·s1[m+1]`, then `+= coef2·(s2[m] −
826/// (q/pq)·s2[m+1])`, then `+= cross·s3[m+1]` — the same m-ranges
827/// (`m ≤ l_total − de − k`) for every *consumed* value, and the same
828/// `c_ef[ae·n_f + af] += scale·v` extraction. Two things differ, neither of
829/// which touches a kept value's arithmetic: the storage (fixed per-degree
830/// tables instead of the triangle-packed rolling window), and the final
831/// f-level computing only its consumed `m = 0` row (the general path also
832/// fills the dead `m > 0` triangle there).
833/// Guarded by the full-tensor XOR fingerprint and the golden/cross-engine tests.
834///
835/// Per-degree row strides: the m-rows of `ea` hold `m ∈ 0..=lt` (`sa = lt+1`),
836/// of `eb1` `m ∈ 0..=lt−1` (`s1 = lt`), of `eb2` `m ∈ 0..=lt−2`, of `eb3`
837/// `m ∈ 0..=lt−3` — each level's longest row (its degree-0 e-component).
838#[inline(always)]
839#[allow(clippy::too_many_arguments)]
840fn vrr_small<const NE: usize, const NF: usize>(
841 bra: &PrimPair,
842 ket: &PrimPair,
843 hdr: &QuartetHeader,
844 fm: &[f64; 7],
845 ea: &mut [f64; 140],
846 eb1: &mut [f64; 360],
847 eb2: &mut [f64; 600],
848 eb3: &mut [f64; 800],
849 c_ef: &mut [f64],
850) {
851 let lt = NE + NF;
852 let n_e = n_addr(NE);
853 let n_f = n_addr(NF);
854 // Row strides (compile-time constants per instantiation).
855 let sa = lt + 1;
856 let s1 = lt;
857 let s2 = lt.saturating_sub(1);
858 let s3 = lt.saturating_sub(2);
859 let pa = bra.r1; // P − A
860 let qcen = ket.r1; // Q − C
861 let (inv_2p, inv_2q) = (bra.inv_2zeta, ket.inv_2zeta);
862
863 // Scalar header + Boys row: precomputed by `contract_class` (phases 1–2).
864 let QuartetHeader {
865 wp,
866 wq,
867 pref,
868 inv_2pq,
869 q_over_pq,
870 p_over_pq,
871 scale,
872 t: _,
873 } = *hdr;
874 for m in 0..=lt {
875 ea[m] = pref * fm[m];
876 }
877
878 // Phase A — bra ladder: [e0|00]^(m), e-degrees 1..=NE, m ≤ lt − de.
879 if NE >= 1 {
880 for (iw, te) in TRI1.iter().enumerate() {
881 let i = lower_axis(*te);
882 // Source is [0,0,0] at addr 0; no has2 term at degree 1.
883 for m in 0..=(lt - 1) {
884 ea[(1 + iw) * sa + m] = pa[i] * ea[m] + wp[i] * ea[m + 1];
885 }
886 }
887 }
888 if NE >= 2 {
889 for (iw, te) in TRI2.iter().enumerate() {
890 let i = lower_axis(*te);
891 let s1a = addr(dec(*te, i));
892 let has2 = te[i] >= 2;
893 let coef2 = ((te[i] - 1) as f64) * inv_2p;
894 let s2a = if has2 { addr(dec(dec(*te, i), i)) } else { 0 };
895 for m in 0..=(lt - 2) {
896 let mut v = pa[i] * ea[s1a * sa + m] + wp[i] * ea[s1a * sa + m + 1];
897 if has2 {
898 v += coef2 * (ea[s2a * sa + m] - q_over_pq * ea[s2a * sa + m + 1]);
899 }
900 ea[(4 + iw) * sa + m] = v;
901 }
902 }
903 }
904 if NE >= 3 {
905 for (iw, te) in TRI3.iter().enumerate() {
906 let i = lower_axis(*te);
907 let s1a = addr(dec(*te, i));
908 let has2 = te[i] >= 2;
909 let coef2 = ((te[i] - 1) as f64) * inv_2p;
910 let s2a = if has2 { addr(dec(dec(*te, i), i)) } else { 0 };
911 for m in 0..=(lt - 3) {
912 let mut v = pa[i] * ea[s1a * sa + m] + wp[i] * ea[s1a * sa + m + 1];
913 if has2 {
914 v += coef2 * (ea[s2a * sa + m] - q_over_pq * ea[s2a * sa + m + 1]);
915 }
916 ea[(10 + iw) * sa + m] = v;
917 }
918 }
919 }
920 // Extract f-degree 0: contract m = 0 of every e-component.
921 for ae in 0..n_e {
922 c_ef[ae * n_f] += scale * ea[ae * sa];
923 }
924
925 // Phase B — ket raise, f-degree k: backward liveness gives every f-level-k
926 // row a needed m-range of `m ≤ NF − k`, independent of the e-degree: the
927 // final level (k = NF) only feeds the m = 0 extraction, and each level
928 // below it is read at m, m+1 by the next level's raise (its cross and the
929 // k+2 has2 reach no further). The general path's full triangle
930 // (`m ≤ lt − de − k`) computes `NE − de` dead values per row beyond that;
931 // skipping them removes whole computations and leaves every kept value
932 // bit-identical (fingerprint-guarded). Phase A has no such slack — its
933 // liveness works out to exactly the `lt − de` triangle it computes.
934 if NF >= 1 {
935 for (fw, tf) in TRI1.iter().enumerate() {
936 let j = lower_axis(*tf);
937 // Source f-component is [0,0,0] (level 0 = the phase-A table).
938 for ae in 0..n_e {
939 let (te, _) = E_COMP[ae];
940 let has_cross = te[j] >= 1;
941 let cross_coef = te[j] as f64 * inv_2pq;
942 let cs = if has_cross { addr(dec(te, j)) } else { 0 };
943 let mtop = NF - 1;
944 for m in 0..=mtop {
945 let mut v = qcen[j] * ea[ae * sa + m] + wq[j] * ea[ae * sa + m + 1];
946 if has_cross {
947 v += cross_coef * ea[cs * sa + m + 1];
948 }
949 eb1[(ae * 3 + fw) * s1 + m] = v;
950 }
951 }
952 }
953 for ae in 0..n_e {
954 for fw in 0..3 {
955 c_ef[ae * n_f + 1 + fw] += scale * eb1[(ae * 3 + fw) * s1];
956 }
957 }
958 }
959
960 // Phase B — ket raise, f-degree 2: m ≤ NF − 2 (liveness bound, see above).
961 if NF >= 2 {
962 for (fw, tf) in TRI2.iter().enumerate() {
963 let j = lower_axis(*tf);
964 let f1 = dec(*tf, j);
965 let lf1 = cart_index(f1); // f-component index in the degree-1 level
966 let has2 = tf[j] >= 2; // its k−2 source is then [0,0,0] (phase A)
967 let coef2 = ((tf[j] - 1) as f64) * inv_2q;
968 for ae in 0..n_e {
969 let (te, _) = E_COMP[ae];
970 let has_cross = te[j] >= 1;
971 let cross_coef = te[j] as f64 * inv_2pq;
972 let cs = if has_cross { addr(dec(te, j)) } else { 0 };
973 let mtop = NF - 2;
974 for m in 0..=mtop {
975 let mut v = qcen[j] * eb1[(ae * 3 + lf1) * s1 + m]
976 + wq[j] * eb1[(ae * 3 + lf1) * s1 + m + 1];
977 if has2 {
978 v += coef2 * (ea[ae * sa + m] - p_over_pq * ea[ae * sa + m + 1]);
979 }
980 if has_cross {
981 v += cross_coef * eb1[(cs * 3 + lf1) * s1 + m + 1];
982 }
983 eb2[(ae * 6 + fw) * s2 + m] = v;
984 }
985 }
986 }
987 for ae in 0..n_e {
988 for fw in 0..6 {
989 c_ef[ae * n_f + 4 + fw] += scale * eb2[(ae * 6 + fw) * s2];
990 }
991 }
992 }
993
994 // Phase B — ket raise, f-degree 3: always the final f-level (NF ≤ 3), so
995 // only m = 0 is computed (same dead-row trim as above).
996 if NF >= 3 {
997 for (fw, tf) in TRI3.iter().enumerate() {
998 let j = lower_axis(*tf);
999 let f1 = dec(*tf, j);
1000 let lf1 = cart_index(f1); // f-component index in the degree-2 level
1001 let has2 = tf[j] >= 2; // its k−2 source is f1 lowered again (degree 1)
1002 let coef2 = ((tf[j] - 1) as f64) * inv_2q;
1003 let lf2 = if has2 { cart_index(dec(f1, j)) } else { 0 };
1004 for ae in 0..n_e {
1005 let (te, _) = E_COMP[ae];
1006 let has_cross = te[j] >= 1;
1007 let cross_coef = te[j] as f64 * inv_2pq;
1008 let cs = if has_cross { addr(dec(te, j)) } else { 0 };
1009 let mtop = 0; // final level: m = 0 only
1010 for m in 0..=mtop {
1011 let mut v = qcen[j] * eb2[(ae * 6 + lf1) * s2 + m]
1012 + wq[j] * eb2[(ae * 6 + lf1) * s2 + m + 1];
1013 if has2 {
1014 v += coef2
1015 * (eb1[(ae * 3 + lf2) * s1 + m]
1016 - p_over_pq * eb1[(ae * 3 + lf2) * s1 + m + 1]);
1017 }
1018 if has_cross {
1019 v += cross_coef * eb2[(cs * 6 + lf1) * s2 + m + 1];
1020 }
1021 eb3[(ae * 10 + fw) * s3 + m] = v;
1022 }
1023 }
1024 }
1025 for ae in 0..n_e {
1026 for fw in 0..10 {
1027 c_ef[ae * n_f + 10 + fw] += scale * eb3[(ae * 10 + fw) * s3];
1028 }
1029 }
1030 }
1031}
1032
1033// --- 4-lane batch across shell quartets (libint2-style class vectorization) ---
1034
1035/// Four-wide value: one shell quartet per lane. All lane arithmetic below is
1036/// elementwise, so each quartet's numbers never mix across lanes — the batch
1037/// path is bit-identical to four scalar evaluations by construction.
1038type V4 = [f64; 4];
1039
1040#[inline(always)]
1041fn v4_mul(a: V4, b: V4) -> V4 {
1042 [a[0] * b[0], a[1] * b[1], a[2] * b[2], a[3] * b[3]]
1043}
1044
1045#[inline(always)]
1046fn v4_add(a: V4, b: V4) -> V4 {
1047 [a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3]]
1048}
1049
1050#[inline(always)]
1051fn v4_sub(a: V4, b: V4) -> V4 {
1052 [a[0] - b[0], a[1] - b[1], a[2] - b[2], a[3] - b[3]]
1053}
1054
1055#[inline(always)]
1056fn v4_scale(s: f64, a: V4) -> V4 {
1057 [s * a[0], s * a[1], s * a[2], s * a[3]]
1058}
1059
1060/// Lane-major [`QuartetHeader`] for the 4-quartet batch kernel: the same scalar
1061/// header fields, one lane per quartet. Each lane is filled by the exact
1062/// expression sequence of the scalar header (see [`contract_class`]).
1063#[derive(Clone, Copy, Default)]
1064struct QuartetHeader4 {
1065 wp: [V4; 3],
1066 wq: [V4; 3],
1067 pref: V4,
1068 inv_2pq: V4,
1069 q_over_pq: V4,
1070 p_over_pq: V4,
1071 scale: V4,
1072 t: V4,
1073}
1074
1075/// Contract four shell quartets of the same VRR shape `(NE, NF)` (`NE, NF ≤ 3`)
1076/// in lockstep, one quartet per lane. Every lane must have the **same bra and
1077/// ket pair counts** (the caller buckets quartets that way), so the lanes run
1078/// the identical primitive loop with no padding — no `0·∞`/`-0.0` hazards.
1079///
1080/// Per lane the primitive order (bra outer, ket inner), header expressions,
1081/// Boys ladder ([`boys_array4`], lanes bitwise equal to [`boys_array`]) and the
1082/// VRR/accumulation arithmetic of [`vrr_small4`] are exactly the scalar
1083/// [`contract_class`]/[`vrr_small`] sequence, so each lane's `c_ef` is
1084/// bit-identical to a scalar evaluation of that quartet (fingerprint-guarded).
1085// `ib`/`ik` index four parallel pair slices in lockstep — a range loop is the
1086// clear form (no single iterable to zip without allocation).
1087#[allow(clippy::needless_range_loop)]
1088fn contract_class4<const NE: usize, const NF: usize>(
1089 bra_pairs: [&[PrimPair]; 4],
1090 ket_pairs: [&[PrimPair]; 4],
1091 c_ef: &mut [&mut [f64]; 4],
1092) {
1093 let lt = NE + NF;
1094 let n_bra = bra_pairs[0].len();
1095 let n_ket = ket_pairs[0].len();
1096 debug_assert!(bra_pairs.iter().all(|b| b.len() == n_bra));
1097 debug_assert!(ket_pairs.iter().all(|k| k.len() == n_ket));
1098 // Worst-case (NE,NF)=(3,3) sizes, the scalar `contract_class` tables widened
1099 // to V4 (~61 KB of stack total — fine on both the main thread and the
1100 // `EriBuilder` workers): e-table 20 components × m ∈ 0..=6; f-degree-1
1101 // level 20×3 × m ∈ 0..=5; f-degree-2 level 20×6 × m ∈ 0..=4; f-degree-3
1102 // level 20×10 × m ∈ 0..=3.
1103 let mut ea = [[0.0f64; 4]; 140];
1104 let mut eb1 = [[0.0f64; 4]; 360];
1105 let mut eb2 = [[0.0f64; 4]; 600];
1106 let mut eb3 = [[0.0f64; 4]; 800];
1107 let mut hdr = QuartetHeader4::default();
1108 let mut fm = [[0.0f64; 4]; 7];
1109 for ib in 0..n_bra {
1110 let bras = [
1111 &bra_pairs[0][ib],
1112 &bra_pairs[1][ib],
1113 &bra_pairs[2][ib],
1114 &bra_pairs[3][ib],
1115 ];
1116 let mut pa = [[0.0f64; 4]; 3];
1117 let mut inv_2p = [0.0f64; 4];
1118 let mut bra_coef = [0.0f64; 4];
1119 for (lane, bra) in bras.iter().enumerate() {
1120 for (dst, &src) in pa.iter_mut().zip(bra.r1.iter()) {
1121 dst[lane] = src;
1122 }
1123 inv_2p[lane] = bra.inv_2zeta;
1124 bra_coef[lane] = bra.c1 * bra.c2;
1125 }
1126 for ik in 0..n_ket {
1127 let kets = [
1128 &ket_pairs[0][ik],
1129 &ket_pairs[1][ik],
1130 &ket_pairs[2][ik],
1131 &ket_pairs[3][ik],
1132 ];
1133 let mut qcen = [[0.0f64; 4]; 3];
1134 let mut inv_2q = [0.0f64; 4];
1135 for (lane, (bra, ket)) in bras.iter().zip(kets.iter()).enumerate() {
1136 // The scalar header expressions of `contract_class`, verbatim,
1137 // one lane per quartet.
1138 let (p, pc) = (bra.zeta, bra.center);
1139 let (q, qc) = (ket.zeta, ket.center);
1140 let pq = p + q;
1141 let rho = p * q / pq;
1142 let pq_vec = sub(pc, qc); // P − Q
1143 hdr.t[lane] = rho * norm2(pq_vec);
1144 let w = [
1145 (p * pc[0] + q * qc[0]) / pq,
1146 (p * pc[1] + q * qc[1]) / pq,
1147 (p * pc[2] + q * qc[2]) / pq,
1148 ];
1149 let wp = sub(w, pc);
1150 let wq = sub(w, qc);
1151 for ax in 0..3 {
1152 hdr.wp[ax][lane] = wp[ax];
1153 hdr.wq[ax][lane] = wq[ax];
1154 qcen[ax][lane] = ket.r1[ax];
1155 }
1156 use std::f64::consts::PI;
1157 hdr.pref[lane] =
1158 2.0 * PI * PI * PI.sqrt() / (p * q * pq.sqrt()) * bra.kappa * ket.kappa;
1159 hdr.inv_2pq[lane] = 0.5 / pq;
1160 hdr.q_over_pq[lane] = q / pq;
1161 hdr.p_over_pq[lane] = p / pq;
1162 hdr.scale[lane] = (bra_coef[lane] * ket.c1) * ket.c2;
1163 inv_2q[lane] = ket.inv_2zeta;
1164 }
1165 boys_array4(lt, hdr.t, &mut fm[..=lt]);
1166 vrr_small4::<NE, NF>(
1167 &pa, &qcen, inv_2p, inv_2q, &hdr, &fm, &mut ea, &mut eb1, &mut eb2, &mut eb3, c_ef,
1168 );
1169 }
1170 }
1171}
1172
1173/// Four primitive quartets of the small-class OS VRR (`NE, NF ≤ 3`) in lockstep,
1174/// one quartet per [`V4`] lane — the 4-wide counterpart of [`vrr_small`].
1175///
1176/// **Bit-identical to [`vrr_small`] per lane by construction**: every statement
1177/// below is the scalar kernel's statement with each `f64` widened to a [`V4`]
1178/// whose lanes never interact — same expressions, same term order and
1179/// association, same m-ranges (the round-6/7 liveness trims included), and the
1180/// same per-lane `c_ef[ae·n_f + af] += scale·v` extraction order.
1181#[inline(always)]
1182#[allow(clippy::too_many_arguments)]
1183fn vrr_small4<const NE: usize, const NF: usize>(
1184 pa: &[V4; 3],
1185 qcen: &[V4; 3],
1186 inv_2p: V4,
1187 inv_2q: V4,
1188 hdr: &QuartetHeader4,
1189 fm: &[V4; 7],
1190 ea: &mut [V4; 140],
1191 eb1: &mut [V4; 360],
1192 eb2: &mut [V4; 600],
1193 eb3: &mut [V4; 800],
1194 c_ef: &mut [&mut [f64]; 4],
1195) {
1196 const { assert!(NE <= 3 && NF <= 3, "batch kernel covers NE, NF <= 3") };
1197 let lt = NE + NF;
1198 let n_e = n_addr(NE);
1199 let n_f = n_addr(NF);
1200 let sa = lt + 1;
1201 let s1 = lt;
1202 let s2 = lt.saturating_sub(1);
1203 let s3 = lt.saturating_sub(2);
1204 let QuartetHeader4 {
1205 wp,
1206 wq,
1207 pref,
1208 inv_2pq,
1209 q_over_pq,
1210 p_over_pq,
1211 scale,
1212 t: _,
1213 } = *hdr;
1214 for m in 0..=lt {
1215 ea[m] = v4_mul(pref, fm[m]);
1216 }
1217
1218 // Phase A — bra ladder: [e0|00]^(m), e-degrees 1..=NE, m ≤ lt − de.
1219 if NE >= 1 {
1220 for (iw, te) in TRI1.iter().enumerate() {
1221 let i = lower_axis(*te);
1222 for m in 0..=(lt - 1) {
1223 ea[(1 + iw) * sa + m] = v4_add(v4_mul(pa[i], ea[m]), v4_mul(wp[i], ea[m + 1]));
1224 }
1225 }
1226 }
1227 if NE >= 2 {
1228 for (iw, te) in TRI2.iter().enumerate() {
1229 let i = lower_axis(*te);
1230 let s1a = addr(dec(*te, i));
1231 let has2 = te[i] >= 2;
1232 let coef2 = v4_scale((te[i] - 1) as f64, inv_2p);
1233 let s2a = if has2 { addr(dec(dec(*te, i), i)) } else { 0 };
1234 for m in 0..=(lt - 2) {
1235 let mut v = v4_add(
1236 v4_mul(pa[i], ea[s1a * sa + m]),
1237 v4_mul(wp[i], ea[s1a * sa + m + 1]),
1238 );
1239 if has2 {
1240 v = v4_add(
1241 v,
1242 v4_mul(
1243 coef2,
1244 v4_sub(ea[s2a * sa + m], v4_mul(q_over_pq, ea[s2a * sa + m + 1])),
1245 ),
1246 );
1247 }
1248 ea[(4 + iw) * sa + m] = v;
1249 }
1250 }
1251 }
1252 if NE >= 3 {
1253 for (iw, te) in TRI3.iter().enumerate() {
1254 let i = lower_axis(*te);
1255 let s1a = addr(dec(*te, i));
1256 let has2 = te[i] >= 2;
1257 let coef2 = v4_scale((te[i] - 1) as f64, inv_2p);
1258 let s2a = if has2 { addr(dec(dec(*te, i), i)) } else { 0 };
1259 for m in 0..=(lt - 3) {
1260 let mut v = v4_add(
1261 v4_mul(pa[i], ea[s1a * sa + m]),
1262 v4_mul(wp[i], ea[s1a * sa + m + 1]),
1263 );
1264 if has2 {
1265 v = v4_add(
1266 v,
1267 v4_mul(
1268 coef2,
1269 v4_sub(ea[s2a * sa + m], v4_mul(q_over_pq, ea[s2a * sa + m + 1])),
1270 ),
1271 );
1272 }
1273 ea[(10 + iw) * sa + m] = v;
1274 }
1275 }
1276 }
1277 // Extract f-degree 0: contract m = 0 of every e-component, per lane.
1278 for ae in 0..n_e {
1279 let v = ea[ae * sa];
1280 for lane in 0..4 {
1281 c_ef[lane][ae * n_f] += scale[lane] * v[lane];
1282 }
1283 }
1284
1285 // Phase B — ket raise, f-degree k: liveness m-range `m ≤ NF − k` (see
1286 // [`vrr_small`]'s Phase B comment for the derivation).
1287 if NF >= 1 {
1288 for (fw, tf) in TRI1.iter().enumerate() {
1289 let j = lower_axis(*tf);
1290 for ae in 0..n_e {
1291 let (te, _) = E_COMP[ae];
1292 let has_cross = te[j] >= 1;
1293 let cross_coef = v4_scale(te[j] as f64, inv_2pq);
1294 let cs = if has_cross { addr(dec(te, j)) } else { 0 };
1295 let mtop = NF - 1;
1296 for m in 0..=mtop {
1297 let mut v = v4_add(
1298 v4_mul(qcen[j], ea[ae * sa + m]),
1299 v4_mul(wq[j], ea[ae * sa + m + 1]),
1300 );
1301 if has_cross {
1302 v = v4_add(v, v4_mul(cross_coef, ea[cs * sa + m + 1]));
1303 }
1304 eb1[(ae * 3 + fw) * s1 + m] = v;
1305 }
1306 }
1307 }
1308 for ae in 0..n_e {
1309 for fw in 0..3 {
1310 let v = eb1[(ae * 3 + fw) * s1];
1311 for lane in 0..4 {
1312 c_ef[lane][ae * n_f + 1 + fw] += scale[lane] * v[lane];
1313 }
1314 }
1315 }
1316 }
1317
1318 if NF >= 2 {
1319 for (fw, tf) in TRI2.iter().enumerate() {
1320 let j = lower_axis(*tf);
1321 let f1 = dec(*tf, j);
1322 let lf1 = cart_index(f1);
1323 let has2 = tf[j] >= 2; // its k−2 source is then [0,0,0] (phase A)
1324 let coef2 = v4_scale((tf[j] - 1) as f64, inv_2q);
1325 for ae in 0..n_e {
1326 let (te, _) = E_COMP[ae];
1327 let has_cross = te[j] >= 1;
1328 let cross_coef = v4_scale(te[j] as f64, inv_2pq);
1329 let cs = if has_cross { addr(dec(te, j)) } else { 0 };
1330 let mtop = NF - 2;
1331 for m in 0..=mtop {
1332 let mut v = v4_add(
1333 v4_mul(qcen[j], eb1[(ae * 3 + lf1) * s1 + m]),
1334 v4_mul(wq[j], eb1[(ae * 3 + lf1) * s1 + m + 1]),
1335 );
1336 if has2 {
1337 v = v4_add(
1338 v,
1339 v4_mul(
1340 coef2,
1341 v4_sub(ea[ae * sa + m], v4_mul(p_over_pq, ea[ae * sa + m + 1])),
1342 ),
1343 );
1344 }
1345 if has_cross {
1346 v = v4_add(v, v4_mul(cross_coef, eb1[(cs * 3 + lf1) * s1 + m + 1]));
1347 }
1348 eb2[(ae * 6 + fw) * s2 + m] = v;
1349 }
1350 }
1351 }
1352 for ae in 0..n_e {
1353 for fw in 0..6 {
1354 let v = eb2[(ae * 6 + fw) * s2];
1355 for lane in 0..4 {
1356 c_ef[lane][ae * n_f + 4 + fw] += scale[lane] * v[lane];
1357 }
1358 }
1359 }
1360 }
1361
1362 // Phase B — ket raise, f-degree 3: always the final f-level (NF ≤ 3), so
1363 // only m = 0 is computed (the scalar `vrr_small` block, widened to V4).
1364 if NF >= 3 {
1365 for (fw, tf) in TRI3.iter().enumerate() {
1366 let j = lower_axis(*tf);
1367 let f1 = dec(*tf, j);
1368 let lf1 = cart_index(f1); // f-component index in the degree-2 level
1369 let has2 = tf[j] >= 2; // its k−2 source is f1 lowered again (degree 1)
1370 let coef2 = v4_scale((tf[j] - 1) as f64, inv_2q);
1371 let lf2 = if has2 { cart_index(dec(f1, j)) } else { 0 };
1372 for ae in 0..n_e {
1373 let (te, _) = E_COMP[ae];
1374 let has_cross = te[j] >= 1;
1375 let cross_coef = v4_scale(te[j] as f64, inv_2pq);
1376 let cs = if has_cross { addr(dec(te, j)) } else { 0 };
1377 let mtop = 0; // final level: m = 0 only
1378 for m in 0..=mtop {
1379 let mut v = v4_add(
1380 v4_mul(qcen[j], eb2[(ae * 6 + lf1) * s2 + m]),
1381 v4_mul(wq[j], eb2[(ae * 6 + lf1) * s2 + m + 1]),
1382 );
1383 if has2 {
1384 v = v4_add(
1385 v,
1386 v4_mul(
1387 coef2,
1388 v4_sub(
1389 eb1[(ae * 3 + lf2) * s1 + m],
1390 v4_mul(p_over_pq, eb1[(ae * 3 + lf2) * s1 + m + 1]),
1391 ),
1392 ),
1393 );
1394 }
1395 if has_cross {
1396 v = v4_add(v, v4_mul(cross_coef, eb2[(cs * 6 + lf1) * s2 + m + 1]));
1397 }
1398 eb3[(ae * 10 + fw) * s3 + m] = v;
1399 }
1400 }
1401 }
1402 for ae in 0..n_e {
1403 for fw in 0..10 {
1404 let v = eb3[(ae * 10 + fw) * s3];
1405 for lane in 0..4 {
1406 c_ef[lane][ae * n_f + 10 + fw] += scale[lane] * v[lane];
1407 }
1408 }
1409 }
1410 }
1411}
1412
1413/// Four `(ss|ss)` shell quartets in lockstep — the 4-lane counterpart of the
1414/// scalar `(ss|ss)` fast path in [`coulomb_shell_into_scratch`]: per lane the
1415/// same `t`/`pref` expressions, the same `((c_a·c_b)·c_c)·c_d` scale, the same
1416/// `scale·(pref·F_0)` term and the same bra-outer/ket-inner accumulation order
1417/// into a single per-lane register, so each lane is bit-identical to the
1418/// scalar path. Lanes must have equal bra/ket pair counts (caller-bucketed).
1419#[allow(clippy::needless_range_loop)] // ib/ik index four parallel slices in lockstep
1420fn contract_ssss4(
1421 bra_pairs: [&[PrimPair]; 4],
1422 ket_pairs: [&[PrimPair]; 4],
1423 outs: &mut [&mut [f64]; 4],
1424) {
1425 let n_bra = bra_pairs[0].len();
1426 let n_ket = ket_pairs[0].len();
1427 debug_assert!(bra_pairs.iter().all(|b| b.len() == n_bra));
1428 debug_assert!(ket_pairs.iter().all(|k| k.len() == n_ket));
1429 let two_pi_2_5 =
1430 2.0 * std::f64::consts::PI * std::f64::consts::PI * std::f64::consts::PI.sqrt();
1431 let mut acc = [0.0f64; 4];
1432 let mut fm = [[0.0f64; 4]; 1];
1433 for ib in 0..n_bra {
1434 let bras = [
1435 &bra_pairs[0][ib],
1436 &bra_pairs[1][ib],
1437 &bra_pairs[2][ib],
1438 &bra_pairs[3][ib],
1439 ];
1440 let mut bc = [0.0f64; 4];
1441 for (lane, bra) in bras.iter().enumerate() {
1442 bc[lane] = bra.c1 * bra.c2;
1443 }
1444 for ik in 0..n_ket {
1445 let kets = [
1446 &ket_pairs[0][ik],
1447 &ket_pairs[1][ik],
1448 &ket_pairs[2][ik],
1449 &ket_pairs[3][ik],
1450 ];
1451 let mut t = [0.0f64; 4];
1452 let mut pref = [0.0f64; 4];
1453 for (lane, (bra, ket)) in bras.iter().zip(kets.iter()).enumerate() {
1454 let p = bra.zeta;
1455 let q = ket.zeta;
1456 let pq = p + q;
1457 t[lane] = (p * q / pq) * dist2(bra.center, ket.center);
1458 pref[lane] = two_pi_2_5 / (p * q * pq.sqrt()) * bra.kappa * ket.kappa;
1459 }
1460 boys_array4(0, t, &mut fm);
1461 for (lane, ket) in kets.iter().enumerate() {
1462 acc[lane] += (bc[lane] * ket.c1) * ket.c2 * (pref[lane] * fm[0][lane]);
1463 }
1464 }
1465 }
1466 for (lane, out) in outs.iter_mut().enumerate() {
1467 out[0] += acc[lane];
1468 }
1469}
1470
1471/// Number of primitive pairs of two shells that survive the
1472/// [`PAIR_NEGLIGIBLE`] screen — the pair count [`build_pairs`] would produce.
1473/// Drivers bucketing shell quartets for [`coulomb_shell_batch4_into_scratch`]
1474/// use this (once per shell pair) to group quartets whose lanes run the
1475/// primitive loop in true lockstep.
1476#[must_use]
1477pub fn surviving_pair_count(s1: ShellRef<'_>, s2: ShellRef<'_>) -> usize {
1478 let d2 = dist2(s1.center, s2.center);
1479 let mut n = 0;
1480 for (&e1, &c1) in s1.exps.iter().zip(s1.coeffs.iter()) {
1481 for (&e2, &c2) in s2.exps.iter().zip(s2.coeffs.iter()) {
1482 let zeta = e1 + e2;
1483 let kappa = (-(e1 * e2 / zeta) * d2).exp();
1484 if kappa * (c1 * c2).abs() >= PAIR_NEGLIGIBLE {
1485 n += 1;
1486 }
1487 }
1488 }
1489 n
1490}
1491
1492/// Reusable buffers for the 4-quartet batch entry
1493/// [`coulomb_shell_batch4_into_scratch`]: per-lane pair lists and `c_ef`
1494/// accumulators, plus a scalar [`EriScratch`] for the per-lane HRR (and the
1495/// defensive scalar fallback).
1496#[derive(Debug, Default)]
1497pub struct EriBatch4Scratch {
1498 bra_pairs: [Vec<PrimPair>; 4],
1499 ket_pairs: [Vec<PrimPair>; 4],
1500 c_ef: [Vec<f64>; 4],
1501 scalar: EriScratch,
1502}
1503
1504/// Evaluate **four shell quartets in lockstep** — `quartets[lane] = [a, b, c, d]`
1505/// — accumulating each lane's contracted Cartesian block into `outs[lane]`
1506/// (same layout/contract as [`coulomb_shell_into`]).
1507///
1508/// All four quartets must share the VRR shape `(ne, nf) = (la+lb, lc+ld)` with
1509/// `ne, nf ≤ 3`, and have equal surviving bra and ket pair
1510/// counts (see [`surviving_pair_count`]) so the lanes run the primitive loop in
1511/// true lockstep. Quartets violating that (which a class-bucketing driver never
1512/// sends) are evaluated through the scalar path instead — same values either
1513/// way: each lane's result is **bit-identical** to a [`coulomb_shell_into`]
1514/// call for that quartet.
1515///
1516/// # Panics
1517/// Panics (in debug builds) if any `l > MAX_L` or any `outs[lane]` is too short.
1518pub fn coulomb_shell_batch4_into_scratch(
1519 scratch: &mut EriBatch4Scratch,
1520 quartets: &[[ShellRef<'_>; 4]; 4],
1521 outs: &mut [&mut [f64]; 4],
1522) {
1523 // Build the per-lane pair lists into the reused buffers (moved out for the
1524 // duration of the core call, like the scalar entry), then run the shared
1525 // core on borrowed slices.
1526 let mut bra_pairs = std::mem::take(&mut scratch.bra_pairs);
1527 let mut ket_pairs = std::mem::take(&mut scratch.ket_pairs);
1528 for (lane, &[a, b, c, d]) in quartets.iter().enumerate() {
1529 build_pairs(&mut bra_pairs[lane], a, b);
1530 build_pairs(&mut ket_pairs[lane], c, d);
1531 }
1532 let bra = [
1533 &bra_pairs[0][..],
1534 &bra_pairs[1][..],
1535 &bra_pairs[2][..],
1536 &bra_pairs[3][..],
1537 ];
1538 let ket = [
1539 &ket_pairs[0][..],
1540 &ket_pairs[1][..],
1541 &ket_pairs[2][..],
1542 &ket_pairs[3][..],
1543 ];
1544 coulomb_shell_batch4_core(scratch, quartets, bra, ket, outs);
1545 scratch.bra_pairs = bra_pairs;
1546 scratch.ket_pairs = ket_pairs;
1547}
1548
1549/// Like [`coulomb_shell_batch4_into_scratch`] but with each lane's bra/ket
1550/// primitive-pair data supplied by the caller (precomputed once per shell pair
1551/// across a dense build, see [`ShellPairData`]). Bit-identical to the
1552/// self-building entry: same pair values and order, only computed elsewhere.
1553///
1554/// # Panics
1555/// Panics (in debug builds) if any `l > MAX_L` or any `outs[lane]` is too short.
1556pub fn coulomb_shell_batch4_pairs_into_scratch(
1557 scratch: &mut EriBatch4Scratch,
1558 quartets: &[[ShellRef<'_>; 4]; 4],
1559 bra_pairs: [&ShellPairData; 4],
1560 ket_pairs: [&ShellPairData; 4],
1561 outs: &mut [&mut [f64]; 4],
1562) {
1563 coulomb_shell_batch4_core(
1564 scratch,
1565 quartets,
1566 bra_pairs.map(|p| &p.pairs[..]),
1567 ket_pairs.map(|p| &p.pairs[..]),
1568 outs,
1569 );
1570}
1571
1572/// The 4-lane batch kernel shared by the self-building and borrowed-pairs
1573/// entries. Lanes that violate the lockstep contract (mismatched VRR shape,
1574/// `ne`/`nf > 3`, or unequal pair counts — which a class-bucketing driver never
1575/// sends) drain through the scalar core with the same pair lists.
1576fn coulomb_shell_batch4_core(
1577 scratch: &mut EriBatch4Scratch,
1578 quartets: &[[ShellRef<'_>; 4]; 4],
1579 bra: [&[PrimPair]; 4],
1580 ket: [&[PrimPair]; 4],
1581 outs: &mut [&mut [f64]; 4],
1582) {
1583 let [a0, b0, c0, d0] = quartets[0];
1584 let ne = a0.l + b0.l;
1585 let nf = c0.l + d0.l;
1586 let lanes_match = quartets
1587 .iter()
1588 .all(|[a, b, c, d]| a.l + b.l == ne && c.l + d.l == nf);
1589 let n_bra = bra[0].len();
1590 let n_ket = ket[0].len();
1591 let lockstep = lanes_match
1592 && ne <= 3
1593 && nf <= 3
1594 && bra.iter().all(|p| p.len() == n_bra)
1595 && ket.iter().all(|p| p.len() == n_ket);
1596 if !lockstep {
1597 for (lane, &[a, b, c, d]) in quartets.iter().enumerate() {
1598 coulomb_shell_core(
1599 &mut scratch.scalar,
1600 a,
1601 b,
1602 c,
1603 d,
1604 bra[lane],
1605 ket[lane],
1606 outs[lane],
1607 );
1608 }
1609 return;
1610 }
1611
1612 // (ss|ss): the 4-lane fast path (no VRR/HRR scaffolding, like the scalar
1613 // fast path).
1614 if ne + nf == 0 {
1615 contract_ssss4(bra, ket, outs);
1616 return;
1617 }
1618
1619 let n_e = n_addr(ne);
1620 let n_f = n_addr(nf);
1621 for lane in 0..4 {
1622 ensure_len(&mut scratch.c_ef[lane], n_e * n_f);
1623 scratch.c_ef[lane][..n_e * n_f].fill(0.0);
1624 }
1625 {
1626 let [c0, c1, c2, c3] = &mut scratch.c_ef;
1627 let mut c_ef: [&mut [f64]; 4] = [
1628 &mut c0[..n_e * n_f],
1629 &mut c1[..n_e * n_f],
1630 &mut c2[..n_e * n_f],
1631 &mut c3[..n_e * n_f],
1632 ];
1633 match (ne, nf) {
1634 (0, 1) => contract_class4::<0, 1>(bra, ket, &mut c_ef),
1635 (1, 0) => contract_class4::<1, 0>(bra, ket, &mut c_ef),
1636 (1, 1) => contract_class4::<1, 1>(bra, ket, &mut c_ef),
1637 (0, 2) => contract_class4::<0, 2>(bra, ket, &mut c_ef),
1638 (2, 0) => contract_class4::<2, 0>(bra, ket, &mut c_ef),
1639 (1, 2) => contract_class4::<1, 2>(bra, ket, &mut c_ef),
1640 (2, 1) => contract_class4::<2, 1>(bra, ket, &mut c_ef),
1641 (2, 2) => contract_class4::<2, 2>(bra, ket, &mut c_ef),
1642 (0, 3) => contract_class4::<0, 3>(bra, ket, &mut c_ef),
1643 (3, 0) => contract_class4::<3, 0>(bra, ket, &mut c_ef),
1644 (1, 3) => contract_class4::<1, 3>(bra, ket, &mut c_ef),
1645 (3, 1) => contract_class4::<3, 1>(bra, ket, &mut c_ef),
1646 (2, 3) => contract_class4::<2, 3>(bra, ket, &mut c_ef),
1647 (3, 2) => contract_class4::<3, 2>(bra, ket, &mut c_ef),
1648 (3, 3) => contract_class4::<3, 3>(bra, ket, &mut c_ef),
1649 _ => unreachable!("guarded above"),
1650 }
1651 }
1652
1653 // Per-lane HRR + scatter, exactly the scalar tail of
1654 // [`coulomb_shell_into_scratch`].
1655 let EriScratch {
1656 bra,
1657 bra_prev,
1658 bra_cur,
1659 ket_prev,
1660 ket_cur,
1661 tri_all,
1662 ..
1663 } = &mut scratch.scalar;
1664 if tri_all.len() <= 2 * MAX_L {
1665 *tri_all = (0..=2 * MAX_L).map(cart_components).collect();
1666 }
1667 for (lane, &[a, b, c, d]) in quartets.iter().enumerate() {
1668 let ab = sub(a.center, b.center);
1669 let cd = sub(c.center, d.center);
1670 hrr_and_scatter(
1671 a.l,
1672 b.l,
1673 c.l,
1674 d.l,
1675 n_f,
1676 &scratch.c_ef[lane],
1677 ab,
1678 cd,
1679 outs[lane],
1680 bra,
1681 bra_prev,
1682 bra_cur,
1683 ket_prev,
1684 ket_cur,
1685 &tri_all[..],
1686 );
1687 }
1688}
1689
1690/// Build `[e0|f0]^(m)` for one primitive quartet by **m-marching** over the `f`
1691/// degree and accumulate `scale · [e0|f0]^(0)` into the contracted `c_ef` table.
1692///
1693/// Algebraically identical to the former full-table VRR (same OS recurrences, same
1694/// term order) — only the storage changes, so it is bit-identical
1695/// (B0 golden snapshot). `levels` holds 3 rolling `f`-degree levels in slots
1696/// `k % 3`, each `maxlevel` long; within slot `k`, element `[e0|f0]^(m)` for the
1697/// `f`-component with local index `lf` (= `cart_index(f)`) lives at
1698/// `(k%3)·maxlevel + lf·slab[k] + eoff[k][addr(e)] + m`. The `m`-rows are
1699/// triangle-packed (`m ∈ 0..=(l_total−de−k)`), so the full `[e0|f0]^(m)` table is
1700/// never resident. Each level's `m=0` slice is extracted into `c_ef` before its
1701/// buffer is recycled.
1702#[allow(clippy::too_many_arguments)]
1703fn vrr_primitive(
1704 p: f64,
1705 q: f64,
1706 pc: Vec3,
1707 qc: Vec3,
1708 kab: f64,
1709 kcd: f64,
1710 pa: Vec3, // P − A (bra pair, precomputed)
1711 qcen: Vec3, // Q − C (ket pair, precomputed)
1712 inv_2p: f64, // 1/(2p) (bra pair, precomputed)
1713 inv_2q: f64, // 1/(2q) (ket pair, precomputed)
1714 ne: usize,
1715 nf: usize,
1716 l_total: usize,
1717 n_e: usize,
1718 n_f: usize,
1719 maxlevel: usize,
1720 eoff: &[usize],
1721 slab: &[usize],
1722 tri: &[Vec<[usize; 3]>],
1723 levels: &mut [f64],
1724 scale: f64,
1725 c_ef: &mut [f64],
1726) {
1727 let pq = p + q;
1728 let rho = p * q / pq;
1729 let pq_vec = sub(pc, qc); // P − Q
1730 let t = rho * norm2(pq_vec);
1731 let w = [
1732 (p * pc[0] + q * qc[0]) / pq,
1733 (p * pc[1] + q * qc[1]) / pq,
1734 (p * pc[2] + q * qc[2]) / pq,
1735 ];
1736 // `pa` (P−A) and `qcen` (Q−C) are now passed in (pure per-pair).
1737 let wp = sub(w, pc); // W − P
1738 let wq = sub(w, qc); // W − Q
1739
1740 // Index of `[e0|f0]^(m)` (f-component local index `lf`, e at `addr(e)=ae`) in
1741 // the rolling buffer for f-degree `k`. Reads/writes `levels` separately (f64 is
1742 // Copy, so indexing never holds a borrow), so the same `levels` slice serves as
1743 // source and destination across the distinct slots `k`, `k−1`, `k−2`.
1744 let off = |k: usize, lf: usize, ae: usize, m: usize| -> usize {
1745 (k % 3) * maxlevel + lf * slab[k] + eoff[k * n_e + ae] + m
1746 };
1747
1748 // Base [00|00]^(m) = pref · F_m(T). The Boys index m reaches
1749 // l_total = la+lb+lc+ld ≤ 4·MAX_L (NOT 2·MAX_L — that is the per-electron
1750 // bound; an ERI couples both electrons).
1751 use std::f64::consts::PI;
1752 let pref = 2.0 * PI * PI * PI.sqrt() / (p * q * pq.sqrt()) * kab * kcd;
1753 let mut fm = [0.0f64; 4 * MAX_L + 1];
1754 boys_array(l_total, t, &mut fm[..=l_total]);
1755 for m in 0..=l_total {
1756 levels[off(0, 0, 0, m)] = pref * fm[m];
1757 }
1758
1759 // `inv_2p` / `inv_2q` are now passed in (pure per-pair).
1760 let inv_2pq = 0.5 / pq;
1761 let q_over_pq = q / pq;
1762 let p_over_pq = p / pq;
1763
1764 // Phase A — bra ladder (f-degree 0, level 0): build [e0|00]^(m) by raising e.
1765 // Level 0 lives in slot 0, f-component 0, so the buffer offset of `[e0|00]^(m)`
1766 // is `eoff0[addr(e)] + m`; the `e`-offsets are hoisted out of the hot m-loop.
1767 let eoff0 = &eoff[..n_e];
1768 for (na, te_list) in tri.iter().enumerate().take(ne + 1).skip(1) {
1769 for &te in te_list {
1770 let i = lower_axis(te);
1771 let s1a = addr(dec(te, i));
1772 let mmax = l_total - na;
1773 let coef2 = ((te[i] - 1) as f64) * inv_2p; // (e_i of source)/2p
1774 let has2 = te[i] >= 2;
1775 let s1_0 = eoff0[s1a];
1776 let s2_0 = if has2 {
1777 eoff0[addr(dec(dec(te, i), i))]
1778 } else {
1779 0
1780 };
1781 let dst_0 = eoff0[addr(te)];
1782 for m in 0..=mmax {
1783 let mut v = pa[i] * levels[s1_0 + m] + wp[i] * levels[s1_0 + m + 1];
1784 if has2 {
1785 v += coef2 * (levels[s2_0 + m] - q_over_pq * levels[s2_0 + m + 1]);
1786 }
1787 levels[dst_0 + m] = v;
1788 }
1789 }
1790 }
1791 // Extract level 0 (f = [0,0,0], local 0, addr 0): contract m = 0.
1792 for ae in 0..n_e {
1793 c_ef[ae * n_f] += scale * levels[off(0, 0, ae, 0)];
1794 }
1795
1796 // Phase B — ket raise: march f-degree k = 1..=nf, keeping levels {k, k−1, k−2}.
1797 // Slot/slab/eoff for the three active levels are hoisted out of the hot loops; the
1798 // m-loop touches only `levels[base + m]` (base computed once per (tf, te)), matching
1799 // the former full-table inner-loop cost. The k−2 level is read only when the axis
1800 // power `tf[j] ≥ 2` (which forces k ≥ 2), so its `k−2` indices never underflow.
1801 for k in 1..=nf {
1802 let slot_k = (k % 3) * maxlevel;
1803 let slot_k1 = ((k - 1) % 3) * maxlevel;
1804 let (slab_k, slab_k1) = (slab[k], slab[k - 1]);
1805 let eoff_k = &eoff[k * n_e..];
1806 let eoff_k1 = &eoff[(k - 1) * n_e..];
1807 let (slot_k2, slab_k2, eoff_k2): (usize, usize, &[usize]) = if k >= 2 {
1808 (
1809 ((k - 2) % 3) * maxlevel,
1810 slab[k - 2],
1811 &eoff[(k - 2) * n_e..],
1812 )
1813 } else {
1814 (0, 0, &[])
1815 };
1816 for &tf in &tri[k] {
1817 let j = lower_axis(tf);
1818 let f1 = dec(tf, j);
1819 let lf1 = cart_index(f1); // local index in level k−1
1820 let coef2 = ((tf[j] - 1) as f64) * inv_2q;
1821 let has2 = tf[j] >= 2;
1822 let lf2 = if has2 { cart_index(dec(f1, j)) } else { 0 }; // level k−2
1823 let lf = cart_index(tf); // local index in level k
1824 let sk = slot_k + lf * slab_k;
1825 let sk1 = slot_k1 + lf1 * slab_k1;
1826 let sk2 = slot_k2 + lf2 * slab_k2;
1827 for (nadeg, te_list) in tri.iter().enumerate().take(ne + 1) {
1828 for &te in te_list {
1829 let ea = addr(te);
1830 // cross term e_j/2(p+q) · [e−1_j,0|f−1_j,0]^(m+1)
1831 let has_cross = te[j] >= 1;
1832 let (cross_coef, cross_0) = if has_cross {
1833 (te[j] as f64 * inv_2pq, sk1 + eoff_k1[addr(dec(te, j))])
1834 } else {
1835 (0.0, 0)
1836 };
1837 let mmax = l_total - nadeg - k;
1838 let dst_0 = sk + eoff_k[ea];
1839 let src1_0 = sk1 + eoff_k1[ea];
1840 let src2_0 = if has2 { sk2 + eoff_k2[ea] } else { 0 };
1841 for m in 0..=mmax {
1842 let mut v = qcen[j] * levels[src1_0 + m] + wq[j] * levels[src1_0 + m + 1];
1843 if has2 {
1844 v += coef2 * (levels[src2_0 + m] - p_over_pq * levels[src2_0 + m + 1]);
1845 }
1846 if has_cross {
1847 v += cross_coef * levels[cross_0 + m + 1];
1848 }
1849 levels[dst_0 + m] = v;
1850 }
1851 }
1852 }
1853 }
1854 // Extract level k: contract each f-component's m = 0 slice into c_ef.
1855 for &tf in &tri[k] {
1856 let lf = cart_index(tf);
1857 let af = addr(tf);
1858 for ae in 0..n_e {
1859 c_ef[ae * n_f + af] += scale * levels[off(k, lf, ae, 0)];
1860 }
1861 }
1862 }
1863}
1864
1865/// HRR in contracted space (bra `A→B`, then ket `C→D`) followed by scatter into
1866/// the row-major output block.
1867///
1868/// Flat-array HGP horizontal recurrence, replacing the earlier
1869/// HashMap memoisation. The recurrence math is unchanged — same `lower_axis`
1870/// choice, same `(A−B)/(C−D)` geometric shifts, same `[raised] + shift·[same]`
1871/// term order — so it is **bit-identical** to the recursive version (guarded by
1872/// the B0 golden snapshot). Only the storage changes: contiguous arrays indexed
1873/// by `addr` / [`cart_index`] instead of hashed `(triple,…)` keys.
1874///
1875/// The bra recurrence `(a,b|f) = (a+1_i,b−1_i|f) + (A−B)_i (a,b−1_i|f)` (axis
1876/// `i = lower_axis(b)`) is built by ascending `b`-degree with two rolling layers,
1877/// independently per ket index `f` (a spectator), into the `bra` intermediate.
1878/// The ket recurrence `(ab|c,d) = (ab|c+1_j,d−1_j) + (C−D)_j (ab|c,d−1_j)` is the
1879/// symmetric pass over `c,d`, run per `(a,b)` output pair and scattered into `out`.
1880/// Resident scratch is `O(na·nb·nf_range)` for the bra intermediate plus two small
1881/// rolling layers — never the dense `(a,b,f)`/`(a,b,c,d)` key space.
1882#[allow(clippy::too_many_arguments)]
1883fn hrr_and_scatter<'a>(
1884 la: usize,
1885 lb: usize,
1886 lc: usize,
1887 ld: usize,
1888 n_f: usize,
1889 c_ef: &[f64],
1890 ab: Vec3,
1891 cd: Vec3,
1892 out: &mut [f64],
1893 bra: &mut Vec<f64>,
1894 mut prev: &'a mut Vec<f64>,
1895 mut cur: &'a mut Vec<f64>,
1896 mut kprev: &'a mut Vec<f64>,
1897 mut kcur: &'a mut Vec<f64>,
1898 // Shared Cartesian-triple table (degrees `0..=2·MAX_L`); HRR indexes it up to
1899 // `max(ne, nf)`. Passed in so it is not re-`collect`ed per quartet.
1900 tri: &[Vec<[usize; 3]>],
1901) {
1902 let (na, nb, nc, nd) = (n_cart(la), n_cart(lb), n_cart(lc), n_cart(ld));
1903 let ne = la + lb; // max bra (A-side) degree present in c_ef
1904 let nf = lc + ld; // max ket (C-side) degree present in c_ef
1905 let n_e = n_addr(ne);
1906
1907 // Ket index range actually used: f-degrees [lc..=nf]. The bra intermediate and
1908 // the ket base are keyed by the global `addr(f)` offset by `f_base`.
1909 let f_base = tri_below(lc);
1910 let nf_range = n_f - f_base;
1911
1912 // --- Bra HRR: bra[(ia·nb+ib)·nf_range + (addr(f)−f_base)] = (a_ia b_ib | f 0).
1913 // Reused arena buffers; bra is fully overwritten below, the rolling
1914 // layers in the region read — debug NaN-fill bra so a missed write surfaces.
1915 let bra_len = na * nb * nf_range;
1916 ensure_len(bra, bra_len);
1917 #[cfg(debug_assertions)]
1918 bra[..bra_len].fill(f64::NAN);
1919 // Two rolling b-degree layers, indexed [cart_index(b)·n_e + addr(a)].
1920 let layer_len = n_cart(lb) * n_e;
1921 ensure_len(prev, layer_len);
1922 ensure_len(cur, layer_len);
1923 for f_global in f_base..n_f {
1924 let jf = f_global - f_base;
1925 // Base b-degree 0 (one component, within-index 0): (a,0|f) = c_ef[a][f].
1926 for &a in tri[la..=ne].iter().flatten() {
1927 let ae = addr(a);
1928 prev[ae] = c_ef[ae * n_f + f_global];
1929 }
1930 for kb in 1..=lb {
1931 for (ibw, &b) in tri[kb].iter().enumerate() {
1932 let i = lower_axis(b);
1933 let b1w = cart_index(dec(b, i));
1934 // a-degrees that can still reach (la, lb): [la..=ne−kb].
1935 for &a in tri[la..=(ne - kb)].iter().flatten() {
1936 let ae = addr(a);
1937 let a1e = addr(inc(a, i));
1938 cur[ibw * n_e + ae] = prev[b1w * n_e + a1e] + ab[i] * prev[b1w * n_e + ae];
1939 }
1940 }
1941 std::mem::swap(&mut prev, &mut cur);
1942 }
1943 // `prev` now holds b-degree lb (or the base when lb = 0): extract.
1944 for (ib, &b) in tri[lb].iter().enumerate() {
1945 let ibw = cart_index(b); // == ib (tri[lb] is in cart order)
1946 for (ia, &a) in tri[la].iter().enumerate() {
1947 bra[(ia * nb + ib) * nf_range + jf] = prev[ibw * n_e + addr(a)];
1948 }
1949 }
1950 }
1951
1952 // --- Ket HRR: per (ia,ib), build (c,d) and scatter into out.
1953 let klayer_len = n_cart(ld) * n_f;
1954 ensure_len(kprev, klayer_len);
1955 ensure_len(kcur, klayer_len);
1956 for ia in 0..na {
1957 for ib in 0..nb {
1958 let brarow = (ia * nb + ib) * nf_range;
1959 // Base d-degree 0: (ab|c,0) = bra[ia][ib][c], c-degrees [lc..=nf].
1960 for &c in tri[lc..=nf].iter().flatten() {
1961 let ce = addr(c);
1962 kprev[ce] = bra[brarow + (ce - f_base)];
1963 }
1964 for kd in 1..=ld {
1965 for (idw, &d) in tri[kd].iter().enumerate() {
1966 let j = lower_axis(d);
1967 let d1w = cart_index(dec(d, j));
1968 for &c in tri[lc..=(nf - kd)].iter().flatten() {
1969 let ce = addr(c);
1970 let c1e = addr(inc(c, j));
1971 kcur[idw * n_f + ce] =
1972 kprev[d1w * n_f + c1e] + cd[j] * kprev[d1w * n_f + ce];
1973 }
1974 }
1975 std::mem::swap(&mut kprev, &mut kcur);
1976 }
1977 // `kprev` now holds d-degree ld (or the base when ld = 0): scatter into out.
1978 for (ic, &c) in tri[lc].iter().enumerate() {
1979 let ce = addr(c);
1980 for id in 0..nd {
1981 out[((ia * nb + ib) * nc + ic) * nd + id] += kprev[id * n_f + ce];
1982 }
1983 }
1984 }
1985 }
1986}
1987
1988// --- small helpers ---
1989
1990/// First nonzero axis of a triple (the lowering direction).
1991#[inline]
1992fn lower_axis(t: [usize; 3]) -> usize {
1993 if t[0] > 0 {
1994 0
1995 } else if t[1] > 0 {
1996 1
1997 } else {
1998 2
1999 }
2000}
2001
2002#[inline]
2003fn dec(mut t: [usize; 3], i: usize) -> [usize; 3] {
2004 t[i] -= 1;
2005 t
2006}
2007
2008#[inline]
2009fn inc(mut t: [usize; 3], i: usize) -> [usize; 3] {
2010 t[i] += 1;
2011 t
2012}
2013
2014#[inline]
2015fn combine(a: f64, ca: Vec3, b: f64, cb: Vec3, p: f64) -> Vec3 {
2016 [
2017 (a * ca[0] + b * cb[0]) / p,
2018 (a * ca[1] + b * cb[1]) / p,
2019 (a * ca[2] + b * cb[2]) / p,
2020 ]
2021}
2022
2023#[inline]
2024fn sub(u: Vec3, v: Vec3) -> Vec3 {
2025 [u[0] - v[0], u[1] - v[1], u[2] - v[2]]
2026}
2027
2028#[inline]
2029fn dist2(u: Vec3, v: Vec3) -> f64 {
2030 norm2(sub(u, v))
2031}
2032
2033#[inline]
2034fn norm2(u: Vec3) -> f64 {
2035 u[0] * u[0] + u[1] * u[1] + u[2] * u[2]
2036}
2037
2038#[cfg(test)]
2039mod tests {
2040 use super::*;
2041
2042 /// Single-primitive `ShellRef` helper.
2043 fn s(center: Vec3, l: usize, exp: f64) -> (Vec3, usize, [f64; 1], [f64; 1]) {
2044 (center, l, [exp], [1.0])
2045 }
2046
2047 /// (ss|ss) over four unit s primitives must equal the closed form
2048 /// `2π^{5/2}/(p q √(p+q)) · K_ab · K_cd · F_0(T)` — the same check the Rys
2049 /// engine passes, so the two share a verified base case.
2050 #[test]
2051 fn ssss_matches_closed_form() {
2052 let (ac, al, ae, acf) = s([0.0, 0.0, 0.0], 0, 0.8);
2053 let (bc, bl, be, bcf) = s([0.0, 0.0, 0.7], 0, 1.3);
2054 let (cc, cl, ce, ccf) = s([0.4, 0.0, 0.0], 0, 0.5);
2055 let (dc, dl, de, dcf) = s([0.0, 0.6, 0.2], 0, 1.1);
2056 let mut out = [0.0; 1];
2057 coulomb_shell_into(
2058 ShellRef {
2059 center: ac,
2060 l: al,
2061 exps: &ae,
2062 coeffs: &acf,
2063 },
2064 ShellRef {
2065 center: bc,
2066 l: bl,
2067 exps: &be,
2068 coeffs: &bcf,
2069 },
2070 ShellRef {
2071 center: cc,
2072 l: cl,
2073 exps: &ce,
2074 coeffs: &ccf,
2075 },
2076 ShellRef {
2077 center: dc,
2078 l: dl,
2079 exps: &de,
2080 coeffs: &dcf,
2081 },
2082 &mut out,
2083 );
2084
2085 let p = 0.8 + 1.3;
2086 let q = 0.5 + 1.1;
2087 let pcen = combine(0.8, ac, 1.3, bc, p);
2088 let qcen = combine(0.5, cc, 1.1, dc, q);
2089 let kab = (-(0.8 * 1.3 / p) * dist2(ac, bc)).exp();
2090 let kcd = (-(0.5 * 1.1 / q) * dist2(cc, dc)).exp();
2091 let rho = p * q / (p + q);
2092 let t = rho * dist2(pcen, qcen);
2093 let mut fm = [0.0; 1];
2094 boys_array(0, t, &mut fm);
2095 use std::f64::consts::PI;
2096 let expect = 2.0 * PI * PI * PI.sqrt() / (p * q * (p + q).sqrt()) * kab * kcd * fm[0];
2097 assert!(
2098 (out[0] - expect).abs() < 1e-14 * expect.abs(),
2099 "ssss {} vs {}",
2100 out[0],
2101 expect
2102 );
2103 }
2104
2105 use crate::os::Prim;
2106 use crate::rys::coulomb_into;
2107
2108 /// Build a single-primitive OS/HGP block for a quartet of given `(l, exp,
2109 /// center)`, for cross-checking against the Rys engine.
2110 #[allow(clippy::too_many_arguments)]
2111 fn os_block(
2112 la: usize,
2113 ea: f64,
2114 ca: Vec3,
2115 lb: usize,
2116 eb: f64,
2117 cb: Vec3,
2118 lc: usize,
2119 recc: f64,
2120 ccc: Vec3,
2121 ld: usize,
2122 ed: f64,
2123 cdd: Vec3,
2124 ) -> Vec<f64> {
2125 let mut out = vec![0.0; n_cart(la) * n_cart(lb) * n_cart(lc) * n_cart(ld)];
2126 let (ea1, eb1, ec1, ed1) = ([ea], [eb], [recc], [ed]);
2127 let one = [1.0];
2128 coulomb_shell_into(
2129 ShellRef {
2130 center: ca,
2131 l: la,
2132 exps: &ea1,
2133 coeffs: &one,
2134 },
2135 ShellRef {
2136 center: cb,
2137 l: lb,
2138 exps: &eb1,
2139 coeffs: &one,
2140 },
2141 ShellRef {
2142 center: ccc,
2143 l: lc,
2144 exps: &ec1,
2145 coeffs: &one,
2146 },
2147 ShellRef {
2148 center: cdd,
2149 l: ld,
2150 exps: &ed1,
2151 coeffs: &one,
2152 },
2153 &mut out,
2154 );
2155 out
2156 }
2157
2158 /// OS/HGP must reproduce the Rys engine element-for-element on a single
2159 /// primitive quartet, across a sweep of angular momenta (including the
2160 /// bug-prone mixed-high-L and "all four different L on four centers" cases).
2161 #[test]
2162 fn matches_rys_engine_primitive_sweep() {
2163 let ca = [0.0, 0.0, 0.0];
2164 let cb = [0.5, -0.3, 0.2];
2165 let cc = [-0.4, 0.6, -0.1];
2166 let cd = [0.2, 0.4, 0.8];
2167 let (ea, eb, ec, ed) = (0.9, 1.3, 0.7, 1.1);
2168
2169 let quartets = [
2170 (0usize, 0usize, 0usize, 0usize),
2171 (1, 0, 0, 0),
2172 (0, 0, 1, 0),
2173 (1, 1, 0, 0),
2174 (1, 0, 1, 0),
2175 (1, 1, 1, 1),
2176 (2, 0, 0, 0),
2177 (2, 1, 0, 0),
2178 (2, 1, 2, 1),
2179 (0, 1, 2, 3), // four different L on four centers
2180 (2, 2, 3, 3), // (dd|ff) mixed high-L
2181 (3, 0, 0, 1),
2182 // l_total ≥ 13 guards: the Boys aux index m reaches la+lb+lc+ld, which
2183 // exceeds 2·MAX_L. These panicked the under-sized fm buffer (a real bug
2184 // a ≤(dd|ff) sweep missed) and are kept as permanent regression guards.
2185 (4, 4, 4, 1), // l_total = 13
2186 (6, 6, 1, 0), // l_total = 13, two i-shells
2187 (3, 3, 3, 3), // ffff: the cancellation-heavy mixed case
2188 ];
2189 for (la, lb, lc, ld) in quartets {
2190 let os = os_block(la, ea, ca, lb, eb, cb, lc, ec, cc, ld, ed, cd);
2191 let mut rys = vec![0.0; os.len()];
2192 coulomb_into(
2193 Prim::new(ea, ca, la),
2194 Prim::new(eb, cb, lb),
2195 Prim::new(ec, cc, lc),
2196 Prim::new(ed, cd, ld),
2197 1.0,
2198 &mut rys,
2199 );
2200 assert_cross_engine_close(&os, &rys, &format!("({la}{lb}|{lc}{ld})"));
2201 }
2202 }
2203
2204 /// Assert two ERI blocks (here OS/HGP vs Rys, two independent f64 recurrences)
2205 /// agree under `|o − r| ≤ atol + rtol·|r|` with `atol = 1e-11`, `rtol = 1e-10`.
2206 /// The atol floor absorbs benign near-cancellation on structurally tiny
2207 /// components (where the *relative* OS/Rys difference reaches ~1e-8 at high L
2208 /// even though both engines are correct — their dominant elements agree to
2209 /// ~1e-12); the rtol catches any real divergence.
2210 fn assert_cross_engine_close(os: &[f64], rys: &[f64], tag: &str) {
2211 const ATOL: f64 = 1e-11;
2212 const RTOL: f64 = 1e-10;
2213 for (o, r) in os.iter().zip(rys.iter()) {
2214 assert!(
2215 (o - r).abs() <= ATOL + RTOL * r.abs(),
2216 "{tag} OS vs Rys mismatch: {o} vs {r} (Δ={:e})",
2217 (o - r).abs()
2218 );
2219 }
2220 }
2221
2222 /// HGP early contraction (VRR per primitive, HRR once in contracted space)
2223 /// must equal the per-primitive Rys sum for a genuinely **contracted**
2224 /// quartet.
2225 ///
2226 /// Note: doing the HRR after contraction is a *performance*
2227 /// choice, **not** a correctness fork. The HRR operator is linear with
2228 /// exponent-independent geometric coefficients (`A−B`, `C−D`), so
2229 /// `HRR(Σ_p w_p·[e0|f0]_p) = Σ_p w_p·HRR([e0|f0]_p)` identically — per-primitive
2230 /// and post-contraction HRR give the *same* answer. This remains a valuable
2231 /// end-to-end check (a broken VRR cross-term, a mis-applied contraction
2232 /// coefficient, or a wrong HRR sign would all fail it), but it does not
2233 /// distinguish HRR *ordering*, because the two orders are algebraically equal.
2234 #[test]
2235 fn contracted_quartet_matches_rys_sum() {
2236 // Two p shells and a d shell, each with multiple primitives, on distinct
2237 // centres (so HRR shift vectors A−B, C−D are all non-trivial).
2238 let ca = [0.0, 0.0, 0.0];
2239 let cb = [0.6, -0.2, 0.1];
2240 let cc = [-0.3, 0.5, -0.2];
2241 let cd = [0.2, 0.3, 0.7];
2242 let (la, lb, lc, ld) = (1usize, 1usize, 2usize, 0usize);
2243 let ax = [1.4, 0.45];
2244 let acf = [0.6, 0.5];
2245 let bx = [0.9, 0.3];
2246 let bcf = [0.55, 0.5];
2247 let cx = [1.1, 0.4];
2248 let ccf = [0.7, 0.4];
2249 let dx = [0.8];
2250 let dcf = [1.0];
2251
2252 let mut os = vec![0.0; n_cart(la) * n_cart(lb) * n_cart(lc) * n_cart(ld)];
2253 coulomb_shell_into(
2254 ShellRef {
2255 center: ca,
2256 l: la,
2257 exps: &ax,
2258 coeffs: &acf,
2259 },
2260 ShellRef {
2261 center: cb,
2262 l: lb,
2263 exps: &bx,
2264 coeffs: &bcf,
2265 },
2266 ShellRef {
2267 center: cc,
2268 l: lc,
2269 exps: &cx,
2270 coeffs: &ccf,
2271 },
2272 ShellRef {
2273 center: cd,
2274 l: ld,
2275 exps: &dx,
2276 coeffs: &dcf,
2277 },
2278 &mut os,
2279 );
2280
2281 // Reference: sum the Rys primitive engine over the quartet with the same
2282 // effective coefficients.
2283 let mut rys = vec![0.0; os.len()];
2284 for (&ea, &wa) in ax.iter().zip(acf.iter()) {
2285 for (&eb, &wb) in bx.iter().zip(bcf.iter()) {
2286 for (&ec, &wc) in cx.iter().zip(ccf.iter()) {
2287 for (&ed, &wd) in dx.iter().zip(dcf.iter()) {
2288 coulomb_into(
2289 Prim::new(ea, ca, la),
2290 Prim::new(eb, cb, lb),
2291 Prim::new(ec, cc, lc),
2292 Prim::new(ed, cd, ld),
2293 wa * wb * wc * wd,
2294 &mut rys,
2295 );
2296 }
2297 }
2298 }
2299 }
2300
2301 for (o, r) in os.iter().zip(rys.iter()) {
2302 assert!(
2303 (o - r).abs() <= 1e-10 * r.abs().max(1e-12),
2304 "contracted OS vs Rys mismatch: {o} vs {r}"
2305 );
2306 }
2307 }
2308
2309 /// The triple-address map must be a bijection onto `0..n_addr(L)`.
2310 #[test]
2311 fn addr_is_bijective() {
2312 for lmax in 0..=6 {
2313 let mut seen = vec![false; n_addr(lmax)];
2314 for n in 0..=lmax {
2315 for t in cart_components(n) {
2316 let a = addr(t);
2317 assert!(a < n_addr(lmax), "addr {a} out of range for lmax {lmax}");
2318 assert!(!seen[a], "addr collision at {t:?}");
2319 seen[a] = true;
2320 }
2321 }
2322 assert!(
2323 seen.iter().all(|&x| x),
2324 "addr not surjective at lmax {lmax}"
2325 );
2326 }
2327 }
2328}