integral_core/os_eri.rs
1//! Two-electron repulsion integrals (ERIs) by the Obara–Saika / Head-Gordon–Pople
2//! (OS/HGP) recurrence — the second ERI engine (see `ARCHITECTURE.md`, L1).
3//!
4//! A **vertical recurrence (VRR)** builds the
5//! intermediate `[e0|f0]^(m)` classes from `[00|00]^(m)` per primitive quartet,
6//! the primitives are **contracted** into AO space, and a **horizontal recurrence
7//! (HRR)** then shifts angular momentum `A→B` and `C→D` *in the contracted space*.
8//! Doing the HRR after contraction is the HGP "early contraction" trick: the
9//! geometry-only HRR runs **once per shell quartet** instead of once per
10//! primitive quartet, which is the win at high contraction degree.
11//!
12//! It is the engineering counterpart of the [`crate::rys`] engine: same Coulomb
13//! kernel, same row-major `(a,b,c,d)` block layout, validated element-for-element
14//! against it (and against an independent McMurchie–Davidson path).
15//!
16//! ## Method
17//!
18//! For a primitive quartet with exponents `α,β` on the bra centres `A,B` and
19//! `γ,δ` on the ket centres `C,D`, with `p=α+β`, `q=γ+δ`, `P,Q` the Gaussian
20//! product centres, `W=(pP+qQ)/(p+q)`, `ρ=pq/(p+q)`, and `T=ρ|P−Q|²`:
21//!
22//! ```text
23//! [00|00]^(m) = 2π^{5/2}/(p q √(p+q)) · K_ab · K_cd · F_m(T)
24//! ```
25//!
26//! The VRR raises angular momentum on `A` (the bra, index `e`) and `C` (the ket,
27//! index `f`) using the standard OS ERI relations (Obara–Saika 1986; HGP 1988):
28//!
29//! ```text
30//! [e+1_i,0|f0]^(m) = (P−A)_i[e0|f0]^(m) + (W−P)_i[e0|f0]^(m+1)
31//! + e_i/2p ( [e−1_i,0|f0]^(m) − (q/(p+q))[e−1_i,0|f0]^(m+1) )
32//! + f_i/2(p+q) [e0|f−1_i,0]^(m+1)
33//! [e0|f+1_i,0]^(m) = (Q−C)_i[e0|f0]^(m) + (W−Q)_i[e0|f0]^(m+1)
34//! + f_i/2q ( [e0|f−1_i,0]^(m) − (p/(p+q))[e0|f−1_i,0]^(m+1) )
35//! + e_i/2(p+q) [e−1_i,0|f0]^(m+1)
36//! ```
37//!
38//! After contracting `[e0|f0]^(0)` over the primitive quartet, the HRR builds the
39//! Cartesian shell block:
40//!
41//! ```text
42//! (a,b+1_i | f0) = (a+1_i,b | f0) + (A−B)_i (a,b | f0) [bra, A→B]
43//! (ab | c,d+1_i) = (ab | c+1_i,d) + (C−D)_i (ab | c,d) [ket, C→D]
44//! ```
45//!
46//! ## Buffers (see `DESIGN_NOTES.md` D3, D12)
47//!
48//! The VRR table `[e0|f0]^(m)` is **3D and `W`-coupled** (the recurrence mixes
49//! Cartesian axes), so unlike the axis-separable 1D tables of the one-electron and
50//! Rys engines it cannot be a small fixed stack array — a `MAX_L` stack buffer
51//! would be tens of MB, and even a heap copy of the *full* table is ~41 MB at
52//! `(ii|ii)`. The current engine (D12) removes that:
53//!
54//! - **m-marching VRR** (`vrr_primitive`): the VRR never materialises the full
55//! `e×f×m` table. It marches over the ket `f`-degree keeping only a rolling window
56//! of **3 consecutive `f`-degree levels**, each **triangle-packed** in the Boys
57//! index `m`. Resident VRR footprint is `3·max_k[n_cart(k)·slab_k]` (≈ 4.3 MB at
58//! `(ii|ii)`, vs the old 41 MB single table), plus the `n_e·n_f` contracted table.
59//! - **flat-array HRR** (`hrr_and_scatter`): contiguous arrays indexed by `addr`
60//! / [`cart_index`] with two rolling degree layers, replacing the former HashMap
61//! memoisation.
62//! - **reusable arena** ([`EriScratch`]): all buffers are allocated once and reused
63//! across quartets (thread-local by default), not re-allocated per quartet.
64//!
65//! All of this stays in safe Rust (`#![forbid(unsafe_code)]`) and reproduces the
66//! former full-table engine's values, cross-checked against the Rys engine and an
67//! independent McMurchie–Davidson path (`tests/eri_cross_algorithm.rs`).
68
69use std::cell::RefCell;
70
71use integral_math::am::{cart_components, cart_index, n_cart};
72use integral_math::boys::boys_array;
73
74use crate::os::{Vec3, MAX_L};
75
76/// A contracted Cartesian shell as seen by the HGP engine: its centre, angular
77/// momentum, and primitive `(exponent, effective-coefficient)` data. The
78/// coefficients are the driver's *effective* coefficients `d_i · N(α_i, l)` — the
79/// engine itself works on un-normalised monomials and only multiplies the four
80/// coefficients into the contracted accumulator.
81#[derive(Debug, Clone, Copy)]
82pub struct ShellRef<'a> {
83 /// Shell centre (bohr).
84 pub center: Vec3,
85 /// Angular momentum.
86 pub l: usize,
87 /// Primitive exponents.
88 pub exps: &'a [f64],
89 /// Effective contraction coefficients, aligned with `exps`.
90 pub coeffs: &'a [f64],
91}
92
93/// Number of Cartesian triples of degree `0..=lmax`: `(L+1)(L+2)(L+3)/6`.
94#[inline]
95fn n_addr(lmax: usize) -> usize {
96 (lmax + 1) * (lmax + 2) * (lmax + 3) / 6
97}
98
99/// Number of Cartesian triples of degree **strictly less than** `d`:
100/// `d(d+1)(d+2)/6`. This is the cumulative base of degree `d` in `addr`
101/// (`addr(t) = tri_below(deg) + cart_index(t)`) and `n_addr(L) = tri_below(L+1)`.
102#[inline]
103fn tri_below(d: usize) -> usize {
104 d * (d + 1) * (d + 2) / 6
105}
106
107/// Global address of a Cartesian triple within the cumulative `0..=` ordering,
108/// consistent with [`integral_math::am`]'s per-degree canonical order. Bijective onto
109/// `0..n_addr(deg)`.
110#[inline]
111fn addr(t: [usize; 3]) -> usize {
112 let n = t[0] + t[1] + t[2];
113 let base = n * (n + 1) * (n + 2) / 6;
114 // within-degree index: (n-lx)(n-lx+1)/2 + lz
115 let within = (n - t[0]) * (n - t[0] + 1) / 2 + t[2];
116 base + within
117}
118
119/// Reusable scratch arena for the OS/HGP ERI engine.
120///
121/// Holds the engine's working buffers so they are allocated **once and reused**
122/// across shell quartets, instead of a fresh heap allocation per quartet inside the
123/// O(n⁴) shell loop. All buffers are plain `Vec<f64>` grown on demand in safe Rust
124/// (`#![forbid(unsafe_code)]` holds); there is no shared mutable state, so each
125/// thread uses its own instance — the thread-local default behind
126/// [`coulomb_shell_into`], or one passed explicitly to
127/// [`coulomb_shell_into_scratch`] (the `&mut` makes cross-thread sharing a compile
128/// error).
129///
130/// **Memory-correctness.** `c_ef` is an accumulator and is **zeroed per quartet**.
131/// The VRR `levels` and HRR `bra`/layer buffers are fully overwritten in the region
132/// they are later read, so they need no functional zeroing — but in debug builds
133/// `levels` and `bra` are NaN-filled per quartet, so any accidental out-of-range
134/// read (e.g. outside the VRR `m`-triangle) poisons the output and trips the tests
135/// rather than silently reading a stale value from a previous quartet. Results are
136/// therefore independent of evaluation order and of arena reuse (asserted by
137/// `tests/arena.rs`).
138#[derive(Debug, Default, Clone)]
139pub struct EriScratch {
140 /// VRR rolling window: 3 triangle-packed f-degree levels (slot = `k % 3`).
141 levels: Vec<f64>,
142 /// Contracted `[e0|f0]^(0)` accumulator, shape `[n_e·n_f]`.
143 c_ef: Vec<f64>,
144 /// HRR bra intermediate `(a b | f 0)`, shape `[na·nb·nf_range]`.
145 bra: Vec<f64>,
146 /// HRR bra rolling `b`-degree layers.
147 bra_prev: Vec<f64>,
148 bra_cur: Vec<f64>,
149 /// HRR ket rolling `d`-degree layers.
150 ket_prev: Vec<f64>,
151 ket_cur: Vec<f64>,
152 /// Precomputed bra/ket primitive-pair data (combined exponent, product
153 /// centre, `K` prefactor, coeff product) — built once per shell quartet so
154 /// the per-pair `exp`/centre work is not repeated inside the deep primitive
155 /// loop. Reused across quartets like the other buffers.
156 bra_pairs: Vec<PrimPair>,
157 ket_pairs: Vec<PrimPair>,
158 /// Shared Cartesian-triple table `tri_all[d] = cart_components(d)` for every
159 /// degree the engine needs (`0..=2·MAX_L`). Built once and sliced for the VRR
160 /// `e`/`f` lists and the HRR triples, instead of re-`collect`ing per quartet.
161 tri_all: Vec<Vec<[usize; 3]>>,
162 /// Flat VRR offset table: `eoff[k·n_e + ae]` is `e`'s packed offset in level
163 /// `k`. Rebuilt per quartet into this reused buffer (no per-quartet `Vec<Vec>`
164 /// allocation); `slab[k]` is one `f`-component's packed size for level `k`.
165 eoff: Vec<usize>,
166 slab: Vec<usize>,
167}
168
169/// One primitive pair's reusable geometry/prefactor, shared by every primitive
170/// quartet that uses it: the combined exponent `zeta = α+β`, the Gaussian product
171/// centre, the overlap prefactor `K = exp(−αβ/ζ·|A−B|²)`, and the two contraction
172/// coefficients (`c1` outer, `c2` inner).
173///
174/// The coefficients are kept **un-multiplied** so the contracted `scale` can be
175/// formed in exactly the former evaluation order `((c_a·c_b)·c_c)·c_d` — making the
176/// engine's output bit-identical to the pre-precomputation code (the per-element
177/// and 8-fold-symmetry tests are floorless-relative and sensitive to the last ULP
178/// on near-cancellation elements).
179#[derive(Debug, Clone, Copy, Default)]
180struct PrimPair {
181 zeta: f64,
182 center: Vec3,
183 kappa: f64,
184 c1: f64,
185 c2: f64,
186 /// `1/(2ζ)` — the VRR `inv_2p`(bra)/`inv_2q`(ket) coefficient. Pure per-pair.
187 inv_2zeta: f64,
188 /// `centre − s1.centre` — the VRR `P−A`(bra)/`Q−C`(ket) shift. Pure per-pair.
189 r1: Vec3,
190}
191
192/// Fill `out` with the [`PrimPair`] data for every primitive combination of two
193/// shells (outer `s1`, inner `s2`), matching the former in-loop computation
194/// element-for-element. The inter-centre distance is hoisted out of the pair loop.
195/// `inv_2zeta` and `r1` are the pair-only VRR coefficients (`1/2ζ` and the `P−A` /
196/// `Q−C` shift) that the kernel formerly recomputed on every primitive quartet.
197fn build_pairs(out: &mut Vec<PrimPair>, s1: ShellRef<'_>, s2: ShellRef<'_>) {
198 out.clear();
199 out.reserve(s1.exps.len() * s2.exps.len());
200 let d2 = dist2(s1.center, s2.center);
201 for (&e1, &c1) in s1.exps.iter().zip(s1.coeffs.iter()) {
202 for (&e2, &c2) in s2.exps.iter().zip(s2.coeffs.iter()) {
203 let zeta = e1 + e2;
204 let center = combine(e1, s1.center, e2, s2.center, zeta);
205 out.push(PrimPair {
206 zeta,
207 center,
208 kappa: (-(e1 * e2 / zeta) * d2).exp(),
209 c1,
210 c2,
211 inv_2zeta: 0.5 / zeta,
212 r1: sub(center, s1.center),
213 });
214 }
215 }
216}
217
218impl EriScratch {
219 /// A fresh, empty arena; it grows to fit on first use.
220 #[must_use]
221 pub fn new() -> Self {
222 Self::default()
223 }
224
225 /// Total `f64` elements currently held across all buffers — the resident
226 /// working set, for memory reporting/tests.
227 #[must_use]
228 pub fn resident_f64(&self) -> usize {
229 self.levels.len()
230 + self.c_ef.len()
231 + self.bra.len()
232 + self.bra_prev.len()
233 + self.bra_cur.len()
234 + self.ket_prev.len()
235 + self.ket_cur.len()
236 }
237
238 /// Largest single buffer (`f64` elements). Demonstrates the former ~41 MB
239 /// monolithic VRR `[e0|f0]^(m)` table is no longer resident: the m-marching
240 /// window keeps only `3·max_k[n_cart(k)·slab_k]`.
241 #[must_use]
242 pub fn largest_buffer_f64(&self) -> usize {
243 [
244 self.levels.len(),
245 self.c_ef.len(),
246 self.bra.len(),
247 self.bra_prev.len(),
248 self.bra_cur.len(),
249 self.ket_prev.len(),
250 self.ket_cur.len(),
251 ]
252 .into_iter()
253 .max()
254 .unwrap_or(0)
255 }
256}
257
258thread_local! {
259 /// Per-thread default arena backing [`coulomb_shell_into`].
260 static ERI_SCRATCH: RefCell<EriScratch> = RefCell::new(EriScratch::new());
261}
262
263/// Grow `v` to at least `n` elements (reusing existing capacity); never shrinks.
264#[inline]
265fn ensure_len(v: &mut Vec<f64>, n: usize) {
266 if v.len() < n {
267 v.resize(n, 0.0);
268 }
269}
270
271/// `ensure_len` for the `usize` offset/slab buffers.
272#[inline]
273fn ensure_usize(v: &mut Vec<usize>, n: usize) {
274 if v.len() < n {
275 v.resize(n, 0);
276 }
277}
278
279/// Accumulate the contracted Coulomb block `(ab|cd)` for four shells into the
280/// row-major `out` block (shape `[n_cart(la)·n_cart(lb)·n_cart(lc)·n_cart(ld)]`,
281/// the same `(a,b,c,d)` layout as [`crate::rys::coulomb_into`]).
282///
283/// The engine owns the primitive-quartet loop (the HGP contraction happens between
284/// VRR and HRR), so unlike the Rys engine the driver calls this **once per shell
285/// quartet** with all primitives. This uses a **thread-local** [`EriScratch`]; for
286/// explicit control (e.g. one arena per worker thread) call
287/// [`coulomb_shell_into_scratch`].
288///
289/// # Panics
290/// Panics (in debug builds) if any `l > MAX_L` or `out` is too short.
291pub fn coulomb_shell_into(
292 a: ShellRef<'_>,
293 b: ShellRef<'_>,
294 c: ShellRef<'_>,
295 d: ShellRef<'_>,
296 out: &mut [f64],
297) {
298 ERI_SCRATCH.with(|s| coulomb_shell_into_scratch(&mut s.borrow_mut(), a, b, c, d, out));
299}
300
301/// Like [`coulomb_shell_into`] but evaluates into the caller-provided
302/// [`EriScratch`], reused across quartets to avoid per-quartet heap allocation. Use
303/// **one instance per thread** (sharing a `&mut EriScratch` across threads is a
304/// compile error); the result is bit-identical regardless of which arena is used or
305/// what it last held.
306///
307/// # Panics
308/// Panics (in debug builds) if any `l > MAX_L` or `out` is too short.
309pub fn coulomb_shell_into_scratch(
310 scratch: &mut EriScratch,
311 a: ShellRef<'_>,
312 b: ShellRef<'_>,
313 c: ShellRef<'_>,
314 d: ShellRef<'_>,
315 out: &mut [f64],
316) {
317 let (la, lb, lc, ld) = (a.l, b.l, c.l, d.l);
318 debug_assert!(
319 la <= MAX_L && lb <= MAX_L && lc <= MAX_L && ld <= MAX_L,
320 "angular momentum exceeds MAX_L"
321 );
322 let (na, nb, nc, nd) = (n_cart(la), n_cart(lb), n_cart(lc), n_cart(ld));
323 debug_assert!(out.len() >= na * nb * nc * nd, "ERI output block too short");
324
325 let ne = la + lb; // max bra (A-side) degree built by VRR
326 let nf = lc + ld; // max ket (C-side) degree built by VRR
327 let l_total = ne + nf;
328 let n_e = n_addr(ne);
329 let n_f = n_addr(nf);
330
331 let EriScratch {
332 levels,
333 c_ef,
334 bra,
335 bra_prev,
336 bra_cur,
337 ket_prev,
338 ket_cur,
339 bra_pairs,
340 ket_pairs,
341 tri_all,
342 eoff,
343 slab,
344 } = scratch;
345
346 // Fast path: (ss|ss). With no angular momentum to raise, the whole quartet is a
347 // single contracted [00|00]^(0). Skip the VRR/HRR machinery (the dead raise
348 // coefficients, the `levels` buffer round-trip, the per-primitive call) and sum
349 // straight into a register. Bit-identical to the general path: same `pref`/`t`
350 // expressions, same `((c_a·c_b)·c_c)·c_d` scale, same `scale·(pref·F_0)` term,
351 // same bra-outer/ket-inner order — just without the scaffolding.
352 if l_total == 0 {
353 build_pairs(bra_pairs, a, b);
354 build_pairs(ket_pairs, c, d);
355 let two_pi_2_5 =
356 2.0 * std::f64::consts::PI * std::f64::consts::PI * std::f64::consts::PI.sqrt();
357 let mut acc = 0.0;
358 let mut f0 = [0.0f64; 1];
359 for bra in bra_pairs.iter() {
360 let bc = bra.c1 * bra.c2;
361 for ket in ket_pairs.iter() {
362 let (p, q) = (bra.zeta, ket.zeta);
363 let pq = p + q;
364 let t = (p * q / pq) * dist2(bra.center, ket.center);
365 let pref = two_pi_2_5 / (p * q * pq.sqrt()) * bra.kappa * ket.kappa;
366 boys_array(0, t, &mut f0);
367 acc += (bc * ket.c1) * ket.c2 * (pref * f0[0]);
368 }
369 }
370 out[0] += acc;
371 return;
372 }
373
374 // Contracted [e0|f0]^(0): zeroed per quartet (it is a `+=` accumulator).
375 ensure_len(c_ef, n_e * n_f);
376 c_ef[..n_e * n_f].fill(0.0);
377
378 // Shared Cartesian-triple table, built once (degrees `0..=2·MAX_L` cover every
379 // `ne`/`nf`/`max(ne,nf)` the engine reaches) and sliced below — no per-quartet
380 // `cart_components` allocation.
381 if tri_all.len() <= 2 * MAX_L {
382 *tri_all = (0..=2 * MAX_L).map(cart_components).collect();
383 }
384
385 // Precompute the bra/ket primitive-pair data once (the `exp` prefactor and
386 // product centre depend only on the pair, not the quartet), then loop over the
387 // pair lists. The previous code recomputed the ket pair's `exp`/centre on every
388 // bra primitive — `na·nb` times too many. Bra-outer / ket-inner preserves the
389 // exact accumulation order into `c_ef`.
390 build_pairs(bra_pairs, a, b);
391 build_pairs(ket_pairs, c, d);
392
393 // VRR. The small classes (`ne, nf ≤ 2` — l_total ≤ 4, which carries ~87% of a
394 // cc-pVDZ-style build) dispatch to the monomorphized kernels, where the whole
395 // structural walk (triples, addresses, branches, m-ranges) is resolved at
396 // compile time; everything else runs the general m-marching `vrr_primitive`.
397 // `(0,0)` already returned through the (ss|ss) fast path above.
398 match (ne, nf) {
399 (0, 1) => contract_class::<0, 1>(bra_pairs, ket_pairs, c_ef),
400 (1, 0) => contract_class::<1, 0>(bra_pairs, ket_pairs, c_ef),
401 (1, 1) => contract_class::<1, 1>(bra_pairs, ket_pairs, c_ef),
402 (0, 2) => contract_class::<0, 2>(bra_pairs, ket_pairs, c_ef),
403 (2, 0) => contract_class::<2, 0>(bra_pairs, ket_pairs, c_ef),
404 (1, 2) => contract_class::<1, 2>(bra_pairs, ket_pairs, c_ef),
405 (2, 1) => contract_class::<2, 1>(bra_pairs, ket_pairs, c_ef),
406 (2, 2) => contract_class::<2, 2>(bra_pairs, ket_pairs, c_ef),
407 _ => {
408 // m-marching VRR scratch. Instead of the full `n_e·n_f·nm`
409 // table (~41 MB at `(ii|ii)`), keep only a rolling window of **3 consecutive
410 // f-degree levels**, each **triangle-packed** in the Boys index `m`: for an
411 // `f`-degree `k`, an `e` of degree `de` stores only `m ∈ 0..=(l_total−de−k)`.
412 // Level `k` lives in slot `k % 3` of `levels`; `slab[k]` is one `f`-component's
413 // packed size and `eoff[k·n_e + ae]` is `e`'s offset within it (flat, reused).
414 ensure_usize(slab, nf + 1);
415 ensure_usize(eoff, (nf + 1) * n_e);
416 // `k` indexes both `slab` and the flat `eoff` block base `k·n_e`, so a range
417 // loop is the clear form here.
418 #[allow(clippy::needless_range_loop)]
419 for k in 0..=nf {
420 let base = k * n_e;
421 let mut run = 0usize;
422 let mut ae = 0usize;
423 for de in 0..=ne {
424 let mlen = l_total - de - k + 1; // ≥ 1: de + k ≤ ne + nf = l_total
425 for _ in 0..n_cart(de) {
426 eoff[base + ae] = run;
427 ae += 1;
428 run += mlen;
429 }
430 }
431 slab[k] = run;
432 }
433 let maxlevel = (0..=nf).map(|k| n_cart(k) * slab[k]).max().unwrap_or(1);
434 ensure_len(levels, 3 * maxlevel);
435 // Debug guard: poison the reused VRR window so any out-of-triangle / stale read
436 // surfaces as a NaN in the output (caught by the golden + cross-engine tests),
437 // instead of silently using a previous quartet's value.
438 #[cfg(debug_assertions)]
439 levels[..3 * maxlevel].fill(f64::NAN);
440
441 // Reborrow the offset/triple buffers as shared slices for the read-only kernel.
442 let (eoff_s, slab_s, tri_s) = (&eoff[..], &slab[..], &tri_all[..]);
443 for bra in bra_pairs.iter() {
444 let bra_coef = bra.c1 * bra.c2; // = c_a·c_b, hoisted out of the ket loop
445 for ket in ket_pairs.iter() {
446 // ((c_a·c_b)·c_c)·c_d — the former left-to-right product, bit for bit.
447 let scale = (bra_coef * ket.c1) * ket.c2;
448 vrr_primitive(
449 bra.zeta,
450 ket.zeta,
451 bra.center,
452 ket.center,
453 bra.kappa,
454 ket.kappa,
455 bra.r1,
456 ket.r1,
457 bra.inv_2zeta,
458 ket.inv_2zeta,
459 ne,
460 nf,
461 l_total,
462 n_e,
463 n_f,
464 maxlevel,
465 eoff_s,
466 slab_s,
467 tri_s,
468 levels,
469 scale,
470 c_ef,
471 );
472 }
473 }
474 }
475 }
476
477 // HRR in contracted space, then scatter the block.
478 let ab = sub(a.center, b.center); // A − B
479 let cd = sub(c.center, d.center); // C − D
480 hrr_and_scatter(
481 la,
482 lb,
483 lc,
484 ld,
485 n_f,
486 c_ef,
487 ab,
488 cd,
489 out,
490 bra,
491 bra_prev,
492 bra_cur,
493 ket_prev,
494 ket_cur,
495 &tri_all[..],
496 );
497}
498
499/// Cartesian triples of degree 1 and 2 in [`cart_components`] order, as `const`
500/// tables so the monomorphized small-class VRR kernels fully unroll over them
501/// (the structural walk — axes, addresses, `has2`/cross branches — then resolves
502/// at compile time instead of being re-derived on every primitive quartet).
503const TRI1: [[usize; 3]; 3] = [[1, 0, 0], [0, 1, 0], [0, 0, 1]];
504const TRI2: [[usize; 3]; 6] = [
505 [2, 0, 0],
506 [1, 1, 0],
507 [1, 0, 1],
508 [0, 2, 0],
509 [0, 1, 1],
510 [0, 0, 2],
511];
512/// The e-components `0..n_addr(2)` in [`addr`] order with their degree —
513/// the bra index set of the small-class kernels (`NE ≤ 2`).
514const E_COMP: [([usize; 3], usize); 10] = [
515 ([0, 0, 0], 0),
516 ([1, 0, 0], 1),
517 ([0, 1, 0], 1),
518 ([0, 0, 1], 1),
519 ([2, 0, 0], 2),
520 ([1, 1, 0], 2),
521 ([1, 0, 1], 2),
522 ([0, 2, 0], 2),
523 ([0, 1, 1], 2),
524 ([0, 0, 2], 2),
525];
526
527/// Contract a whole shell quartet of VRR shape `(NE, NF)` (with `NE, NF ≤ 2`)
528/// into `c_ef` using the monomorphized small-class kernel [`vrr_small`].
529///
530/// Same bra-outer / ket-inner primitive loop and the same left-to-right
531/// `((c_a·c_b)·c_c)·c_d` scale as the general path, so the accumulation into
532/// `c_ef` is bit-identical. The VRR working tables live here (overwritten per
533/// primitive quartet in the region read, like the general path's `levels`
534/// arena) so their one-time zero-init is per shell quartet, not per primitive.
535fn contract_class<const NE: usize, const NF: usize>(
536 bra_pairs: &[PrimPair],
537 ket_pairs: &[PrimPair],
538 c_ef: &mut [f64],
539) {
540 // Worst-case (NE,NF)=(2,2) sizes: e-table 10 components × m ∈ 0..=4;
541 // f-degree-1 level 10×3 × m ∈ 0..=3; f-degree-2 level 10×6 × m ∈ 0..=2.
542 let mut ea = [0.0f64; 50];
543 let mut eb1 = [0.0f64; 120];
544 let mut eb2 = [0.0f64; 180];
545 for bra in bra_pairs {
546 let bra_coef = bra.c1 * bra.c2;
547 for ket in ket_pairs {
548 let scale = (bra_coef * ket.c1) * ket.c2;
549 vrr_small::<NE, NF>(bra, ket, scale, &mut ea, &mut eb1, &mut eb2, c_ef);
550 }
551 }
552}
553
554/// One primitive quartet of the OS VRR for the small classes `NE, NF ≤ 2`,
555/// monomorphized per `(NE, NF)` — the per-L-class "generated kernel" idea of
556/// libcint/libint expressed through const generics: every loop bound, Cartesian
557/// triple, address, and `has2`/cross branch below is a compile-time constant in
558/// each instantiation, so the emitted code is a straight unrolled FMA chain.
559///
560/// **Bit-identical to [`vrr_primitive`] by construction**: the same recurrence
561/// expressions with the same term order and association — base
562/// `pref·F_m`, raise `pa_i·s1[m] + wp_i·s1[m+1]`, then `+= coef2·(s2[m] −
563/// (q/pq)·s2[m+1])`, then `+= cross·s3[m+1]` — the same m-ranges
564/// (`m ≤ l_total − de − k`), and the same `c_ef[ae·n_f + af] += scale·v`
565/// extraction. Only the storage differs (fixed per-degree tables instead of the
566/// triangle-packed rolling window), which does not touch the arithmetic.
567/// Guarded by the full-tensor XOR fingerprint and the golden/cross-engine tests.
568#[inline(always)]
569#[allow(clippy::too_many_arguments)]
570fn vrr_small<const NE: usize, const NF: usize>(
571 bra: &PrimPair,
572 ket: &PrimPair,
573 scale: f64,
574 ea: &mut [f64; 50],
575 eb1: &mut [f64; 120],
576 eb2: &mut [f64; 180],
577 c_ef: &mut [f64],
578) {
579 let lt = NE + NF;
580 let n_e = n_addr(NE);
581 let n_f = n_addr(NF);
582 let (p, q) = (bra.zeta, ket.zeta);
583 let (pc, qc) = (bra.center, ket.center);
584 let pa = bra.r1; // P − A
585 let qcen = ket.r1; // Q − C
586 let (inv_2p, inv_2q) = (bra.inv_2zeta, ket.inv_2zeta);
587
588 // Scalar header — identical expressions to `vrr_primitive`.
589 let pq = p + q;
590 let rho = p * q / pq;
591 let pq_vec = sub(pc, qc); // P − Q
592 let t = rho * norm2(pq_vec);
593 let w = [
594 (p * pc[0] + q * qc[0]) / pq,
595 (p * pc[1] + q * qc[1]) / pq,
596 (p * pc[2] + q * qc[2]) / pq,
597 ];
598 let wp = sub(w, pc); // W − P
599 let wq = sub(w, qc); // W − Q
600 use std::f64::consts::PI;
601 let pref = 2.0 * PI * PI * PI.sqrt() / (p * q * pq.sqrt()) * bra.kappa * ket.kappa;
602 let mut fm = [0.0f64; 5];
603 boys_array(lt, t, &mut fm[..=lt]);
604 for m in 0..=lt {
605 ea[m] = pref * fm[m];
606 }
607
608 let inv_2pq = 0.5 / pq;
609 let q_over_pq = q / pq;
610 let p_over_pq = p / pq;
611
612 // Phase A — bra ladder: [e0|00]^(m), e-degrees 1..=NE, m ≤ lt − de.
613 if NE >= 1 {
614 for (iw, te) in TRI1.iter().enumerate() {
615 let i = lower_axis(*te);
616 // Source is [0,0,0] at addr 0; no has2 term at degree 1.
617 for m in 0..=(lt - 1) {
618 ea[(1 + iw) * 5 + m] = pa[i] * ea[m] + wp[i] * ea[m + 1];
619 }
620 }
621 }
622 if NE >= 2 {
623 for (iw, te) in TRI2.iter().enumerate() {
624 let i = lower_axis(*te);
625 let s1 = addr(dec(*te, i));
626 let has2 = te[i] >= 2;
627 let coef2 = ((te[i] - 1) as f64) * inv_2p;
628 let s2 = if has2 { addr(dec(dec(*te, i), i)) } else { 0 };
629 for m in 0..=(lt - 2) {
630 let mut v = pa[i] * ea[s1 * 5 + m] + wp[i] * ea[s1 * 5 + m + 1];
631 if has2 {
632 v += coef2 * (ea[s2 * 5 + m] - q_over_pq * ea[s2 * 5 + m + 1]);
633 }
634 ea[(4 + iw) * 5 + m] = v;
635 }
636 }
637 }
638 // Extract f-degree 0: contract m = 0 of every e-component.
639 for ae in 0..n_e {
640 c_ef[ae * n_f] += scale * ea[ae * 5];
641 }
642
643 // Phase B — ket raise, f-degree 1: m ≤ lt − de − 1.
644 if NF >= 1 {
645 for (fw, tf) in TRI1.iter().enumerate() {
646 let j = lower_axis(*tf);
647 // Source f-component is [0,0,0] (level 0 = the phase-A table).
648 for ae in 0..n_e {
649 let (te, de) = E_COMP[ae];
650 let has_cross = te[j] >= 1;
651 let cross_coef = te[j] as f64 * inv_2pq;
652 let cs = if has_cross { addr(dec(te, j)) } else { 0 };
653 for m in 0..=(lt - de - 1) {
654 let mut v = qcen[j] * ea[ae * 5 + m] + wq[j] * ea[ae * 5 + m + 1];
655 if has_cross {
656 v += cross_coef * ea[cs * 5 + m + 1];
657 }
658 eb1[(ae * 3 + fw) * 4 + m] = v;
659 }
660 }
661 }
662 for ae in 0..n_e {
663 for fw in 0..3 {
664 c_ef[ae * n_f + 1 + fw] += scale * eb1[(ae * 3 + fw) * 4];
665 }
666 }
667 }
668
669 // Phase B — ket raise, f-degree 2: m ≤ lt − de − 2.
670 if NF >= 2 {
671 for (fw, tf) in TRI2.iter().enumerate() {
672 let j = lower_axis(*tf);
673 let f1 = dec(*tf, j);
674 let lf1 = cart_index(f1); // f-component index in the degree-1 level
675 let has2 = tf[j] >= 2; // its k−2 source is then [0,0,0] (phase A)
676 let coef2 = ((tf[j] - 1) as f64) * inv_2q;
677 for ae in 0..n_e {
678 let (te, de) = E_COMP[ae];
679 let has_cross = te[j] >= 1;
680 let cross_coef = te[j] as f64 * inv_2pq;
681 let cs = if has_cross { addr(dec(te, j)) } else { 0 };
682 for m in 0..=(lt - de - 2) {
683 let mut v = qcen[j] * eb1[(ae * 3 + lf1) * 4 + m]
684 + wq[j] * eb1[(ae * 3 + lf1) * 4 + m + 1];
685 if has2 {
686 v += coef2 * (ea[ae * 5 + m] - p_over_pq * ea[ae * 5 + m + 1]);
687 }
688 if has_cross {
689 v += cross_coef * eb1[(cs * 3 + lf1) * 4 + m + 1];
690 }
691 eb2[(ae * 6 + fw) * 3 + m] = v;
692 }
693 }
694 }
695 for ae in 0..n_e {
696 for fw in 0..6 {
697 c_ef[ae * n_f + 4 + fw] += scale * eb2[(ae * 6 + fw) * 3];
698 }
699 }
700 }
701}
702
703/// Build `[e0|f0]^(m)` for one primitive quartet by **m-marching** over the `f`
704/// degree and accumulate `scale · [e0|f0]^(0)` into the contracted `c_ef` table.
705///
706/// Algebraically identical to the former full-table VRR (same OS recurrences, same
707/// term order) — only the storage changes, so it is bit-identical
708/// (B0 golden snapshot). `levels` holds 3 rolling `f`-degree levels in slots
709/// `k % 3`, each `maxlevel` long; within slot `k`, element `[e0|f0]^(m)` for the
710/// `f`-component with local index `lf` (= `cart_index(f)`) lives at
711/// `(k%3)·maxlevel + lf·slab[k] + eoff[k][addr(e)] + m`. The `m`-rows are
712/// triangle-packed (`m ∈ 0..=(l_total−de−k)`), so the full `[e0|f0]^(m)` table is
713/// never resident. Each level's `m=0` slice is extracted into `c_ef` before its
714/// buffer is recycled.
715#[allow(clippy::too_many_arguments)]
716fn vrr_primitive(
717 p: f64,
718 q: f64,
719 pc: Vec3,
720 qc: Vec3,
721 kab: f64,
722 kcd: f64,
723 pa: Vec3, // P − A (bra pair, precomputed)
724 qcen: Vec3, // Q − C (ket pair, precomputed)
725 inv_2p: f64, // 1/(2p) (bra pair, precomputed)
726 inv_2q: f64, // 1/(2q) (ket pair, precomputed)
727 ne: usize,
728 nf: usize,
729 l_total: usize,
730 n_e: usize,
731 n_f: usize,
732 maxlevel: usize,
733 eoff: &[usize],
734 slab: &[usize],
735 tri: &[Vec<[usize; 3]>],
736 levels: &mut [f64],
737 scale: f64,
738 c_ef: &mut [f64],
739) {
740 let pq = p + q;
741 let rho = p * q / pq;
742 let pq_vec = sub(pc, qc); // P − Q
743 let t = rho * norm2(pq_vec);
744 let w = [
745 (p * pc[0] + q * qc[0]) / pq,
746 (p * pc[1] + q * qc[1]) / pq,
747 (p * pc[2] + q * qc[2]) / pq,
748 ];
749 // `pa` (P−A) and `qcen` (Q−C) are now passed in (pure per-pair).
750 let wp = sub(w, pc); // W − P
751 let wq = sub(w, qc); // W − Q
752
753 // Index of `[e0|f0]^(m)` (f-component local index `lf`, e at `addr(e)=ae`) in
754 // the rolling buffer for f-degree `k`. Reads/writes `levels` separately (f64 is
755 // Copy, so indexing never holds a borrow), so the same `levels` slice serves as
756 // source and destination across the distinct slots `k`, `k−1`, `k−2`.
757 let off = |k: usize, lf: usize, ae: usize, m: usize| -> usize {
758 (k % 3) * maxlevel + lf * slab[k] + eoff[k * n_e + ae] + m
759 };
760
761 // Base [00|00]^(m) = pref · F_m(T). The Boys index m reaches
762 // l_total = la+lb+lc+ld ≤ 4·MAX_L (NOT 2·MAX_L — that is the per-electron
763 // bound; an ERI couples both electrons).
764 use std::f64::consts::PI;
765 let pref = 2.0 * PI * PI * PI.sqrt() / (p * q * pq.sqrt()) * kab * kcd;
766 let mut fm = [0.0f64; 4 * MAX_L + 1];
767 boys_array(l_total, t, &mut fm[..=l_total]);
768 for m in 0..=l_total {
769 levels[off(0, 0, 0, m)] = pref * fm[m];
770 }
771
772 // `inv_2p` / `inv_2q` are now passed in (pure per-pair).
773 let inv_2pq = 0.5 / pq;
774 let q_over_pq = q / pq;
775 let p_over_pq = p / pq;
776
777 // Phase A — bra ladder (f-degree 0, level 0): build [e0|00]^(m) by raising e.
778 // Level 0 lives in slot 0, f-component 0, so the buffer offset of `[e0|00]^(m)`
779 // is `eoff0[addr(e)] + m`; the `e`-offsets are hoisted out of the hot m-loop.
780 let eoff0 = &eoff[..n_e];
781 for (na, te_list) in tri.iter().enumerate().take(ne + 1).skip(1) {
782 for &te in te_list {
783 let i = lower_axis(te);
784 let s1a = addr(dec(te, i));
785 let mmax = l_total - na;
786 let coef2 = ((te[i] - 1) as f64) * inv_2p; // (e_i of source)/2p
787 let has2 = te[i] >= 2;
788 let s1_0 = eoff0[s1a];
789 let s2_0 = if has2 {
790 eoff0[addr(dec(dec(te, i), i))]
791 } else {
792 0
793 };
794 let dst_0 = eoff0[addr(te)];
795 for m in 0..=mmax {
796 let mut v = pa[i] * levels[s1_0 + m] + wp[i] * levels[s1_0 + m + 1];
797 if has2 {
798 v += coef2 * (levels[s2_0 + m] - q_over_pq * levels[s2_0 + m + 1]);
799 }
800 levels[dst_0 + m] = v;
801 }
802 }
803 }
804 // Extract level 0 (f = [0,0,0], local 0, addr 0): contract m = 0.
805 for ae in 0..n_e {
806 c_ef[ae * n_f] += scale * levels[off(0, 0, ae, 0)];
807 }
808
809 // Phase B — ket raise: march f-degree k = 1..=nf, keeping levels {k, k−1, k−2}.
810 // Slot/slab/eoff for the three active levels are hoisted out of the hot loops; the
811 // m-loop touches only `levels[base + m]` (base computed once per (tf, te)), matching
812 // the former full-table inner-loop cost. The k−2 level is read only when the axis
813 // power `tf[j] ≥ 2` (which forces k ≥ 2), so its `k−2` indices never underflow.
814 for k in 1..=nf {
815 let slot_k = (k % 3) * maxlevel;
816 let slot_k1 = ((k - 1) % 3) * maxlevel;
817 let (slab_k, slab_k1) = (slab[k], slab[k - 1]);
818 let eoff_k = &eoff[k * n_e..];
819 let eoff_k1 = &eoff[(k - 1) * n_e..];
820 let (slot_k2, slab_k2, eoff_k2): (usize, usize, &[usize]) = if k >= 2 {
821 (
822 ((k - 2) % 3) * maxlevel,
823 slab[k - 2],
824 &eoff[(k - 2) * n_e..],
825 )
826 } else {
827 (0, 0, &[])
828 };
829 for &tf in &tri[k] {
830 let j = lower_axis(tf);
831 let f1 = dec(tf, j);
832 let lf1 = cart_index(f1); // local index in level k−1
833 let coef2 = ((tf[j] - 1) as f64) * inv_2q;
834 let has2 = tf[j] >= 2;
835 let lf2 = if has2 { cart_index(dec(f1, j)) } else { 0 }; // level k−2
836 let lf = cart_index(tf); // local index in level k
837 let sk = slot_k + lf * slab_k;
838 let sk1 = slot_k1 + lf1 * slab_k1;
839 let sk2 = slot_k2 + lf2 * slab_k2;
840 for (nadeg, te_list) in tri.iter().enumerate().take(ne + 1) {
841 for &te in te_list {
842 let ea = addr(te);
843 // cross term e_j/2(p+q) · [e−1_j,0|f−1_j,0]^(m+1)
844 let has_cross = te[j] >= 1;
845 let (cross_coef, cross_0) = if has_cross {
846 (te[j] as f64 * inv_2pq, sk1 + eoff_k1[addr(dec(te, j))])
847 } else {
848 (0.0, 0)
849 };
850 let mmax = l_total - nadeg - k;
851 let dst_0 = sk + eoff_k[ea];
852 let src1_0 = sk1 + eoff_k1[ea];
853 let src2_0 = if has2 { sk2 + eoff_k2[ea] } else { 0 };
854 for m in 0..=mmax {
855 let mut v = qcen[j] * levels[src1_0 + m] + wq[j] * levels[src1_0 + m + 1];
856 if has2 {
857 v += coef2 * (levels[src2_0 + m] - p_over_pq * levels[src2_0 + m + 1]);
858 }
859 if has_cross {
860 v += cross_coef * levels[cross_0 + m + 1];
861 }
862 levels[dst_0 + m] = v;
863 }
864 }
865 }
866 }
867 // Extract level k: contract each f-component's m = 0 slice into c_ef.
868 for &tf in &tri[k] {
869 let lf = cart_index(tf);
870 let af = addr(tf);
871 for ae in 0..n_e {
872 c_ef[ae * n_f + af] += scale * levels[off(k, lf, ae, 0)];
873 }
874 }
875 }
876}
877
878/// HRR in contracted space (bra `A→B`, then ket `C→D`) followed by scatter into
879/// the row-major output block.
880///
881/// Flat-array HGP horizontal recurrence, replacing the earlier
882/// HashMap memoisation. The recurrence math is unchanged — same `lower_axis`
883/// choice, same `(A−B)/(C−D)` geometric shifts, same `[raised] + shift·[same]`
884/// term order — so it is **bit-identical** to the recursive version (guarded by
885/// the B0 golden snapshot). Only the storage changes: contiguous arrays indexed
886/// by `addr` / [`cart_index`] instead of hashed `(triple,…)` keys.
887///
888/// The bra recurrence `(a,b|f) = (a+1_i,b−1_i|f) + (A−B)_i (a,b−1_i|f)` (axis
889/// `i = lower_axis(b)`) is built by ascending `b`-degree with two rolling layers,
890/// independently per ket index `f` (a spectator), into the `bra` intermediate.
891/// The ket recurrence `(ab|c,d) = (ab|c+1_j,d−1_j) + (C−D)_j (ab|c,d−1_j)` is the
892/// symmetric pass over `c,d`, run per `(a,b)` output pair and scattered into `out`.
893/// Resident scratch is `O(na·nb·nf_range)` for the bra intermediate plus two small
894/// rolling layers — never the dense `(a,b,f)`/`(a,b,c,d)` key space.
895#[allow(clippy::too_many_arguments)]
896fn hrr_and_scatter<'a>(
897 la: usize,
898 lb: usize,
899 lc: usize,
900 ld: usize,
901 n_f: usize,
902 c_ef: &[f64],
903 ab: Vec3,
904 cd: Vec3,
905 out: &mut [f64],
906 bra: &mut Vec<f64>,
907 mut prev: &'a mut Vec<f64>,
908 mut cur: &'a mut Vec<f64>,
909 mut kprev: &'a mut Vec<f64>,
910 mut kcur: &'a mut Vec<f64>,
911 // Shared Cartesian-triple table (degrees `0..=2·MAX_L`); HRR indexes it up to
912 // `max(ne, nf)`. Passed in so it is not re-`collect`ed per quartet.
913 tri: &[Vec<[usize; 3]>],
914) {
915 let (na, nb, nc, nd) = (n_cart(la), n_cart(lb), n_cart(lc), n_cart(ld));
916 let ne = la + lb; // max bra (A-side) degree present in c_ef
917 let nf = lc + ld; // max ket (C-side) degree present in c_ef
918 let n_e = n_addr(ne);
919
920 // Ket index range actually used: f-degrees [lc..=nf]. The bra intermediate and
921 // the ket base are keyed by the global `addr(f)` offset by `f_base`.
922 let f_base = tri_below(lc);
923 let nf_range = n_f - f_base;
924
925 // --- Bra HRR: bra[(ia·nb+ib)·nf_range + (addr(f)−f_base)] = (a_ia b_ib | f 0).
926 // Reused arena buffers; bra is fully overwritten below, the rolling
927 // layers in the region read — debug NaN-fill bra so a missed write surfaces.
928 let bra_len = na * nb * nf_range;
929 ensure_len(bra, bra_len);
930 #[cfg(debug_assertions)]
931 bra[..bra_len].fill(f64::NAN);
932 // Two rolling b-degree layers, indexed [cart_index(b)·n_e + addr(a)].
933 let layer_len = n_cart(lb) * n_e;
934 ensure_len(prev, layer_len);
935 ensure_len(cur, layer_len);
936 for f_global in f_base..n_f {
937 let jf = f_global - f_base;
938 // Base b-degree 0 (one component, within-index 0): (a,0|f) = c_ef[a][f].
939 for &a in tri[la..=ne].iter().flatten() {
940 let ae = addr(a);
941 prev[ae] = c_ef[ae * n_f + f_global];
942 }
943 for kb in 1..=lb {
944 for (ibw, &b) in tri[kb].iter().enumerate() {
945 let i = lower_axis(b);
946 let b1w = cart_index(dec(b, i));
947 // a-degrees that can still reach (la, lb): [la..=ne−kb].
948 for &a in tri[la..=(ne - kb)].iter().flatten() {
949 let ae = addr(a);
950 let a1e = addr(inc(a, i));
951 cur[ibw * n_e + ae] = prev[b1w * n_e + a1e] + ab[i] * prev[b1w * n_e + ae];
952 }
953 }
954 std::mem::swap(&mut prev, &mut cur);
955 }
956 // `prev` now holds b-degree lb (or the base when lb = 0): extract.
957 for (ib, &b) in tri[lb].iter().enumerate() {
958 let ibw = cart_index(b); // == ib (tri[lb] is in cart order)
959 for (ia, &a) in tri[la].iter().enumerate() {
960 bra[(ia * nb + ib) * nf_range + jf] = prev[ibw * n_e + addr(a)];
961 }
962 }
963 }
964
965 // --- Ket HRR: per (ia,ib), build (c,d) and scatter into out.
966 let klayer_len = n_cart(ld) * n_f;
967 ensure_len(kprev, klayer_len);
968 ensure_len(kcur, klayer_len);
969 for ia in 0..na {
970 for ib in 0..nb {
971 let brarow = (ia * nb + ib) * nf_range;
972 // Base d-degree 0: (ab|c,0) = bra[ia][ib][c], c-degrees [lc..=nf].
973 for &c in tri[lc..=nf].iter().flatten() {
974 let ce = addr(c);
975 kprev[ce] = bra[brarow + (ce - f_base)];
976 }
977 for kd in 1..=ld {
978 for (idw, &d) in tri[kd].iter().enumerate() {
979 let j = lower_axis(d);
980 let d1w = cart_index(dec(d, j));
981 for &c in tri[lc..=(nf - kd)].iter().flatten() {
982 let ce = addr(c);
983 let c1e = addr(inc(c, j));
984 kcur[idw * n_f + ce] =
985 kprev[d1w * n_f + c1e] + cd[j] * kprev[d1w * n_f + ce];
986 }
987 }
988 std::mem::swap(&mut kprev, &mut kcur);
989 }
990 // `kprev` now holds d-degree ld (or the base when ld = 0): scatter into out.
991 for (ic, &c) in tri[lc].iter().enumerate() {
992 let ce = addr(c);
993 for id in 0..nd {
994 out[((ia * nb + ib) * nc + ic) * nd + id] += kprev[id * n_f + ce];
995 }
996 }
997 }
998 }
999}
1000
1001// --- small helpers ---
1002
1003/// First nonzero axis of a triple (the lowering direction).
1004#[inline]
1005fn lower_axis(t: [usize; 3]) -> usize {
1006 if t[0] > 0 {
1007 0
1008 } else if t[1] > 0 {
1009 1
1010 } else {
1011 2
1012 }
1013}
1014
1015#[inline]
1016fn dec(mut t: [usize; 3], i: usize) -> [usize; 3] {
1017 t[i] -= 1;
1018 t
1019}
1020
1021#[inline]
1022fn inc(mut t: [usize; 3], i: usize) -> [usize; 3] {
1023 t[i] += 1;
1024 t
1025}
1026
1027#[inline]
1028fn combine(a: f64, ca: Vec3, b: f64, cb: Vec3, p: f64) -> Vec3 {
1029 [
1030 (a * ca[0] + b * cb[0]) / p,
1031 (a * ca[1] + b * cb[1]) / p,
1032 (a * ca[2] + b * cb[2]) / p,
1033 ]
1034}
1035
1036#[inline]
1037fn sub(u: Vec3, v: Vec3) -> Vec3 {
1038 [u[0] - v[0], u[1] - v[1], u[2] - v[2]]
1039}
1040
1041#[inline]
1042fn dist2(u: Vec3, v: Vec3) -> f64 {
1043 norm2(sub(u, v))
1044}
1045
1046#[inline]
1047fn norm2(u: Vec3) -> f64 {
1048 u[0] * u[0] + u[1] * u[1] + u[2] * u[2]
1049}
1050
1051#[cfg(test)]
1052mod tests {
1053 use super::*;
1054
1055 /// Single-primitive `ShellRef` helper.
1056 fn s(center: Vec3, l: usize, exp: f64) -> (Vec3, usize, [f64; 1], [f64; 1]) {
1057 (center, l, [exp], [1.0])
1058 }
1059
1060 /// (ss|ss) over four unit s primitives must equal the closed form
1061 /// `2π^{5/2}/(p q √(p+q)) · K_ab · K_cd · F_0(T)` — the same check the Rys
1062 /// engine passes, so the two share a verified base case.
1063 #[test]
1064 fn ssss_matches_closed_form() {
1065 let (ac, al, ae, acf) = s([0.0, 0.0, 0.0], 0, 0.8);
1066 let (bc, bl, be, bcf) = s([0.0, 0.0, 0.7], 0, 1.3);
1067 let (cc, cl, ce, ccf) = s([0.4, 0.0, 0.0], 0, 0.5);
1068 let (dc, dl, de, dcf) = s([0.0, 0.6, 0.2], 0, 1.1);
1069 let mut out = [0.0; 1];
1070 coulomb_shell_into(
1071 ShellRef {
1072 center: ac,
1073 l: al,
1074 exps: &ae,
1075 coeffs: &acf,
1076 },
1077 ShellRef {
1078 center: bc,
1079 l: bl,
1080 exps: &be,
1081 coeffs: &bcf,
1082 },
1083 ShellRef {
1084 center: cc,
1085 l: cl,
1086 exps: &ce,
1087 coeffs: &ccf,
1088 },
1089 ShellRef {
1090 center: dc,
1091 l: dl,
1092 exps: &de,
1093 coeffs: &dcf,
1094 },
1095 &mut out,
1096 );
1097
1098 let p = 0.8 + 1.3;
1099 let q = 0.5 + 1.1;
1100 let pcen = combine(0.8, ac, 1.3, bc, p);
1101 let qcen = combine(0.5, cc, 1.1, dc, q);
1102 let kab = (-(0.8 * 1.3 / p) * dist2(ac, bc)).exp();
1103 let kcd = (-(0.5 * 1.1 / q) * dist2(cc, dc)).exp();
1104 let rho = p * q / (p + q);
1105 let t = rho * dist2(pcen, qcen);
1106 let mut fm = [0.0; 1];
1107 boys_array(0, t, &mut fm);
1108 use std::f64::consts::PI;
1109 let expect = 2.0 * PI * PI * PI.sqrt() / (p * q * (p + q).sqrt()) * kab * kcd * fm[0];
1110 assert!(
1111 (out[0] - expect).abs() < 1e-14 * expect.abs(),
1112 "ssss {} vs {}",
1113 out[0],
1114 expect
1115 );
1116 }
1117
1118 use crate::os::Prim;
1119 use crate::rys::coulomb_into;
1120
1121 /// Build a single-primitive OS/HGP block for a quartet of given `(l, exp,
1122 /// center)`, for cross-checking against the Rys engine.
1123 #[allow(clippy::too_many_arguments)]
1124 fn os_block(
1125 la: usize,
1126 ea: f64,
1127 ca: Vec3,
1128 lb: usize,
1129 eb: f64,
1130 cb: Vec3,
1131 lc: usize,
1132 recc: f64,
1133 ccc: Vec3,
1134 ld: usize,
1135 ed: f64,
1136 cdd: Vec3,
1137 ) -> Vec<f64> {
1138 let mut out = vec![0.0; n_cart(la) * n_cart(lb) * n_cart(lc) * n_cart(ld)];
1139 let (ea1, eb1, ec1, ed1) = ([ea], [eb], [recc], [ed]);
1140 let one = [1.0];
1141 coulomb_shell_into(
1142 ShellRef {
1143 center: ca,
1144 l: la,
1145 exps: &ea1,
1146 coeffs: &one,
1147 },
1148 ShellRef {
1149 center: cb,
1150 l: lb,
1151 exps: &eb1,
1152 coeffs: &one,
1153 },
1154 ShellRef {
1155 center: ccc,
1156 l: lc,
1157 exps: &ec1,
1158 coeffs: &one,
1159 },
1160 ShellRef {
1161 center: cdd,
1162 l: ld,
1163 exps: &ed1,
1164 coeffs: &one,
1165 },
1166 &mut out,
1167 );
1168 out
1169 }
1170
1171 /// OS/HGP must reproduce the Rys engine element-for-element on a single
1172 /// primitive quartet, across a sweep of angular momenta (including the
1173 /// bug-prone mixed-high-L and "all four different L on four centers" cases).
1174 #[test]
1175 fn matches_rys_engine_primitive_sweep() {
1176 let ca = [0.0, 0.0, 0.0];
1177 let cb = [0.5, -0.3, 0.2];
1178 let cc = [-0.4, 0.6, -0.1];
1179 let cd = [0.2, 0.4, 0.8];
1180 let (ea, eb, ec, ed) = (0.9, 1.3, 0.7, 1.1);
1181
1182 let quartets = [
1183 (0usize, 0usize, 0usize, 0usize),
1184 (1, 0, 0, 0),
1185 (0, 0, 1, 0),
1186 (1, 1, 0, 0),
1187 (1, 0, 1, 0),
1188 (1, 1, 1, 1),
1189 (2, 0, 0, 0),
1190 (2, 1, 0, 0),
1191 (2, 1, 2, 1),
1192 (0, 1, 2, 3), // four different L on four centers
1193 (2, 2, 3, 3), // (dd|ff) mixed high-L
1194 (3, 0, 0, 1),
1195 // l_total ≥ 13 guards: the Boys aux index m reaches la+lb+lc+ld, which
1196 // exceeds 2·MAX_L. These panicked the under-sized fm buffer (a real bug
1197 // a ≤(dd|ff) sweep missed) and are kept as permanent regression guards.
1198 (4, 4, 4, 1), // l_total = 13
1199 (6, 6, 1, 0), // l_total = 13, two i-shells
1200 (3, 3, 3, 3), // ffff: the cancellation-heavy mixed case
1201 ];
1202 for (la, lb, lc, ld) in quartets {
1203 let os = os_block(la, ea, ca, lb, eb, cb, lc, ec, cc, ld, ed, cd);
1204 let mut rys = vec![0.0; os.len()];
1205 coulomb_into(
1206 Prim::new(ea, ca, la),
1207 Prim::new(eb, cb, lb),
1208 Prim::new(ec, cc, lc),
1209 Prim::new(ed, cd, ld),
1210 1.0,
1211 &mut rys,
1212 );
1213 assert_cross_engine_close(&os, &rys, &format!("({la}{lb}|{lc}{ld})"));
1214 }
1215 }
1216
1217 /// Assert two ERI blocks (here OS/HGP vs Rys, two independent f64 recurrences)
1218 /// agree under `|o − r| ≤ atol + rtol·|r|` with `atol = 1e-11`, `rtol = 1e-10`.
1219 /// The atol floor absorbs benign near-cancellation on structurally tiny
1220 /// components (where the *relative* OS/Rys difference reaches ~1e-8 at high L
1221 /// even though both engines are correct — their dominant elements agree to
1222 /// ~1e-12); the rtol catches any real divergence.
1223 fn assert_cross_engine_close(os: &[f64], rys: &[f64], tag: &str) {
1224 const ATOL: f64 = 1e-11;
1225 const RTOL: f64 = 1e-10;
1226 for (o, r) in os.iter().zip(rys.iter()) {
1227 assert!(
1228 (o - r).abs() <= ATOL + RTOL * r.abs(),
1229 "{tag} OS vs Rys mismatch: {o} vs {r} (Δ={:e})",
1230 (o - r).abs()
1231 );
1232 }
1233 }
1234
1235 /// HGP early contraction (VRR per primitive, HRR once in contracted space)
1236 /// must equal the per-primitive Rys sum for a genuinely **contracted**
1237 /// quartet.
1238 ///
1239 /// Note (`DESIGN_NOTES.md` D6): doing the HRR after contraction is a *performance*
1240 /// choice, **not** a correctness fork. The HRR operator is linear with
1241 /// exponent-independent geometric coefficients (`A−B`, `C−D`), so
1242 /// `HRR(Σ_p w_p·[e0|f0]_p) = Σ_p w_p·HRR([e0|f0]_p)` identically — per-primitive
1243 /// and post-contraction HRR give the *same* answer. This remains a valuable
1244 /// end-to-end check (a broken VRR cross-term, a mis-applied contraction
1245 /// coefficient, or a wrong HRR sign would all fail it), but it does not
1246 /// distinguish HRR *ordering*, because the two orders are algebraically equal.
1247 #[test]
1248 fn contracted_quartet_matches_rys_sum() {
1249 // Two p shells and a d shell, each with multiple primitives, on distinct
1250 // centres (so HRR shift vectors A−B, C−D are all non-trivial).
1251 let ca = [0.0, 0.0, 0.0];
1252 let cb = [0.6, -0.2, 0.1];
1253 let cc = [-0.3, 0.5, -0.2];
1254 let cd = [0.2, 0.3, 0.7];
1255 let (la, lb, lc, ld) = (1usize, 1usize, 2usize, 0usize);
1256 let ax = [1.4, 0.45];
1257 let acf = [0.6, 0.5];
1258 let bx = [0.9, 0.3];
1259 let bcf = [0.55, 0.5];
1260 let cx = [1.1, 0.4];
1261 let ccf = [0.7, 0.4];
1262 let dx = [0.8];
1263 let dcf = [1.0];
1264
1265 let mut os = vec![0.0; n_cart(la) * n_cart(lb) * n_cart(lc) * n_cart(ld)];
1266 coulomb_shell_into(
1267 ShellRef {
1268 center: ca,
1269 l: la,
1270 exps: &ax,
1271 coeffs: &acf,
1272 },
1273 ShellRef {
1274 center: cb,
1275 l: lb,
1276 exps: &bx,
1277 coeffs: &bcf,
1278 },
1279 ShellRef {
1280 center: cc,
1281 l: lc,
1282 exps: &cx,
1283 coeffs: &ccf,
1284 },
1285 ShellRef {
1286 center: cd,
1287 l: ld,
1288 exps: &dx,
1289 coeffs: &dcf,
1290 },
1291 &mut os,
1292 );
1293
1294 // Reference: sum the Rys primitive engine over the quartet with the same
1295 // effective coefficients.
1296 let mut rys = vec![0.0; os.len()];
1297 for (&ea, &wa) in ax.iter().zip(acf.iter()) {
1298 for (&eb, &wb) in bx.iter().zip(bcf.iter()) {
1299 for (&ec, &wc) in cx.iter().zip(ccf.iter()) {
1300 for (&ed, &wd) in dx.iter().zip(dcf.iter()) {
1301 coulomb_into(
1302 Prim::new(ea, ca, la),
1303 Prim::new(eb, cb, lb),
1304 Prim::new(ec, cc, lc),
1305 Prim::new(ed, cd, ld),
1306 wa * wb * wc * wd,
1307 &mut rys,
1308 );
1309 }
1310 }
1311 }
1312 }
1313
1314 for (o, r) in os.iter().zip(rys.iter()) {
1315 assert!(
1316 (o - r).abs() <= 1e-10 * r.abs().max(1e-12),
1317 "contracted OS vs Rys mismatch: {o} vs {r}"
1318 );
1319 }
1320 }
1321
1322 /// The triple-address map must be a bijection onto `0..n_addr(L)`.
1323 #[test]
1324 fn addr_is_bijective() {
1325 for lmax in 0..=6 {
1326 let mut seen = vec![false; n_addr(lmax)];
1327 for n in 0..=lmax {
1328 for t in cart_components(n) {
1329 let a = addr(t);
1330 assert!(a < n_addr(lmax), "addr {a} out of range for lmax {lmax}");
1331 assert!(!seen[a], "addr collision at {t:?}");
1332 seen[a] = true;
1333 }
1334 }
1335 assert!(
1336 seen.iter().all(|&x| x),
1337 "addr not surjective at lmax {lmax}"
1338 );
1339 }
1340 }
1341}