integral_core/os_eri.rs
1//! Two-electron repulsion integrals (ERIs) by the Obara–Saika / Head-Gordon–Pople
2//! (OS/HGP) recurrence — the second ERI engine (see `ARCHITECTURE.md`, L1).
3//!
4//! A **vertical recurrence (VRR)** builds the
5//! intermediate `[e0|f0]^(m)` classes from `[00|00]^(m)` per primitive quartet,
6//! the primitives are **contracted** into AO space, and a **horizontal recurrence
7//! (HRR)** then shifts angular momentum `A→B` and `C→D` *in the contracted space*.
8//! Doing the HRR after contraction is the HGP "early contraction" trick: the
9//! geometry-only HRR runs **once per shell quartet** instead of once per
10//! primitive quartet, which is the win at high contraction degree.
11//!
12//! It is the engineering counterpart of the [`crate::rys`] engine: same Coulomb
13//! kernel, same row-major `(a,b,c,d)` block layout, validated element-for-element
14//! against it (and against an independent McMurchie–Davidson path).
15//!
16//! ## Method
17//!
18//! For a primitive quartet with exponents `α,β` on the bra centres `A,B` and
19//! `γ,δ` on the ket centres `C,D`, with `p=α+β`, `q=γ+δ`, `P,Q` the Gaussian
20//! product centres, `W=(pP+qQ)/(p+q)`, `ρ=pq/(p+q)`, and `T=ρ|P−Q|²`:
21//!
22//! ```text
23//! [00|00]^(m) = 2π^{5/2}/(p q √(p+q)) · K_ab · K_cd · F_m(T)
24//! ```
25//!
26//! The VRR raises angular momentum on `A` (the bra, index `e`) and `C` (the ket,
27//! index `f`) using the standard OS ERI relations (Obara–Saika 1986; HGP 1988):
28//!
29//! ```text
30//! [e+1_i,0|f0]^(m) = (P−A)_i[e0|f0]^(m) + (W−P)_i[e0|f0]^(m+1)
31//! + e_i/2p ( [e−1_i,0|f0]^(m) − (q/(p+q))[e−1_i,0|f0]^(m+1) )
32//! + f_i/2(p+q) [e0|f−1_i,0]^(m+1)
33//! [e0|f+1_i,0]^(m) = (Q−C)_i[e0|f0]^(m) + (W−Q)_i[e0|f0]^(m+1)
34//! + f_i/2q ( [e0|f−1_i,0]^(m) − (p/(p+q))[e0|f−1_i,0]^(m+1) )
35//! + e_i/2(p+q) [e−1_i,0|f0]^(m+1)
36//! ```
37//!
38//! After contracting `[e0|f0]^(0)` over the primitive quartet, the HRR builds the
39//! Cartesian shell block:
40//!
41//! ```text
42//! (a,b+1_i | f0) = (a+1_i,b | f0) + (A−B)_i (a,b | f0) [bra, A→B]
43//! (ab | c,d+1_i) = (ab | c+1_i,d) + (C−D)_i (ab | c,d) [ket, C→D]
44//! ```
45//!
46//! ## Buffers (see `DESIGN_NOTES.md` D3, D12)
47//!
48//! The VRR table `[e0|f0]^(m)` is **3D and `W`-coupled** (the recurrence mixes
49//! Cartesian axes), so unlike the axis-separable 1D tables of the one-electron and
50//! Rys engines it cannot be a small fixed stack array — a `MAX_L` stack buffer
51//! would be tens of MB, and even a heap copy of the *full* table is ~41 MB at
52//! `(ii|ii)`. The current engine (D12) removes that:
53//!
54//! - **m-marching VRR** (`vrr_primitive`): the VRR never materialises the full
55//! `e×f×m` table. It marches over the ket `f`-degree keeping only a rolling window
56//! of **3 consecutive `f`-degree levels**, each **triangle-packed** in the Boys
57//! index `m`. Resident VRR footprint is `3·max_k[n_cart(k)·slab_k]` (≈ 4.3 MB at
58//! `(ii|ii)`, vs the old 41 MB single table), plus the `n_e·n_f` contracted table.
59//! - **flat-array HRR** (`hrr_and_scatter`): contiguous arrays indexed by `addr`
60//! / [`cart_index`] with two rolling degree layers, replacing the former HashMap
61//! memoisation.
62//! - **reusable arena** ([`EriScratch`]): all buffers are allocated once and reused
63//! across quartets (thread-local by default), not re-allocated per quartet.
64//!
65//! All of this stays in safe Rust (`#![forbid(unsafe_code)]`) and reproduces the
66//! former full-table engine's values, cross-checked against the Rys engine and an
67//! independent McMurchie–Davidson path (`tests/eri_cross_algorithm.rs`).
68
69use std::cell::RefCell;
70
71use integral_math::am::{cart_components, cart_index, n_cart};
72use integral_math::boys::boys_array;
73
74use crate::os::{Vec3, MAX_L};
75
76/// A contracted Cartesian shell as seen by the HGP engine: its centre, angular
77/// momentum, and primitive `(exponent, effective-coefficient)` data. The
78/// coefficients are the driver's *effective* coefficients `d_i · N(α_i, l)` — the
79/// engine itself works on un-normalised monomials and only multiplies the four
80/// coefficients into the contracted accumulator.
81#[derive(Debug, Clone, Copy)]
82pub struct ShellRef<'a> {
83 /// Shell centre (bohr).
84 pub center: Vec3,
85 /// Angular momentum.
86 pub l: usize,
87 /// Primitive exponents.
88 pub exps: &'a [f64],
89 /// Effective contraction coefficients, aligned with `exps`.
90 pub coeffs: &'a [f64],
91}
92
93/// Number of Cartesian triples of degree `0..=lmax`: `(L+1)(L+2)(L+3)/6`.
94#[inline]
95fn n_addr(lmax: usize) -> usize {
96 (lmax + 1) * (lmax + 2) * (lmax + 3) / 6
97}
98
99/// Number of Cartesian triples of degree **strictly less than** `d`:
100/// `d(d+1)(d+2)/6`. This is the cumulative base of degree `d` in `addr`
101/// (`addr(t) = tri_below(deg) + cart_index(t)`) and `n_addr(L) = tri_below(L+1)`.
102#[inline]
103fn tri_below(d: usize) -> usize {
104 d * (d + 1) * (d + 2) / 6
105}
106
107/// Global address of a Cartesian triple within the cumulative `0..=` ordering,
108/// consistent with [`integral_math::am`]'s per-degree canonical order. Bijective onto
109/// `0..n_addr(deg)`.
110#[inline]
111fn addr(t: [usize; 3]) -> usize {
112 let n = t[0] + t[1] + t[2];
113 let base = n * (n + 1) * (n + 2) / 6;
114 // within-degree index: (n-lx)(n-lx+1)/2 + lz
115 let within = (n - t[0]) * (n - t[0] + 1) / 2 + t[2];
116 base + within
117}
118
119/// Reusable scratch arena for the OS/HGP ERI engine.
120///
121/// Holds the engine's working buffers so they are allocated **once and reused**
122/// across shell quartets, instead of a fresh heap allocation per quartet inside the
123/// O(n⁴) shell loop. All buffers are plain `Vec<f64>` grown on demand in safe Rust
124/// (`#![forbid(unsafe_code)]` holds); there is no shared mutable state, so each
125/// thread uses its own instance — the thread-local default behind
126/// [`coulomb_shell_into`], or one passed explicitly to
127/// [`coulomb_shell_into_scratch`] (the `&mut` makes cross-thread sharing a compile
128/// error).
129///
130/// **Memory-correctness.** `c_ef` is an accumulator and is **zeroed per quartet**.
131/// The VRR `levels` and HRR `bra`/layer buffers are fully overwritten in the region
132/// they are later read, so they need no functional zeroing — but in debug builds
133/// `levels` and `bra` are NaN-filled per quartet, so any accidental out-of-range
134/// read (e.g. outside the VRR `m`-triangle) poisons the output and trips the tests
135/// rather than silently reading a stale value from a previous quartet. Results are
136/// therefore independent of evaluation order and of arena reuse (asserted by
137/// `tests/arena.rs`).
138#[derive(Debug, Default, Clone)]
139pub struct EriScratch {
140 /// VRR rolling window: 3 triangle-packed f-degree levels (slot = `k % 3`).
141 levels: Vec<f64>,
142 /// Contracted `[e0|f0]^(0)` accumulator, shape `[n_e·n_f]`.
143 c_ef: Vec<f64>,
144 /// HRR bra intermediate `(a b | f 0)`, shape `[na·nb·nf_range]`.
145 bra: Vec<f64>,
146 /// HRR bra rolling `b`-degree layers.
147 bra_prev: Vec<f64>,
148 bra_cur: Vec<f64>,
149 /// HRR ket rolling `d`-degree layers.
150 ket_prev: Vec<f64>,
151 ket_cur: Vec<f64>,
152}
153
154impl EriScratch {
155 /// A fresh, empty arena; it grows to fit on first use.
156 #[must_use]
157 pub fn new() -> Self {
158 Self::default()
159 }
160
161 /// Total `f64` elements currently held across all buffers — the resident
162 /// working set, for memory reporting/tests.
163 #[must_use]
164 pub fn resident_f64(&self) -> usize {
165 self.levels.len()
166 + self.c_ef.len()
167 + self.bra.len()
168 + self.bra_prev.len()
169 + self.bra_cur.len()
170 + self.ket_prev.len()
171 + self.ket_cur.len()
172 }
173
174 /// Largest single buffer (`f64` elements). Demonstrates the former ~41 MB
175 /// monolithic VRR `[e0|f0]^(m)` table is no longer resident: the m-marching
176 /// window keeps only `3·max_k[n_cart(k)·slab_k]`.
177 #[must_use]
178 pub fn largest_buffer_f64(&self) -> usize {
179 [
180 self.levels.len(),
181 self.c_ef.len(),
182 self.bra.len(),
183 self.bra_prev.len(),
184 self.bra_cur.len(),
185 self.ket_prev.len(),
186 self.ket_cur.len(),
187 ]
188 .into_iter()
189 .max()
190 .unwrap_or(0)
191 }
192}
193
194thread_local! {
195 /// Per-thread default arena backing [`coulomb_shell_into`].
196 static ERI_SCRATCH: RefCell<EriScratch> = RefCell::new(EriScratch::new());
197}
198
199/// Grow `v` to at least `n` elements (reusing existing capacity); never shrinks.
200#[inline]
201fn ensure_len(v: &mut Vec<f64>, n: usize) {
202 if v.len() < n {
203 v.resize(n, 0.0);
204 }
205}
206
207/// Accumulate the contracted Coulomb block `(ab|cd)` for four shells into the
208/// row-major `out` block (shape `[n_cart(la)·n_cart(lb)·n_cart(lc)·n_cart(ld)]`,
209/// the same `(a,b,c,d)` layout as [`crate::rys::coulomb_into`]).
210///
211/// The engine owns the primitive-quartet loop (the HGP contraction happens between
212/// VRR and HRR), so unlike the Rys engine the driver calls this **once per shell
213/// quartet** with all primitives. This uses a **thread-local** [`EriScratch`]; for
214/// explicit control (e.g. one arena per worker thread) call
215/// [`coulomb_shell_into_scratch`].
216///
217/// # Panics
218/// Panics (in debug builds) if any `l > MAX_L` or `out` is too short.
219pub fn coulomb_shell_into(
220 a: ShellRef<'_>,
221 b: ShellRef<'_>,
222 c: ShellRef<'_>,
223 d: ShellRef<'_>,
224 out: &mut [f64],
225) {
226 ERI_SCRATCH.with(|s| coulomb_shell_into_scratch(&mut s.borrow_mut(), a, b, c, d, out));
227}
228
229/// Like [`coulomb_shell_into`] but evaluates into the caller-provided
230/// [`EriScratch`], reused across quartets to avoid per-quartet heap allocation. Use
231/// **one instance per thread** (sharing a `&mut EriScratch` across threads is a
232/// compile error); the result is bit-identical regardless of which arena is used or
233/// what it last held.
234///
235/// # Panics
236/// Panics (in debug builds) if any `l > MAX_L` or `out` is too short.
237pub fn coulomb_shell_into_scratch(
238 scratch: &mut EriScratch,
239 a: ShellRef<'_>,
240 b: ShellRef<'_>,
241 c: ShellRef<'_>,
242 d: ShellRef<'_>,
243 out: &mut [f64],
244) {
245 let (la, lb, lc, ld) = (a.l, b.l, c.l, d.l);
246 debug_assert!(
247 la <= MAX_L && lb <= MAX_L && lc <= MAX_L && ld <= MAX_L,
248 "angular momentum exceeds MAX_L"
249 );
250 let (na, nb, nc, nd) = (n_cart(la), n_cart(lb), n_cart(lc), n_cart(ld));
251 debug_assert!(out.len() >= na * nb * nc * nd, "ERI output block too short");
252
253 let ne = la + lb; // max bra (A-side) degree built by VRR
254 let nf = lc + ld; // max ket (C-side) degree built by VRR
255 let l_total = ne + nf;
256 let n_e = n_addr(ne);
257 let n_f = n_addr(nf);
258
259 let EriScratch {
260 levels,
261 c_ef,
262 bra,
263 bra_prev,
264 bra_cur,
265 ket_prev,
266 ket_cur,
267 } = scratch;
268
269 // Contracted [e0|f0]^(0): zeroed per quartet (it is a `+=` accumulator).
270 ensure_len(c_ef, n_e * n_f);
271 c_ef[..n_e * n_f].fill(0.0);
272
273 // Precompute the triples for each degree once (canonical order).
274 let tri_e: Vec<Vec<[usize; 3]>> = (0..=ne).map(cart_components).collect();
275 let tri_f: Vec<Vec<[usize; 3]>> = (0..=nf).map(cart_components).collect();
276
277 // m-marching VRR scratch. Instead of the full `n_e·n_f·nm`
278 // table (~41 MB at `(ii|ii)`), keep only a rolling window of **3 consecutive
279 // f-degree levels**, each **triangle-packed** in the Boys index `m`: for an
280 // `f`-degree `k`, an `e` of degree `de` stores only `m ∈ 0..=(l_total−de−k)`.
281 // Level `k` lives in slot `k % 3` of `levels`; `slab[k]` is one `f`-component's
282 // packed size and `eoff[k][ae]` is `e`'s offset within it (see `vrr_primitive`).
283 let mut eoff: Vec<Vec<usize>> = Vec::with_capacity(nf + 1);
284 let mut slab: Vec<usize> = Vec::with_capacity(nf + 1);
285 for k in 0..=nf {
286 let mut off = Vec::with_capacity(n_e);
287 let mut run = 0usize;
288 for de in 0..=ne {
289 let mlen = l_total - de - k + 1; // ≥ 1: de + k ≤ ne + nf = l_total
290 for _ in 0..n_cart(de) {
291 off.push(run);
292 run += mlen;
293 }
294 }
295 eoff.push(off);
296 slab.push(run);
297 }
298 let maxlevel = (0..=nf).map(|k| n_cart(k) * slab[k]).max().unwrap_or(1);
299 ensure_len(levels, 3 * maxlevel);
300 // Debug guard: poison the reused VRR window so any out-of-triangle / stale read
301 // surfaces as a NaN in the output (caught by the golden + cross-engine tests),
302 // instead of silently using a previous quartet's value.
303 #[cfg(debug_assertions)]
304 levels[..3 * maxlevel].fill(f64::NAN);
305
306 for (&ea, &ca) in a.exps.iter().zip(a.coeffs.iter()) {
307 for (&eb, &cb) in b.exps.iter().zip(b.coeffs.iter()) {
308 let p = ea + eb;
309 let pc = combine(ea, a.center, eb, b.center, p);
310 let kab = (-(ea * eb / p) * dist2(a.center, b.center)).exp();
311 for (&ec, &cc) in c.exps.iter().zip(c.coeffs.iter()) {
312 for (&ed, &cd) in d.exps.iter().zip(d.coeffs.iter()) {
313 let q = ec + ed;
314 let qc = combine(ec, c.center, ed, d.center, q);
315 let kcd = (-(ec * ed / q) * dist2(c.center, d.center)).exp();
316 let scale = ca * cb * cc * cd;
317 vrr_primitive(
318 p, q, pc, qc, kab, kcd, a.center, c.center, ne, nf, l_total, n_e, n_f,
319 maxlevel, &eoff, &slab, &tri_e, &tri_f, levels, scale, c_ef,
320 );
321 }
322 }
323 }
324 }
325
326 // HRR in contracted space, then scatter the block.
327 let ab = sub(a.center, b.center); // A − B
328 let cd = sub(c.center, d.center); // C − D
329 hrr_and_scatter(
330 la, lb, lc, ld, n_f, c_ef, ab, cd, out, bra, bra_prev, bra_cur, ket_prev, ket_cur,
331 );
332}
333
334/// Build `[e0|f0]^(m)` for one primitive quartet by **m-marching** over the `f`
335/// degree and accumulate `scale · [e0|f0]^(0)` into the contracted `c_ef` table.
336///
337/// Algebraically identical to the former full-table VRR (same OS recurrences, same
338/// term order) — only the storage changes, so it is bit-identical
339/// (B0 golden snapshot). `levels` holds 3 rolling `f`-degree levels in slots
340/// `k % 3`, each `maxlevel` long; within slot `k`, element `[e0|f0]^(m)` for the
341/// `f`-component with local index `lf` (= `cart_index(f)`) lives at
342/// `(k%3)·maxlevel + lf·slab[k] + eoff[k][addr(e)] + m`. The `m`-rows are
343/// triangle-packed (`m ∈ 0..=(l_total−de−k)`), so the full `[e0|f0]^(m)` table is
344/// never resident. Each level's `m=0` slice is extracted into `c_ef` before its
345/// buffer is recycled.
346#[allow(clippy::too_many_arguments)]
347fn vrr_primitive(
348 p: f64,
349 q: f64,
350 pc: Vec3,
351 qc: Vec3,
352 kab: f64,
353 kcd: f64,
354 a_center: Vec3,
355 c_center: Vec3,
356 ne: usize,
357 nf: usize,
358 l_total: usize,
359 n_e: usize,
360 n_f: usize,
361 maxlevel: usize,
362 eoff: &[Vec<usize>],
363 slab: &[usize],
364 tri_e: &[Vec<[usize; 3]>],
365 tri_f: &[Vec<[usize; 3]>],
366 levels: &mut [f64],
367 scale: f64,
368 c_ef: &mut [f64],
369) {
370 let pq = p + q;
371 let rho = p * q / pq;
372 let pq_vec = sub(pc, qc); // P − Q
373 let t = rho * norm2(pq_vec);
374 let w = [
375 (p * pc[0] + q * qc[0]) / pq,
376 (p * pc[1] + q * qc[1]) / pq,
377 (p * pc[2] + q * qc[2]) / pq,
378 ];
379 let pa = sub(pc, a_center); // P − A
380 let wp = sub(w, pc); // W − P
381 let qcen = sub(qc, c_center); // Q − C
382 let wq = sub(w, qc); // W − Q
383
384 // Index of `[e0|f0]^(m)` (f-component local index `lf`, e at `addr(e)=ae`) in
385 // the rolling buffer for f-degree `k`. Reads/writes `levels` separately (f64 is
386 // Copy, so indexing never holds a borrow), so the same `levels` slice serves as
387 // source and destination across the distinct slots `k`, `k−1`, `k−2`.
388 let off = |k: usize, lf: usize, ae: usize, m: usize| -> usize {
389 (k % 3) * maxlevel + lf * slab[k] + eoff[k][ae] + m
390 };
391
392 // Base [00|00]^(m) = pref · F_m(T). The Boys index m reaches
393 // l_total = la+lb+lc+ld ≤ 4·MAX_L (NOT 2·MAX_L — that is the per-electron
394 // bound; an ERI couples both electrons).
395 use std::f64::consts::PI;
396 let pref = 2.0 * PI * PI * PI.sqrt() / (p * q * pq.sqrt()) * kab * kcd;
397 let mut fm = [0.0f64; 4 * MAX_L + 1];
398 boys_array(l_total, t, &mut fm[..=l_total]);
399 for m in 0..=l_total {
400 levels[off(0, 0, 0, m)] = pref * fm[m];
401 }
402
403 let inv_2p = 0.5 / p;
404 let inv_2q = 0.5 / q;
405 let inv_2pq = 0.5 / pq;
406 let q_over_pq = q / pq;
407 let p_over_pq = p / pq;
408
409 // Phase A — bra ladder (f-degree 0, level 0): build [e0|00]^(m) by raising e.
410 // Level 0 lives in slot 0, f-component 0, so the buffer offset of `[e0|00]^(m)`
411 // is `eoff0[addr(e)] + m`; the `e`-offsets are hoisted out of the hot m-loop.
412 let eoff0 = &eoff[0];
413 for (na, te_list) in tri_e.iter().enumerate().take(ne + 1).skip(1) {
414 for &te in te_list {
415 let i = lower_axis(te);
416 let s1a = addr(dec(te, i));
417 let mmax = l_total - na;
418 let coef2 = ((te[i] - 1) as f64) * inv_2p; // (e_i of source)/2p
419 let has2 = te[i] >= 2;
420 let s1_0 = eoff0[s1a];
421 let s2_0 = if has2 {
422 eoff0[addr(dec(dec(te, i), i))]
423 } else {
424 0
425 };
426 let dst_0 = eoff0[addr(te)];
427 for m in 0..=mmax {
428 let mut v = pa[i] * levels[s1_0 + m] + wp[i] * levels[s1_0 + m + 1];
429 if has2 {
430 v += coef2 * (levels[s2_0 + m] - q_over_pq * levels[s2_0 + m + 1]);
431 }
432 levels[dst_0 + m] = v;
433 }
434 }
435 }
436 // Extract level 0 (f = [0,0,0], local 0, addr 0): contract m = 0.
437 for ae in 0..n_e {
438 c_ef[ae * n_f] += scale * levels[off(0, 0, ae, 0)];
439 }
440
441 // Phase B — ket raise: march f-degree k = 1..=nf, keeping levels {k, k−1, k−2}.
442 // Slot/slab/eoff for the three active levels are hoisted out of the hot loops; the
443 // m-loop touches only `levels[base + m]` (base computed once per (tf, te)), matching
444 // the former full-table inner-loop cost. The k−2 level is read only when the axis
445 // power `tf[j] ≥ 2` (which forces k ≥ 2), so its `k−2` indices never underflow.
446 for k in 1..=nf {
447 let slot_k = (k % 3) * maxlevel;
448 let slot_k1 = ((k - 1) % 3) * maxlevel;
449 let (slab_k, slab_k1) = (slab[k], slab[k - 1]);
450 let eoff_k = &eoff[k];
451 let eoff_k1 = &eoff[k - 1];
452 let (slot_k2, slab_k2, eoff_k2): (usize, usize, &[usize]) = if k >= 2 {
453 (((k - 2) % 3) * maxlevel, slab[k - 2], &eoff[k - 2])
454 } else {
455 (0, 0, &[])
456 };
457 for &tf in &tri_f[k] {
458 let j = lower_axis(tf);
459 let f1 = dec(tf, j);
460 let lf1 = cart_index(f1); // local index in level k−1
461 let coef2 = ((tf[j] - 1) as f64) * inv_2q;
462 let has2 = tf[j] >= 2;
463 let lf2 = if has2 { cart_index(dec(f1, j)) } else { 0 }; // level k−2
464 let lf = cart_index(tf); // local index in level k
465 let sk = slot_k + lf * slab_k;
466 let sk1 = slot_k1 + lf1 * slab_k1;
467 let sk2 = slot_k2 + lf2 * slab_k2;
468 for (nadeg, te_list) in tri_e.iter().enumerate().take(ne + 1) {
469 for &te in te_list {
470 let ea = addr(te);
471 // cross term e_j/2(p+q) · [e−1_j,0|f−1_j,0]^(m+1)
472 let has_cross = te[j] >= 1;
473 let (cross_coef, cross_0) = if has_cross {
474 (te[j] as f64 * inv_2pq, sk1 + eoff_k1[addr(dec(te, j))])
475 } else {
476 (0.0, 0)
477 };
478 let mmax = l_total - nadeg - k;
479 let dst_0 = sk + eoff_k[ea];
480 let src1_0 = sk1 + eoff_k1[ea];
481 let src2_0 = if has2 { sk2 + eoff_k2[ea] } else { 0 };
482 for m in 0..=mmax {
483 let mut v = qcen[j] * levels[src1_0 + m] + wq[j] * levels[src1_0 + m + 1];
484 if has2 {
485 v += coef2 * (levels[src2_0 + m] - p_over_pq * levels[src2_0 + m + 1]);
486 }
487 if has_cross {
488 v += cross_coef * levels[cross_0 + m + 1];
489 }
490 levels[dst_0 + m] = v;
491 }
492 }
493 }
494 }
495 // Extract level k: contract each f-component's m = 0 slice into c_ef.
496 for &tf in &tri_f[k] {
497 let lf = cart_index(tf);
498 let af = addr(tf);
499 for ae in 0..n_e {
500 c_ef[ae * n_f + af] += scale * levels[off(k, lf, ae, 0)];
501 }
502 }
503 }
504}
505
506/// HRR in contracted space (bra `A→B`, then ket `C→D`) followed by scatter into
507/// the row-major output block.
508///
509/// Flat-array HGP horizontal recurrence, replacing the earlier
510/// HashMap memoisation. The recurrence math is unchanged — same `lower_axis`
511/// choice, same `(A−B)/(C−D)` geometric shifts, same `[raised] + shift·[same]`
512/// term order — so it is **bit-identical** to the recursive version (guarded by
513/// the B0 golden snapshot). Only the storage changes: contiguous arrays indexed
514/// by `addr` / [`cart_index`] instead of hashed `(triple,…)` keys.
515///
516/// The bra recurrence `(a,b|f) = (a+1_i,b−1_i|f) + (A−B)_i (a,b−1_i|f)` (axis
517/// `i = lower_axis(b)`) is built by ascending `b`-degree with two rolling layers,
518/// independently per ket index `f` (a spectator), into the `bra` intermediate.
519/// The ket recurrence `(ab|c,d) = (ab|c+1_j,d−1_j) + (C−D)_j (ab|c,d−1_j)` is the
520/// symmetric pass over `c,d`, run per `(a,b)` output pair and scattered into `out`.
521/// Resident scratch is `O(na·nb·nf_range)` for the bra intermediate plus two small
522/// rolling layers — never the dense `(a,b,f)`/`(a,b,c,d)` key space.
523#[allow(clippy::too_many_arguments)]
524fn hrr_and_scatter<'a>(
525 la: usize,
526 lb: usize,
527 lc: usize,
528 ld: usize,
529 n_f: usize,
530 c_ef: &[f64],
531 ab: Vec3,
532 cd: Vec3,
533 out: &mut [f64],
534 bra: &mut Vec<f64>,
535 mut prev: &'a mut Vec<f64>,
536 mut cur: &'a mut Vec<f64>,
537 mut kprev: &'a mut Vec<f64>,
538 mut kcur: &'a mut Vec<f64>,
539) {
540 let (na, nb, nc, nd) = (n_cart(la), n_cart(lb), n_cart(lc), n_cart(ld));
541 let ne = la + lb; // max bra (A-side) degree present in c_ef
542 let nf = lc + ld; // max ket (C-side) degree present in c_ef
543 let n_e = n_addr(ne);
544
545 // Cartesian triples by degree, reused across the recurrences.
546 let tri: Vec<Vec<[usize; 3]>> = (0..=ne.max(nf)).map(cart_components).collect();
547
548 // Ket index range actually used: f-degrees [lc..=nf]. The bra intermediate and
549 // the ket base are keyed by the global `addr(f)` offset by `f_base`.
550 let f_base = tri_below(lc);
551 let nf_range = n_f - f_base;
552
553 // --- Bra HRR: bra[(ia·nb+ib)·nf_range + (addr(f)−f_base)] = (a_ia b_ib | f 0).
554 // Reused arena buffers; bra is fully overwritten below, the rolling
555 // layers in the region read — debug NaN-fill bra so a missed write surfaces.
556 let bra_len = na * nb * nf_range;
557 ensure_len(bra, bra_len);
558 #[cfg(debug_assertions)]
559 bra[..bra_len].fill(f64::NAN);
560 // Two rolling b-degree layers, indexed [cart_index(b)·n_e + addr(a)].
561 let layer_len = n_cart(lb) * n_e;
562 ensure_len(prev, layer_len);
563 ensure_len(cur, layer_len);
564 for f_global in f_base..n_f {
565 let jf = f_global - f_base;
566 // Base b-degree 0 (one component, within-index 0): (a,0|f) = c_ef[a][f].
567 for &a in tri[la..=ne].iter().flatten() {
568 let ae = addr(a);
569 prev[ae] = c_ef[ae * n_f + f_global];
570 }
571 for kb in 1..=lb {
572 for (ibw, &b) in tri[kb].iter().enumerate() {
573 let i = lower_axis(b);
574 let b1w = cart_index(dec(b, i));
575 // a-degrees that can still reach (la, lb): [la..=ne−kb].
576 for &a in tri[la..=(ne - kb)].iter().flatten() {
577 let ae = addr(a);
578 let a1e = addr(inc(a, i));
579 cur[ibw * n_e + ae] = prev[b1w * n_e + a1e] + ab[i] * prev[b1w * n_e + ae];
580 }
581 }
582 std::mem::swap(&mut prev, &mut cur);
583 }
584 // `prev` now holds b-degree lb (or the base when lb = 0): extract.
585 for (ib, &b) in tri[lb].iter().enumerate() {
586 let ibw = cart_index(b); // == ib (tri[lb] is in cart order)
587 for (ia, &a) in tri[la].iter().enumerate() {
588 bra[(ia * nb + ib) * nf_range + jf] = prev[ibw * n_e + addr(a)];
589 }
590 }
591 }
592
593 // --- Ket HRR: per (ia,ib), build (c,d) and scatter into out.
594 let klayer_len = n_cart(ld) * n_f;
595 ensure_len(kprev, klayer_len);
596 ensure_len(kcur, klayer_len);
597 for ia in 0..na {
598 for ib in 0..nb {
599 let brarow = (ia * nb + ib) * nf_range;
600 // Base d-degree 0: (ab|c,0) = bra[ia][ib][c], c-degrees [lc..=nf].
601 for &c in tri[lc..=nf].iter().flatten() {
602 let ce = addr(c);
603 kprev[ce] = bra[brarow + (ce - f_base)];
604 }
605 for kd in 1..=ld {
606 for (idw, &d) in tri[kd].iter().enumerate() {
607 let j = lower_axis(d);
608 let d1w = cart_index(dec(d, j));
609 for &c in tri[lc..=(nf - kd)].iter().flatten() {
610 let ce = addr(c);
611 let c1e = addr(inc(c, j));
612 kcur[idw * n_f + ce] =
613 kprev[d1w * n_f + c1e] + cd[j] * kprev[d1w * n_f + ce];
614 }
615 }
616 std::mem::swap(&mut kprev, &mut kcur);
617 }
618 // `kprev` now holds d-degree ld (or the base when ld = 0): scatter into out.
619 for (ic, &c) in tri[lc].iter().enumerate() {
620 let ce = addr(c);
621 for id in 0..nd {
622 out[((ia * nb + ib) * nc + ic) * nd + id] += kprev[id * n_f + ce];
623 }
624 }
625 }
626 }
627}
628
629// --- small helpers ---
630
631/// First nonzero axis of a triple (the lowering direction).
632#[inline]
633fn lower_axis(t: [usize; 3]) -> usize {
634 if t[0] > 0 {
635 0
636 } else if t[1] > 0 {
637 1
638 } else {
639 2
640 }
641}
642
643#[inline]
644fn dec(mut t: [usize; 3], i: usize) -> [usize; 3] {
645 t[i] -= 1;
646 t
647}
648
649#[inline]
650fn inc(mut t: [usize; 3], i: usize) -> [usize; 3] {
651 t[i] += 1;
652 t
653}
654
655#[inline]
656fn combine(a: f64, ca: Vec3, b: f64, cb: Vec3, p: f64) -> Vec3 {
657 [
658 (a * ca[0] + b * cb[0]) / p,
659 (a * ca[1] + b * cb[1]) / p,
660 (a * ca[2] + b * cb[2]) / p,
661 ]
662}
663
664#[inline]
665fn sub(u: Vec3, v: Vec3) -> Vec3 {
666 [u[0] - v[0], u[1] - v[1], u[2] - v[2]]
667}
668
669#[inline]
670fn dist2(u: Vec3, v: Vec3) -> f64 {
671 norm2(sub(u, v))
672}
673
674#[inline]
675fn norm2(u: Vec3) -> f64 {
676 u[0] * u[0] + u[1] * u[1] + u[2] * u[2]
677}
678
679#[cfg(test)]
680mod tests {
681 use super::*;
682
683 /// Single-primitive `ShellRef` helper.
684 fn s(center: Vec3, l: usize, exp: f64) -> (Vec3, usize, [f64; 1], [f64; 1]) {
685 (center, l, [exp], [1.0])
686 }
687
688 /// (ss|ss) over four unit s primitives must equal the closed form
689 /// `2π^{5/2}/(p q √(p+q)) · K_ab · K_cd · F_0(T)` — the same check the Rys
690 /// engine passes, so the two share a verified base case.
691 #[test]
692 fn ssss_matches_closed_form() {
693 let (ac, al, ae, acf) = s([0.0, 0.0, 0.0], 0, 0.8);
694 let (bc, bl, be, bcf) = s([0.0, 0.0, 0.7], 0, 1.3);
695 let (cc, cl, ce, ccf) = s([0.4, 0.0, 0.0], 0, 0.5);
696 let (dc, dl, de, dcf) = s([0.0, 0.6, 0.2], 0, 1.1);
697 let mut out = [0.0; 1];
698 coulomb_shell_into(
699 ShellRef {
700 center: ac,
701 l: al,
702 exps: &ae,
703 coeffs: &acf,
704 },
705 ShellRef {
706 center: bc,
707 l: bl,
708 exps: &be,
709 coeffs: &bcf,
710 },
711 ShellRef {
712 center: cc,
713 l: cl,
714 exps: &ce,
715 coeffs: &ccf,
716 },
717 ShellRef {
718 center: dc,
719 l: dl,
720 exps: &de,
721 coeffs: &dcf,
722 },
723 &mut out,
724 );
725
726 let p = 0.8 + 1.3;
727 let q = 0.5 + 1.1;
728 let pcen = combine(0.8, ac, 1.3, bc, p);
729 let qcen = combine(0.5, cc, 1.1, dc, q);
730 let kab = (-(0.8 * 1.3 / p) * dist2(ac, bc)).exp();
731 let kcd = (-(0.5 * 1.1 / q) * dist2(cc, dc)).exp();
732 let rho = p * q / (p + q);
733 let t = rho * dist2(pcen, qcen);
734 let mut fm = [0.0; 1];
735 boys_array(0, t, &mut fm);
736 use std::f64::consts::PI;
737 let expect = 2.0 * PI * PI * PI.sqrt() / (p * q * (p + q).sqrt()) * kab * kcd * fm[0];
738 assert!(
739 (out[0] - expect).abs() < 1e-14 * expect.abs(),
740 "ssss {} vs {}",
741 out[0],
742 expect
743 );
744 }
745
746 use crate::os::Prim;
747 use crate::rys::coulomb_into;
748
749 /// Build a single-primitive OS/HGP block for a quartet of given `(l, exp,
750 /// center)`, for cross-checking against the Rys engine.
751 #[allow(clippy::too_many_arguments)]
752 fn os_block(
753 la: usize,
754 ea: f64,
755 ca: Vec3,
756 lb: usize,
757 eb: f64,
758 cb: Vec3,
759 lc: usize,
760 recc: f64,
761 ccc: Vec3,
762 ld: usize,
763 ed: f64,
764 cdd: Vec3,
765 ) -> Vec<f64> {
766 let mut out = vec![0.0; n_cart(la) * n_cart(lb) * n_cart(lc) * n_cart(ld)];
767 let (ea1, eb1, ec1, ed1) = ([ea], [eb], [recc], [ed]);
768 let one = [1.0];
769 coulomb_shell_into(
770 ShellRef {
771 center: ca,
772 l: la,
773 exps: &ea1,
774 coeffs: &one,
775 },
776 ShellRef {
777 center: cb,
778 l: lb,
779 exps: &eb1,
780 coeffs: &one,
781 },
782 ShellRef {
783 center: ccc,
784 l: lc,
785 exps: &ec1,
786 coeffs: &one,
787 },
788 ShellRef {
789 center: cdd,
790 l: ld,
791 exps: &ed1,
792 coeffs: &one,
793 },
794 &mut out,
795 );
796 out
797 }
798
799 /// OS/HGP must reproduce the Rys engine element-for-element on a single
800 /// primitive quartet, across a sweep of angular momenta (including the
801 /// bug-prone mixed-high-L and "all four different L on four centers" cases).
802 #[test]
803 fn matches_rys_engine_primitive_sweep() {
804 let ca = [0.0, 0.0, 0.0];
805 let cb = [0.5, -0.3, 0.2];
806 let cc = [-0.4, 0.6, -0.1];
807 let cd = [0.2, 0.4, 0.8];
808 let (ea, eb, ec, ed) = (0.9, 1.3, 0.7, 1.1);
809
810 let quartets = [
811 (0usize, 0usize, 0usize, 0usize),
812 (1, 0, 0, 0),
813 (0, 0, 1, 0),
814 (1, 1, 0, 0),
815 (1, 0, 1, 0),
816 (1, 1, 1, 1),
817 (2, 0, 0, 0),
818 (2, 1, 0, 0),
819 (2, 1, 2, 1),
820 (0, 1, 2, 3), // four different L on four centers
821 (2, 2, 3, 3), // (dd|ff) mixed high-L
822 (3, 0, 0, 1),
823 // l_total ≥ 13 guards: the Boys aux index m reaches la+lb+lc+ld, which
824 // exceeds 2·MAX_L. These panicked the under-sized fm buffer (a real bug
825 // a ≤(dd|ff) sweep missed) and are kept as permanent regression guards.
826 (4, 4, 4, 1), // l_total = 13
827 (6, 6, 1, 0), // l_total = 13, two i-shells
828 (3, 3, 3, 3), // ffff: the cancellation-heavy mixed case
829 ];
830 for (la, lb, lc, ld) in quartets {
831 let os = os_block(la, ea, ca, lb, eb, cb, lc, ec, cc, ld, ed, cd);
832 let mut rys = vec![0.0; os.len()];
833 coulomb_into(
834 Prim::new(ea, ca, la),
835 Prim::new(eb, cb, lb),
836 Prim::new(ec, cc, lc),
837 Prim::new(ed, cd, ld),
838 1.0,
839 &mut rys,
840 );
841 assert_cross_engine_close(&os, &rys, &format!("({la}{lb}|{lc}{ld})"));
842 }
843 }
844
845 /// Assert two ERI blocks (here OS/HGP vs Rys, two independent f64 recurrences)
846 /// agree under `|o − r| ≤ atol + rtol·|r|` with `atol = 1e-11`, `rtol = 1e-10`.
847 /// The atol floor absorbs benign near-cancellation on structurally tiny
848 /// components (where the *relative* OS/Rys difference reaches ~1e-8 at high L
849 /// even though both engines are correct — their dominant elements agree to
850 /// ~1e-12); the rtol catches any real divergence.
851 fn assert_cross_engine_close(os: &[f64], rys: &[f64], tag: &str) {
852 const ATOL: f64 = 1e-11;
853 const RTOL: f64 = 1e-10;
854 for (o, r) in os.iter().zip(rys.iter()) {
855 assert!(
856 (o - r).abs() <= ATOL + RTOL * r.abs(),
857 "{tag} OS vs Rys mismatch: {o} vs {r} (Δ={:e})",
858 (o - r).abs()
859 );
860 }
861 }
862
863 /// HGP early contraction (VRR per primitive, HRR once in contracted space)
864 /// must equal the per-primitive Rys sum for a genuinely **contracted**
865 /// quartet.
866 ///
867 /// Note (`DESIGN_NOTES.md` D6): doing the HRR after contraction is a *performance*
868 /// choice, **not** a correctness fork. The HRR operator is linear with
869 /// exponent-independent geometric coefficients (`A−B`, `C−D`), so
870 /// `HRR(Σ_p w_p·[e0|f0]_p) = Σ_p w_p·HRR([e0|f0]_p)` identically — per-primitive
871 /// and post-contraction HRR give the *same* answer. This remains a valuable
872 /// end-to-end check (a broken VRR cross-term, a mis-applied contraction
873 /// coefficient, or a wrong HRR sign would all fail it), but it does not
874 /// distinguish HRR *ordering*, because the two orders are algebraically equal.
875 #[test]
876 fn contracted_quartet_matches_rys_sum() {
877 // Two p shells and a d shell, each with multiple primitives, on distinct
878 // centres (so HRR shift vectors A−B, C−D are all non-trivial).
879 let ca = [0.0, 0.0, 0.0];
880 let cb = [0.6, -0.2, 0.1];
881 let cc = [-0.3, 0.5, -0.2];
882 let cd = [0.2, 0.3, 0.7];
883 let (la, lb, lc, ld) = (1usize, 1usize, 2usize, 0usize);
884 let ax = [1.4, 0.45];
885 let acf = [0.6, 0.5];
886 let bx = [0.9, 0.3];
887 let bcf = [0.55, 0.5];
888 let cx = [1.1, 0.4];
889 let ccf = [0.7, 0.4];
890 let dx = [0.8];
891 let dcf = [1.0];
892
893 let mut os = vec![0.0; n_cart(la) * n_cart(lb) * n_cart(lc) * n_cart(ld)];
894 coulomb_shell_into(
895 ShellRef {
896 center: ca,
897 l: la,
898 exps: &ax,
899 coeffs: &acf,
900 },
901 ShellRef {
902 center: cb,
903 l: lb,
904 exps: &bx,
905 coeffs: &bcf,
906 },
907 ShellRef {
908 center: cc,
909 l: lc,
910 exps: &cx,
911 coeffs: &ccf,
912 },
913 ShellRef {
914 center: cd,
915 l: ld,
916 exps: &dx,
917 coeffs: &dcf,
918 },
919 &mut os,
920 );
921
922 // Reference: sum the Rys primitive engine over the quartet with the same
923 // effective coefficients.
924 let mut rys = vec![0.0; os.len()];
925 for (&ea, &wa) in ax.iter().zip(acf.iter()) {
926 for (&eb, &wb) in bx.iter().zip(bcf.iter()) {
927 for (&ec, &wc) in cx.iter().zip(ccf.iter()) {
928 for (&ed, &wd) in dx.iter().zip(dcf.iter()) {
929 coulomb_into(
930 Prim::new(ea, ca, la),
931 Prim::new(eb, cb, lb),
932 Prim::new(ec, cc, lc),
933 Prim::new(ed, cd, ld),
934 wa * wb * wc * wd,
935 &mut rys,
936 );
937 }
938 }
939 }
940 }
941
942 for (o, r) in os.iter().zip(rys.iter()) {
943 assert!(
944 (o - r).abs() <= 1e-10 * r.abs().max(1e-12),
945 "contracted OS vs Rys mismatch: {o} vs {r}"
946 );
947 }
948 }
949
950 /// The triple-address map must be a bijection onto `0..n_addr(L)`.
951 #[test]
952 fn addr_is_bijective() {
953 for lmax in 0..=6 {
954 let mut seen = vec![false; n_addr(lmax)];
955 for n in 0..=lmax {
956 for t in cart_components(n) {
957 let a = addr(t);
958 assert!(a < n_addr(lmax), "addr {a} out of range for lmax {lmax}");
959 assert!(!seen[a], "addr collision at {t:?}");
960 seen[a] = true;
961 }
962 }
963 assert!(
964 seen.iter().all(|&x| x),
965 "addr not surjective at lmax {lmax}"
966 );
967 }
968 }
969}