// integral 0.1.1 - Docs.rs

//! One- and two-electron integral builders over a [`Basis`].
//!
//! Each one-electron builder loops over shell pairs, contracts primitives with
//! their effective coefficients (`d_i · N(α_i, l)`), applies each shell's `c2s`
//! transform (identity for Cartesian shells), and places the resulting block
//! into a dense `nao × nao` matrix (row-major). Dipole returns three such
//! matrices for the `x`, `y`, `z` components about a chosen origin.
//!
//! The two-electron builder ([`Basis::eri`]) produces the dense repulsion
//! tensor, dispatching each shell quartet to the OS/HGP or Rys engine (see
//! [`Engine`] and [`select_engine`]); see its docs for the layout.

use integral_core::os_eri::{self, ShellRef};
use integral_core::{os, rys};

use crate::shell::{Basis, Shell};
use crate::spherical::{shell_transform, transform_block};

/// Convert a contracted one-electron block from Cartesian components to each
/// shell's function space.
///
/// `block` is row-major `sa.n_cart() × sb.n_cart()`. Each shell's `c2s`
/// transform is looked up with `shell_transform` (`None` means a Cartesian
/// shell, which passes through unchanged) and applied along that shell's axis,
/// giving a row-major `na_func × nb_func` block.
pub(crate) fn to_func_1e(block: Vec<f64>, sa: &Shell, sb: &Shell) -> Vec<f64> {
    let bra_c2s = shell_transform(sa);
    let ket_c2s = shell_transform(sb);
    let cart_dims = [sa.n_cart(), sb.n_cart()];
    let transforms = [bra_c2s.as_deref(), ket_c2s.as_deref()];
    transform_block(block, &cart_dims, &transforms)
}

/// Transform a contracted Cartesian ERI quartet block into function space by
/// applying each of the four shells' `c2s` transforms, computed on the spot.
/// The dense `O(n⁴)` builders precompute the transforms once per shell and call
/// [`to_func_eri_cached`] instead (building a `c2s` matrix costs far more than
/// applying it).
pub(crate) fn to_func_eri(
    block: Vec<f64>,
    sa: &Shell,
    sb: &Shell,
    sc: &Shell,
    sd: &Shell,
) -> Vec<f64> {
    let mats = [
        shell_transform(sa),
        shell_transform(sb),
        shell_transform(sc),
        shell_transform(sd),
    ];
    to_func_eri_cached(
        block,
        [sa, sb, sc, sd],
        [
            mats[0].as_deref(),
            mats[1].as_deref(),
            mats[2].as_deref(),
            mats[3].as_deref(),
        ],
    )
}

/// Same result as [`to_func_eri`], but the caller supplies each shell's
/// transform (`None` = Cartesian shell, identity) instead of having it rebuilt.
///
/// `block` is row-major over the four shells' Cartesian component counts.
fn to_func_eri_cached(block: Vec<f64>, s: [&Shell; 4], mats: [Option<&[f64]>; 4]) -> Vec<f64> {
    let cart_dims = s.map(|shell| shell.n_cart());
    transform_block(block, &cart_dims, &mats)
}

/// Selects the two-electron engine used to evaluate an ERI shell quartet.
///
/// The choice is **engine-transparent** for correctness: both engines compute
/// the same Coulomb integral to the documented tolerance. [`Engine::OsHgp`] and
/// [`Engine::Rys`] exist so a path can be forced (e.g. from tests/CI) and both
/// engines exercised on the same cases; [`Engine::Auto`] defers to the dispatch
/// policy in [`select_engine`].
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum Engine {
    /// Let the dispatch policy decide per quartet, from
    /// `(total angular momentum, contraction degree)`. This is the default.
    #[default]
    Auto,
    /// Always use the Obara–Saika / Head-Gordon–Pople engine (suited to low L
    /// and high contraction).
    OsHgp,
    /// Always use the Rys-quadrature engine (general; the high-L fallback).
    Rys,
}

/// Dispatch policy behind [`Engine::Auto`]: choose OS/HGP or Rys for one shell
/// quartet.
///
/// * `l_total` — the quartet's total angular momentum `la+lb+lc+ld`.
/// * `contraction_degree` — its number of primitive quartets,
///   `n_prim_a·n_prim_b·n_prim_c·n_prim_d`.
///
/// Returns [`Engine::OsHgp`] or [`Engine::Rys`]; never [`Engine::Auto`].
///
/// HGP's per-primitive VRR is cheaper than Rys's per-primitive roots/weights,
/// and its geometry-only HRR is amortised once per shell quartet, so OS/HGP wins
/// at low L and/or high contraction. As L grows the HGP VRR/HRR tables blow up
/// and Rys's small-footprint 2D recurrences win.
///
/// The crossover thresholds come from an on-host benchmark (see
/// `DESIGN_NOTES.md` D13), measured with the current Rys-roots interpolation
/// (D11) and the OS/HGP flat-HRR / m-marching engine (D12). The crossover sits
/// markedly toward OS/HGP:
///
/// | `l_total` | OS/HGP when `contraction_degree ≥` |
/// |-----------|-----------------------------------|
/// | 0‥=4      | 1   (always OS)                   |
/// | 5‥=16     | 16  (≈ K ≥ 2 per shell)           |
/// | ≥ 17      | never (Rys)                       |
///
/// Measured medians backing each band (D13):
///
/// * `l_total 0–4` — L0_K1 OS 2.3×, L1_K1 OS 1.01× (tie).
/// * `l_total 5–16` — L2/L3/L4 at deg 1 favour Rys; at deg 16 OS wins 1.64–2.42×.
/// * `l_total ≥ 17` — L5_K1/L6_K1 favour Rys; the high-contraction corner there
///   is unmeasured, so the cap stays.
///
/// The thresholds are **calibrated to this engine state**; a future change to
/// either engine's constant re-opens the calibration.
#[must_use]
pub fn select_engine(l_total: usize, contraction_degree: usize) -> Engine {
    let os_wins = match l_total {
        // OS wins (or ties at L1_K1) all the way down to deg 1.
        0..=4 => contraction_degree >= 1,
        // OS once contraction clears deg 16 (K≥2/shell); Rys below that.
        5..=16 => contraction_degree >= 16,
        // l_total ≥ 17: no measured OS win at high degree, so always Rys.
        _ => false,
    };
    if os_wins {
        Engine::OsHgp
    } else {
        Engine::Rys
    }
}

/// Copy a row-major block into a row-major `n × n` matrix, with the block's
/// top-left corner at `(row_off, col_off)`.
///
/// * `mat` — flat destination matrix with `n` columns per row.
/// * `block` — flat source with `nb` columns per row; the row count is
///   `block.len() / nb` (a trailing partial row, if any, is ignored).
///
/// An empty `block` is a no-op whatever `nb` is, so a zero-sized block no longer
/// trips a division by zero.
///
/// # Panics
///
/// Panics if a destination row runs past the end of `mat`, or if `nb == 0` for
/// a non-empty `block`.
pub(crate) fn place_block(
    mat: &mut [f64],
    n: usize,
    row_off: usize,
    col_off: usize,
    block: &[f64],
    nb: usize,
) {
    if block.is_empty() {
        return;
    }
    // Each source row is contiguous in both buffers, so copy it in one go.
    for (i, row) in block.chunks_exact(nb).enumerate() {
        let start = (row_off + i) * n + col_off;
        mat[start..start + nb].copy_from_slice(row);
    }
}

/// Build the contracted `na × nb` Cartesian block for one shell pair.
///
/// `prim_op` is called once per primitive pair and must accumulate
/// `scale · ⟨a|·|b⟩` into the block it is handed; `scale` is the product of the
/// two primitives' effective coefficients. The block starts zeroed.
fn contract_pair<F>(sa: &Shell, sb: &Shell, mut prim_op: F) -> Vec<f64>
where
    F: FnMut(os::Prim, os::Prim, f64, &mut [f64]),
{
    let (n_prim_a, n_prim_b) = (sa.n_prim(), sb.n_prim());
    let mut acc = vec![0.0; sa.n_cart() * sb.n_cart()];
    for ia in 0..n_prim_a {
        let coeff_a = sa.primitive_coeff(ia);
        for ib in 0..n_prim_b {
            let weight = coeff_a * sb.primitive_coeff(ib);
            prim_op(sa.prim(ia), sb.prim(ib), weight, &mut acc);
        }
    }
    acc
}

impl Basis {
    /// Overlap matrix `S_{μν} = ⟨μ|ν⟩`, dense row-major `nao × nao`.
    ///
    /// Every shell pair is contracted, taken to function space, and written at
    /// the pair's AO offsets.
    #[must_use]
    pub fn overlap(&self) -> Vec<f64> {
        let shells = self.shells();
        let offsets = self.offsets();
        let dim = self.nao();
        let mut s_mat = vec![0.0; dim * dim];
        for (ia, bra) in shells.iter().enumerate() {
            for (ib, ket) in shells.iter().enumerate() {
                let cart = contract_pair(bra, ket, os::overlap_into);
                let func = to_func_1e(cart, bra, ket);
                place_block(&mut s_mat, dim, offsets[ia], offsets[ib], &func, ket.n_func());
            }
        }
        s_mat
    }

    /// Kinetic-energy matrix `T_{μν} = ⟨μ| -½∇² |ν⟩`, dense row-major
    /// `nao × nao`.
    ///
    /// Every shell pair is contracted, taken to function space, and written at
    /// the pair's AO offsets.
    #[must_use]
    pub fn kinetic(&self) -> Vec<f64> {
        let shells = self.shells();
        let offsets = self.offsets();
        let dim = self.nao();
        let mut t_mat = vec![0.0; dim * dim];
        for (ia, bra) in shells.iter().enumerate() {
            for (ib, ket) in shells.iter().enumerate() {
                let cart = contract_pair(bra, ket, os::kinetic_into);
                let func = to_func_1e(cart, bra, ket);
                place_block(&mut t_mat, dim, offsets[ia], offsets[ib], &func, ket.n_func());
            }
        }
        t_mat
    }

    /// Nuclear-attraction matrix `V_{μν} = Σ_C ⟨μ| −Z_C/|r−C| |ν⟩`, dense
    /// row-major `nao × nao`.
    ///
    /// `charges` lists the point charges as `(center, Z)` pairs; the whole list
    /// is handed to the primitive kernel for every primitive pair.
    #[must_use]
    pub fn nuclear(&self, charges: &[([f64; 3], f64)]) -> Vec<f64> {
        let shells = self.shells();
        let offsets = self.offsets();
        let dim = self.nao();
        let mut v_mat = vec![0.0; dim * dim];
        for (ia, bra) in shells.iter().enumerate() {
            for (ib, ket) in shells.iter().enumerate() {
                let cart = contract_pair(bra, ket, |pa, pb, weight, acc| {
                    os::nuclear_into(pa, pb, charges, weight, acc);
                });
                let func = to_func_1e(cart, bra, ket);
                place_block(&mut v_mat, dim, offsets[ia], offsets[ib], &func, ket.n_func());
            }
        }
        v_mat
    }

    /// Dipole matrices `[D_x, D_y, D_z]` along the Cartesian axes, with
    /// `D_k = ⟨μ| (r−O)_k |ν⟩` taken about the origin `o`.
    ///
    /// Each matrix is dense row-major `nao × nao`. The three components of a
    /// shell pair are accumulated together, one primitive pair at a time.
    #[must_use]
    pub fn dipole(&self, o: [f64; 3]) -> [Vec<f64>; 3] {
        let shells = self.shells();
        let offsets = self.offsets();
        let dim = self.nao();
        let mut dx = vec![0.0; dim * dim];
        let mut dy = vec![0.0; dim * dim];
        let mut dz = vec![0.0; dim * dim];
        for (ia, bra) in shells.iter().enumerate() {
            for (ib, ket) in shells.iter().enumerate() {
                let cart_len = bra.n_cart() * ket.n_cart();
                let mut cx = vec![0.0; cart_len];
                let mut cy = vec![0.0; cart_len];
                let mut cz = vec![0.0; cart_len];
                for pa in 0..bra.n_prim() {
                    let coeff_a = bra.primitive_coeff(pa);
                    for pb in 0..ket.n_prim() {
                        let weight = coeff_a * ket.primitive_coeff(pb);
                        os::dipole_into(
                            bra.prim(pa),
                            ket.prim(pb),
                            o,
                            weight,
                            &mut cx,
                            &mut cy,
                            &mut cz,
                        );
                    }
                }
                // Same destination window for all three components.
                let (row_off, col_off, n_cols) = (offsets[ia], offsets[ib], ket.n_func());
                for (dst, cart) in [(&mut dx, cx), (&mut dy, cy), (&mut dz, cz)] {
                    let func = to_func_1e(cart, bra, ket);
                    place_block(dst, dim, row_off, col_off, &func, n_cols);
                }
            }
        }
        [dx, dy, dz]
    }

    /// Contracted ERI block for the four shells `(i, j, k, l)` in chemists'
    /// notation `(ij|kl) = ∫∫ φ_i(1)φ_j(1) r₁₂⁻¹ φ_k(2)φ_l(2) d1 d2`, evaluated
    /// with [`Engine::Auto`] (see [`Basis::eri_block_with`]).
    ///
    /// The returned block is in function space and **row-major over the four
    /// per-shell component indices** `(a, b, c, d)` of shells `(i, j, k, l)`:
    ///
    /// ```text
    ///   block[((a · n_j + b) · n_k + c) · n_l + d]
    /// ```
    ///
    /// with `n_x = self.shells()[x].n_func()` (`n_cart` for a Cartesian shell,
    /// `2l+1` for a spherical one). Components follow the Cartesian order of
    /// `integral_math::am` for Cartesian shells and the
    /// `integral_math::solid_harmonics::m_order` order for spherical shells.
    /// The fastest-varying index is `d`, the slowest is `a`, and the block
    /// length is `n_i · n_j · n_k · n_l`.
    ///
    /// # Panics
    ///
    /// The shell list is indexed with each of `i`, `j`, `k`, `l`, so an
    /// out-of-range shell index panics.
    #[must_use]
    pub fn eri_block(&self, i: usize, j: usize, k: usize, l: usize) -> Vec<f64> {
        self.eri_block_with(Engine::Auto, i, j, k, l)
    }

    /// Same block as [`Basis::eri_block`], evaluated with the given [`Engine`]
    /// ([`Engine::Auto`] applies the dispatch policy). Both engines produce the
    /// same block to tolerance; forcing one lets tests/CI exercise each path on
    /// the same quartets.
    ///
    /// # Panics
    ///
    /// The shell list is indexed with each of `i`, `j`, `k`, `l`, so an
    /// out-of-range shell index panics.
    #[must_use]
    pub fn eri_block_with(
        &self,
        engine: Engine,
        i: usize,
        j: usize,
        k: usize,
        l: usize,
    ) -> Vec<f64> {
        let shells = self.shells();
        let [sa, sb, sc, sd] = [i, j, k, l].map(|idx| &shells[idx]);
        let cart = contract_quartet(engine, sa, sb, sc, sd);
        // Spherical shells shrink to their `2l+1` components here; Cartesian
        // shells are left as they are, so the dims end up as per-shell `n_func`.
        to_func_eri(cart, sa, sb, sc, sd)
    }

    /// Dense electron-repulsion tensor `(ij|kl)` over the whole basis, in
    /// chemists' notation. Shells declared [`crate::ShellKind::Spherical`]
    /// contribute their `2l+1` spherical components; Cartesian shells their
    /// `n_cart`.
    ///
    /// Shape `[nao, nao, nao, nao]` flattened **row-major**:
    ///
    /// ```text
    ///   eri[((i · nao + j) · nao + k) · nao + l] = (ij|kl)
    /// ```
    ///
    /// where `nao = self.nao()` and `i, j, k, l` are global AO indices (shell
    /// blocks placed at the offsets from `offsets()`). The tensor obeys the
    /// 8-fold permutational symmetry `(ij|kl) = (ji|kl) = (ij|lk) = (kl|ij) = …`.
    ///
    /// This is the unscreened build: every one of the `n_shells⁴` shell
    /// quartets is evaluated, with the engine picked per quartet by
    /// [`Engine::Auto`]. See [`Basis::eri_with`] to force an engine and
    /// [`Basis::eri_screened`] for the Schwarz-screened variant.
    #[must_use]
    pub fn eri(&self) -> Vec<f64> {
        self.eri_with(Engine::Auto)
    }

    /// Same tensor as [`Basis::eri`], evaluated with the given [`Engine`]
    /// ([`Engine::Auto`] applies the dispatch policy). Both engines produce the
    /// same tensor to tolerance.
    #[must_use]
    pub fn eri_with(&self, engine: Engine) -> Vec<f64> {
        let shells = self.shells();
        let offs = self.offsets();
        let nao = self.nao();
        // Per-shell data is hoisted out of the `n_shells⁴` quartet loop.
        // Effective coefficients need `cart_norm` (a `powf`) per primitive, and
        // building a `c2s` transform runs the Racah-coefficient/normalization
        // machinery; redoing the latter per quartet used to dominate the
        // spherical driver (~35% of a cc-pVDZ build).
        let eff: Vec<Vec<f64>> = shells.iter().map(effective_coeffs).collect();
        let c2s: Vec<Option<Vec<f64>>> = shells.iter().map(shell_transform).collect();
        let mut tensor = vec![0.0; nao * nao * nao * nao];
        for (ia, a) in shells.iter().enumerate() {
            for (ib, b) in shells.iter().enumerate() {
                for (ic, c) in shells.iter().enumerate() {
                    for (id, d) in shells.iter().enumerate() {
                        let cart = contract_quartet_cached(
                            engine, a, &eff[ia], b, &eff[ib], c, &eff[ic], d, &eff[id],
                        );
                        let transforms = [
                            c2s[ia].as_deref(),
                            c2s[ib].as_deref(),
                            c2s[ic].as_deref(),
                            c2s[id].as_deref(),
                        ];
                        let func = to_func_eri_cached(cart, [a, b, c, d], transforms);
                        let origin = [offs[ia], offs[ib], offs[ic], offs[id]];
                        let dims = [a.n_func(), b.n_func(), c.n_func(), d.n_func()];
                        place_eri_block(&mut tensor, nao, origin, dims, &func);
                    }
                }
            }
        }
        tensor
    }

    /// Cauchy–Schwarz shell-pair bound matrix `Q` (Häser–Ahlrichs 1989), row-major
    /// `n_shells × n_shells`, evaluated with [`Engine::Auto`] (see
    /// [`Basis::schwarz_bounds_with`]):
    ///
    /// ```text
    ///   Q[i, j] = sqrt( max_{μ∈i, ν∈j} (μν|μν) ).
    /// ```
    ///
    /// `Q[i, j]` is stored at flat index `i · n_shells + j` of the returned
    /// vector.
    ///
    /// Each diagonal self-repulsion `(μν|μν) ≥ 0` is read from the `(ij|ij)` shell
    /// block, so `Q` bounds every ERI by `|(μν|λσ)| ≤ Q[i,j]·Q[k,l]` for `μν` in
    /// shell pair `(i,j)` and `λσ` in `(k,l)`. Kind-aware: spherical shells use
    /// their `2l+1` components, so `Q` bounds the spherical integrals directly.
    #[must_use]
    pub fn schwarz_bounds(&self) -> Vec<f64> {
        self.schwarz_bounds_with(Engine::Auto)
    }

    /// Same matrix as [`Basis::schwarz_bounds`], with the `(ij|ij)` diagonal
    /// blocks evaluated by the given [`Engine`]. The bound is engine-independent
    /// to tolerance.
    ///
    /// The result is row-major `n_shells × n_shells`; each entry is the square
    /// root of the largest absolute diagonal element of its shell-pair block.
    #[must_use]
    pub fn schwarz_bounds_with(&self, engine: Engine) -> Vec<f64> {
        let shells = self.shells();
        let n_shells = shells.len();
        let mut bounds = vec![0.0; n_shells * n_shells];
        for (i, bra) in shells.iter().enumerate() {
            let ni = bra.n_func();
            for (j, ket) in shells.iter().enumerate() {
                let nj = ket.n_func();
                let block = self.eri_block_with(engine, i, j, i, j);
                // Walk the diagonal elements (μν|μν) of the (ij|ij) block.
                let largest = (0..ni)
                    .flat_map(|mu| (0..nj).map(move |nu| ((mu * nj + nu) * ni + mu) * nj + nu))
                    .map(|idx| block[idx].abs())
                    .fold(0.0_f64, f64::max);
                bounds[i * n_shells + j] = largest.sqrt();
            }
        }
        bounds
    }

    /// Schwarz-screened dense ERI tensor: identical to [`Basis::eri`] except a
    /// shell quartet `(ij|kl)` is **skipped** (left zero) when its Cauchy–Schwarz
    /// bound `Q[i,j]·Q[k,l] < τ` (`tau`). Because every element of a skipped block
    /// satisfies `|(μν|λσ)| ≤ Q[i,j]·Q[k,l] < τ`, screening introduces **no error
    /// above `τ`**. Returns the tensor and [`ScreeningStats`].
    ///
    /// `tau` is the screening threshold; smaller `τ` retains more quartets (more
    /// accurate, slower). The test is a strict `<`, so `tau ≤ 0` skips nothing.
    /// A typical production value is `1e-10`–`1e-12`.
    ///
    /// Uses [`Engine::Auto`]; see [`Basis::eri_screened_with`] to force an engine.
    #[must_use]
    pub fn eri_screened(&self, tau: f64) -> (Vec<f64>, ScreeningStats) {
        self.eri_screened_with(Engine::Auto, tau)
    }

    /// Same as [`Basis::eri_screened`], with both the Schwarz bounds and the
    /// retained quartets evaluated by the given [`Engine`].
    ///
    /// Returns the dense row-major `nao⁴` tensor (skipped quartets stay zero)
    /// together with the [`ScreeningStats`] of the run.
    #[must_use]
    pub fn eri_screened_with(&self, engine: Engine, tau: f64) -> (Vec<f64>, ScreeningStats) {
        let shells = self.shells();
        let offs = self.offsets();
        let nao = self.nao();
        let nsh = shells.len();
        let q = self.schwarz_bounds_with(engine);
        // Per-shell coefficients and `c2s` transforms, built once up front.
        let eff: Vec<Vec<f64>> = shells.iter().map(effective_coeffs).collect();
        let c2s: Vec<Option<Vec<f64>>> = shells.iter().map(shell_transform).collect();
        let mut tensor = vec![0.0; nao * nao * nao * nao];
        let mut stats = ScreeningStats {
            shell_quartets_total: 0,
            shell_quartets_skipped: 0,
            tau,
        };
        for (ia, a) in shells.iter().enumerate() {
            for (ib, b) in shells.iter().enumerate() {
                let q_bra = q[ia * nsh + ib];
                for (ic, c) in shells.iter().enumerate() {
                    for (id, d) in shells.iter().enumerate() {
                        stats.shell_quartets_total += 1;
                        // Schwarz bound below threshold: leave the block zero.
                        if q_bra * q[ic * nsh + id] < tau {
                            stats.shell_quartets_skipped += 1;
                            continue;
                        }
                        let cart = contract_quartet_cached(
                            engine, a, &eff[ia], b, &eff[ib], c, &eff[ic], d, &eff[id],
                        );
                        let transforms = [
                            c2s[ia].as_deref(),
                            c2s[ib].as_deref(),
                            c2s[ic].as_deref(),
                            c2s[id].as_deref(),
                        ];
                        let func = to_func_eri_cached(cart, [a, b, c, d], transforms);
                        let origin = [offs[ia], offs[ib], offs[ic], offs[id]];
                        let dims = [a.n_func(), b.n_func(), c.n_func(), d.n_func()];
                        place_eri_block(&mut tensor, nao, origin, dims, &func);
                    }
                }
            }
        }
        (tensor, stats)
    }
}

/// Summary of one Schwarz-screened ERI build ([`Basis::eri_screened`]).
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct ScreeningStats {
    /// Number of shell quartets considered (`n_shells⁴`).
    pub shell_quartets_total: usize,
    /// Number of those quartets skipped because of the Schwarz bound.
    pub shell_quartets_skipped: usize,
    /// Screening threshold `τ` the build was run with.
    pub tau: f64,
}

impl ScreeningStats {
    /// Share of shell quartets that were skipped, in `[0, 1]`.
    ///
    /// Reports `0.0` when no quartet was considered, rather than dividing by
    /// zero.
    #[must_use]
    pub fn skipped_fraction(&self) -> f64 {
        match self.shell_quartets_total {
            0 => 0.0,
            total => self.shell_quartets_skipped as f64 / total as f64,
        }
    }
}

/// Build the contracted `na·nb·nc·nd` Cartesian block (row-major over the four
/// component indices) for one shell quartet with the requested engine;
/// [`Engine::Auto`] is resolved through [`select_engine`].
///
/// The four shells' effective coefficients are computed here, on the spot. The
/// dense `O(nao⁴)` builders compute them once per shell and call
/// [`contract_quartet_cached`] directly instead.
fn contract_quartet(engine: Engine, sa: &Shell, sb: &Shell, sc: &Shell, sd: &Shell) -> Vec<f64> {
    let ea = effective_coeffs(sa);
    let eb = effective_coeffs(sb);
    let ec = effective_coeffs(sc);
    let ed = effective_coeffs(sd);
    contract_quartet_cached(engine, sa, &ea, sb, &eb, sc, &ec, sd, &ed)
}

/// Same result as [`contract_quartet`], but each shell's effective coefficients
/// (`ea`..`ed`, in primitive order) come from the caller, e.g. precomputed once
/// per shell for a dense build. That avoids the per-quartet `cart_norm` work and
/// allocation.
#[allow(clippy::too_many_arguments)]
fn contract_quartet_cached(
    engine: Engine,
    sa: &Shell,
    ea: &[f64],
    sb: &Shell,
    eb: &[f64],
    sc: &Shell,
    ec: &[f64],
    sd: &Shell,
    ed: &[f64],
) -> Vec<f64> {
    // Only `Auto` needs the quartet's angular momentum and contraction degree.
    let resolved = if engine == Engine::Auto {
        let l_total = sa.l() + sb.l() + sc.l() + sd.l();
        let degree = sa.n_prim() * sb.n_prim() * sc.n_prim() * sd.n_prim();
        select_engine(l_total, degree)
    } else {
        engine
    };
    match resolved {
        Engine::OsHgp => contract_quartet_oshgp(sa, ea, sb, eb, sc, ec, sd, ed),
        // `Auto` has been resolved above; should it ever reach here it is
        // treated as Rys.
        Engine::Rys | Engine::Auto => contract_quartet_rys(sa, ea, sb, eb, sc, ec, sd, ed),
    }
}

/// Collect a shell's effective contraction coefficients `d_i · N(α_i, l)`, one
/// per primitive and in primitive order. Both engines multiply these into the
/// contracted block.
fn effective_coeffs(s: &Shell) -> Vec<f64> {
    let n_prim = s.n_prim();
    let mut coeffs = Vec::with_capacity(n_prim);
    coeffs.extend((0..n_prim).map(|p| s.primitive_coeff(p)));
    coeffs
}

/// Rys path: run the Coulomb engine once per primitive quartet, accumulating
/// into a zeroed `na·nb·nc·nd` block.
///
/// Each call is scaled by the product of the four caller-supplied effective
/// coefficients (`e*[p] = d_p · N(α_p, l)`).
#[allow(clippy::too_many_arguments)]
fn contract_quartet_rys(
    sa: &Shell,
    ea: &[f64],
    sb: &Shell,
    eb: &[f64],
    sc: &Shell,
    ec: &[f64],
    sd: &Shell,
    ed: &[f64],
) -> Vec<f64> {
    let mut block = vec![0.0; sa.n_cart() * sb.n_cart() * sc.n_cart() * sd.n_cart()];
    for (ia, &wa) in ea.iter().enumerate() {
        for (ib, &wb) in eb.iter().enumerate() {
            // Partial products keep the left-to-right multiplication order.
            let w_ab = wa * wb;
            for (ic, &wc) in ec.iter().enumerate() {
                let w_abc = w_ab * wc;
                for (id, &wd) in ed.iter().enumerate() {
                    rys::coulomb_into(
                        sa.prim(ia),
                        sb.prim(ib),
                        sc.prim(ic),
                        sd.prim(id),
                        w_abc * wd,
                        &mut block,
                    );
                }
            }
        }
    }
    block
}

/// OS/HGP path: a single early-contraction call covering the whole shell
/// quartet.
///
/// Each shell is described to the engine by its center, angular momentum and
/// exponents, paired with the caller-supplied effective coefficients.
#[allow(clippy::too_many_arguments)]
fn contract_quartet_oshgp(
    sa: &Shell,
    ea: &[f64],
    sb: &Shell,
    eb: &[f64],
    sc: &Shell,
    ec: &[f64],
    sd: &Shell,
    ed: &[f64],
) -> Vec<f64> {
    let ref_a = ShellRef {
        center: sa.center(),
        l: sa.l(),
        exps: sa.exponents(),
        coeffs: ea,
    };
    let ref_b = ShellRef {
        center: sb.center(),
        l: sb.l(),
        exps: sb.exponents(),
        coeffs: eb,
    };
    let ref_c = ShellRef {
        center: sc.center(),
        l: sc.l(),
        exps: sc.exponents(),
        coeffs: ec,
    };
    let ref_d = ShellRef {
        center: sd.center(),
        l: sd.l(),
        exps: sd.exponents(),
        coeffs: ed,
    };
    let n_out = sa.n_cart() * sb.n_cart() * sc.n_cart() * sd.n_cart();
    let mut block = vec![0.0; n_out];
    os_eri::coulomb_shell_into(ref_a, ref_b, ref_c, ref_d, &mut block);
    block
}

/// Write a row-major quartet `block` into the dense row-major `nao⁴` tensor.
///
/// `n = [na, nb, nc, nd]` are the block's component dims and
/// `off = [oa, ob, oc, od]` the AO offsets of its first element along each axis.
fn place_eri_block(out: &mut [f64], nao: usize, off: [usize; 4], n: [usize; 4], block: &[f64]) {
    let [na, nb, nc, nd] = n;
    let [oa, ob, oc, od] = off;
    for a in 0..na {
        let base_a = (oa + a) * nao;
        for b in 0..nb {
            let base_ab = (base_a + ob + b) * nao;
            for c in 0..nc {
                // First destination / source element of this run over `d`.
                let dst_base = (base_ab + oc + c) * nao + od;
                let src_base = ((a * nb + b) * nc + c) * nd;
                for d in 0..nd {
                    out[dst_base + d] = block[src_base + d];
                }
            }
        }
    }
}