// integral 0.1.2 - Docs.rs

//! One- and two-electron integral builders over a [`Basis`].
//!
//! Each one-electron builder loops over shell pairs, contracts primitives with
//! their effective coefficients (`d_i · N(α_i, l)`), and places the resulting
//! Cartesian block into a dense `nao × nao` matrix (row-major). Dipole returns
//! three such matrices for the `x`, `y`, `z` components about a chosen origin.
//!
//! The two-electron builder ([`Basis::eri`]) produces the dense repulsion
//! tensor by evaluating each canonical shell quartet once (engine-dispatched)
//! and scattering it to all permutation-equivalent slots; see its docs for the
//! layout.

use integral_core::os_eri::{self, ShellRef};
use integral_core::{os, rys};

use crate::shell::{Basis, Shell};
use crate::spherical::{shell_transform, transform_block};

/// Convert a contracted Cartesian one-electron shell-pair block into function
/// space.
///
/// `block` is row-major `na_cart × nb_cart`. Each shell's `c2s` transform (as
/// returned by `shell_transform`; `None` means identity, i.e. a Cartesian
/// shell) is applied along its own axis, giving a `na_func × nb_func` block.
pub(crate) fn to_func_1e(block: Vec<f64>, sa: &Shell, sb: &Shell) -> Vec<f64> {
    let bra_transform = shell_transform(sa);
    let ket_transform = shell_transform(sb);
    let cart_dims = [sa.n_cart(), sb.n_cart()];
    let transforms = [bra_transform.as_deref(), ket_transform.as_deref()];
    transform_block(block, &cart_dims, &transforms)
}

/// Convert a contracted Cartesian ERI quartet block into function space,
/// building each of the four shells' `c2s` transforms on the spot.
///
/// Building a `c2s` matrix costs far more than applying it, so the dense
/// `O(n⁴)` builders precompute the transforms once per shell and go through
/// [`to_func_eri_cached`] directly; this convenience wrapper is for one-off
/// quartets.
pub(crate) fn to_func_eri(
    block: Vec<f64>,
    sa: &Shell,
    sb: &Shell,
    sc: &Shell,
    sd: &Shell,
) -> Vec<f64> {
    let ta = shell_transform(sa);
    let tb = shell_transform(sb);
    let tc = shell_transform(sc);
    let td = shell_transform(sd);
    let shells = [sa, sb, sc, sd];
    let transforms = [ta.as_deref(), tb.as_deref(), tc.as_deref(), td.as_deref()];
    to_func_eri_cached(block, shells, transforms)
}

/// Same transformation as [`to_func_eri`], but the per-shell transforms are
/// provided by the caller instead of being rebuilt here.
///
/// `s` are the four shells of the quartet (used only for their Cartesian
/// dimensions) and `mats[q]` is the transform for shell `q`, with `None`
/// standing for a Cartesian shell (identity). Results are identical to
/// [`to_func_eri`].
pub(crate) fn to_func_eri_cached(
    block: Vec<f64>,
    s: [&Shell; 4],
    mats: [Option<&[f64]>; 4],
) -> Vec<f64> {
    let cart_dims = s.map(|shell| shell.n_cart());
    transform_block(block, &cart_dims, &mats)
}

/// Which two-electron engine evaluates an ERI quartet.
///
/// Correctness is **engine-transparent**: both engines compute the same Coulomb
/// integral to the documented tolerance, so [`Engine::OsHgp`] and [`Engine::Rys`]
/// can be forced (e.g. from tests/CI) to exercise both paths on the same cases.
/// [`Engine::Auto`] applies the dispatch policy ([`select_engine`]) and is the
/// `Default` value of this enum.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum Engine {
    /// Dispatch by `(total angular momentum, contraction degree)`; resolved to
    /// one of the concrete engines per quartet via [`select_engine`].
    #[default]
    Auto,
    /// Force the Obara–Saika / Head-Gordon–Pople engine (low-L, high-contraction).
    OsHgp,
    /// Force the Rys-quadrature engine (general; the high-L fallback).
    Rys,
}

/// Dispatch policy: choose OS/HGP or Rys for one shell quartet.
///
/// Inputs are the quartet's total angular momentum `l_total = la+lb+lc+ld` and
/// its primitive-quartet `contraction_degree`
/// (`n_prim_a·n_prim_b·n_prim_c·n_prim_d`). The result is always
/// [`Engine::OsHgp`] or [`Engine::Rys`], never [`Engine::Auto`].
///
/// HGP's per-primitive VRR is cheaper than Rys's per-primitive roots/weights and
/// its geometry-only HRR is amortised once per shell quartet, so it wins at low L
/// and/or high contraction; Rys's small-footprint 2D recurrences win as L grows
/// and the HGP VRR/HRR tables blow up.
///
/// The crossover thresholds are calibrated from an on-host benchmark measured
/// with the current Rys-roots interpolation and the OS/HGP engine with
/// monomorphized small-class VRR kernels for `(la+lb, lc+ld) ≤ 3`. The crossover
/// sits markedly toward OS/HGP: it wins down to `deg 1` for `l_total ≤ 5`, and
/// once `deg ≥ 16` all the way up to `l_total 16`:
///
/// | `l_total` | OS/HGP when `contraction_degree ≥` |
/// |-----------|-----------------------------------|
/// | 0‥=5      | 1   (always OS)                   |
/// | 6‥=16     | 16  (≈ K ≥ 2 per shell)           |
/// | ≥ 17      | never (Rys)                       |
///
/// Each band is backed by measured medians: `l_total 0–5` — L0/L1 deg-1 OS,
/// (dp|pp)/(ds|dp) deg-1 OS 1.2–1.4× (the dominant lt-5 shapes run the
/// monomorphized VRR); `l_total 6–16` — (dd|pp)/(dd|dp)/(dd|dd) deg-1 Rys
/// 1.4–2.8× (their bra/ket degree 4 exceeds the monomorphization), deg-81 OS
/// 1.7–2.7×; `l_total ≥ 17` — Rys (the high-contraction corner there is
/// unmeasured, so the cap stays). The thresholds are **calibrated to this engine
/// state**; a future change to either engine's constant re-opens the calibration.
#[must_use]
pub fn select_engine(l_total: usize, contraction_degree: usize) -> Engine {
    // Smallest contraction degree from which OS/HGP is preferred in this
    // angular-momentum band; `None` means OS/HGP is never chosen.
    let os_from_degree: Option<usize> = if l_total <= 5 {
        Some(1) // small-class VRR covers these: OS wins down to deg 1
    } else if l_total <= 16 {
        Some(16) // K ≥ 2 per shell; Rys below that
    } else {
        None // l_total ≥ 17: no measured OS win at high degree
    };
    match os_from_degree {
        Some(min_degree) if contraction_degree >= min_degree => Engine::OsHgp,
        _ => Engine::Rys,
    }
}

/// Place a row-major `na × nb` block at `(row_off, col_off)` in a row-major
/// `n × n` matrix.
///
/// * `mat` — destination matrix storage, row length `n`.
/// * `block` — source values; the number of rows is `block.len() / nb`, and a
///   trailing partial row (if `block.len()` is not a multiple of `nb`) is
///   ignored.
/// * `nb` — number of columns in `block`. A zero-width block is a no-op
///   (rather than a division-by-zero panic).
///
/// # Panics
///
/// Panics if a destination row segment falls outside `mat`.
pub(crate) fn place_block(
    mat: &mut [f64],
    n: usize,
    row_off: usize,
    col_off: usize,
    block: &[f64],
    nb: usize,
) {
    // Nothing to copy for a zero-width block; also avoids dividing/chunking by 0.
    if nb == 0 {
        return;
    }
    // Each source row is contiguous in both buffers, so copy it as one slice.
    for (i, row) in block.chunks_exact(nb).enumerate() {
        let start = (row_off + i) * n + col_off;
        mat[start..start + nb].copy_from_slice(row);
    }
}

/// Build the contracted Cartesian block for one shell pair.
///
/// A zeroed `na_cart × nb_cart` buffer is allocated and `prim_op` is invoked
/// once per primitive pair `(pi, pj)` — `pi` outer, `pj` inner — with the two
/// primitives, the product of their `primitive_coeff` values as `scale`, and
/// the buffer, into which it accumulates `scale · ⟨a|·|b⟩`.
fn contract_pair<F>(sa: &Shell, sb: &Shell, mut prim_op: F) -> Vec<f64>
where
    F: FnMut(os::Prim, os::Prim, f64, &mut [f64]),
{
    let mut acc = vec![0.0; sa.n_cart() * sb.n_cart()];
    let prim_pairs = (0..sa.n_prim()).flat_map(|pi| (0..sb.n_prim()).map(move |pj| (pi, pj)));
    for (pi, pj) in prim_pairs {
        let weight = sa.primitive_coeff(pi) * sb.primitive_coeff(pj);
        prim_op(sa.prim(pi), sb.prim(pj), weight, &mut acc);
    }
    acc
}

impl Basis {
    /// Overlap matrix `S_{μν} = ⟨μ|ν⟩`, dense row-major `nao × nao`.
    ///
    /// Every ordered shell pair is contracted with `os::overlap_into`,
    /// converted to function space, and written at the pair's AO offsets.
    #[must_use]
    pub fn overlap(&self) -> Vec<f64> {
        let shells = self.shells();
        let offsets = self.offsets();
        let nao = self.nao();
        let mut s_mat = vec![0.0; nao * nao];
        for (row_shell, bra) in shells.iter().enumerate() {
            let row_off = offsets[row_shell];
            for (col_shell, ket) in shells.iter().enumerate() {
                let cart = contract_pair(bra, ket, os::overlap_into);
                let func = to_func_1e(cart, bra, ket);
                place_block(&mut s_mat, nao, row_off, offsets[col_shell], &func, ket.n_func());
            }
        }
        s_mat
    }

    /// Kinetic-energy matrix `T_{μν} = ⟨μ| -½∇² |ν⟩`, dense row-major
    /// `nao × nao`.
    ///
    /// Built shell pair by shell pair from `os::kinetic_into`, with each block
    /// converted to function space before placement.
    #[must_use]
    pub fn kinetic(&self) -> Vec<f64> {
        let shells = self.shells();
        let offsets = self.offsets();
        let dim = self.nao();
        let mut t_mat = vec![0.0; dim * dim];
        for si in 0..shells.len() {
            for sj in 0..shells.len() {
                let (bra, ket) = (&shells[si], &shells[sj]);
                let cart = contract_pair(bra, ket, os::kinetic_into);
                let func = to_func_1e(cart, bra, ket);
                place_block(&mut t_mat, dim, offsets[si], offsets[sj], &func, ket.n_func());
            }
        }
        t_mat
    }

    /// Nuclear-attraction matrix `V_{μν} = Σ_C ⟨μ| −Z_C/|r−C| |ν⟩`, dense
    /// row-major `nao × nao`.
    ///
    /// `charges` lists the point charges as `(center, Z)` pairs; the whole
    /// list is handed to `os::nuclear_into` for every primitive pair.
    #[must_use]
    pub fn nuclear(&self, charges: &[([f64; 3], f64)]) -> Vec<f64> {
        let shells = self.shells();
        let offsets = self.offsets();
        let nao = self.nao();
        let mut v_mat = vec![0.0; nao * nao];
        for (si, bra) in shells.iter().enumerate() {
            for (sj, ket) in shells.iter().enumerate() {
                let cart = contract_pair(bra, ket, |a, b, scale, out| {
                    os::nuclear_into(a, b, charges, scale, out);
                });
                let func = to_func_1e(cart, bra, ket);
                place_block(&mut v_mat, nao, offsets[si], offsets[sj], &func, ket.n_func());
            }
        }
        v_mat
    }

    /// Cartesian dipole matrices `[D_x, D_y, D_z]`, `D_k = ⟨μ| (r−O)_k |ν⟩`,
    /// about the origin `o`.
    ///
    /// Each returned matrix is dense row-major `nao × nao`. For every shell
    /// pair the three Cartesian component blocks are accumulated together by
    /// `os::dipole_into`, then each is converted to function space and placed
    /// in its own matrix.
    #[must_use]
    pub fn dipole(&self, o: [f64; 3]) -> [Vec<f64>; 3] {
        let nao = self.nao();
        let offsets = self.offsets();
        let shells = self.shells();
        let mut components = [
            vec![0.0; nao * nao],
            vec![0.0; nao * nao],
            vec![0.0; nao * nao],
        ];
        for (si, bra) in shells.iter().enumerate() {
            for (sj, ket) in shells.iter().enumerate() {
                let cart_len = bra.n_cart() * ket.n_cart();
                let mut cart_x = vec![0.0; cart_len];
                let mut cart_y = vec![0.0; cart_len];
                let mut cart_z = vec![0.0; cart_len];
                for pi in 0..bra.n_prim() {
                    for pj in 0..ket.n_prim() {
                        let weight = bra.primitive_coeff(pi) * ket.primitive_coeff(pj);
                        os::dipole_into(
                            bra.prim(pi),
                            ket.prim(pj),
                            o,
                            weight,
                            &mut cart_x,
                            &mut cart_y,
                            &mut cart_z,
                        );
                    }
                }
                let ket_width = ket.n_func();
                // x, y, z in that order, matching the order of `components`.
                for (matrix, cart) in components.iter_mut().zip([cart_x, cart_y, cart_z]) {
                    let func = to_func_1e(cart, bra, ket);
                    place_block(matrix, nao, offsets[si], offsets[sj], &func, ket_width);
                }
            }
        }
        components
    }

    /// Contracted ERI block for the four shells `(i, j, k, l)` in chemists'
    /// notation `(ij|kl) = ∫∫ φ_i(1)φ_j(1) r₁₂⁻¹ φ_k(2)φ_l(2) d1 d2`, evaluated
    /// with the automatic engine dispatch ([`Engine::Auto`]).
    ///
    /// The returned block is **row-major over the four component indices**
    /// `(a, b, c, d)` of shells `(i, j, k, l)`:
    ///
    /// ```text
    ///   block[((a · n_j + b) · n_k + c) · n_l + d]
    /// ```
    ///
    /// where `n_x = self.shells()[x].n_func()` (`n_cart` for a Cartesian shell,
    /// `2l+1` for a spherical one). Components follow the Cartesian order of
    /// `integral_math::am`, or the `integral_math::solid_harmonics::m_order`
    /// spherical order for spherical shells. `d` is the fastest-varying index
    /// and `a` the slowest; the block length is `n_i · n_j · n_k · n_l`.
    #[must_use]
    pub fn eri_block(&self, i: usize, j: usize, k: usize, l: usize) -> Vec<f64> {
        let engine = Engine::Auto;
        self.eri_block_with(engine, i, j, k, l)
    }

    /// Variant of [`Basis::eri_block`] that evaluates the quartet with the
    /// given [`Engine`] ([`Engine::Auto`] keeps the dispatch policy).
    ///
    /// Both engines produce the same block to tolerance; forcing one exists so
    /// tests/CI can exercise each path on the same quartets.
    #[must_use]
    pub fn eri_block_with(
        &self,
        engine: Engine,
        i: usize,
        j: usize,
        k: usize,
        l: usize,
    ) -> Vec<f64> {
        let shells = self.shells();
        let [sa, sb, sc, sd] = [&shells[i], &shells[j], &shells[k], &shells[l]];
        let cartesian = contract_quartet(engine, sa, sb, sc, sd);
        // Spherical shells are reduced to their `2l+1` components here, while
        // Cartesian shells pass through untouched, so the block's dimensions
        // become the per-shell `n_func`.
        to_func_eri(cartesian, sa, sb, sc, sd)
    }

    /// Dense electron-repulsion tensor `(ij|kl)` over the whole basis, in
    /// chemists' notation, evaluated with the automatic engine dispatch.
    /// Shells declared [`crate::ShellKind::Spherical`] contribute their `2l+1`
    /// spherical components; Cartesian shells their `n_cart`.
    ///
    /// The result has shape `[nao, nao, nao, nao]`, flattened **row-major**:
    ///
    /// ```text
    ///   eri[((i · nao + j) · nao + k) · nao + l] = (ij|kl)
    /// ```
    ///
    /// Here `nao = self.nao()` and `i, j, k, l` are global AO indices (shell
    /// blocks sit at the offsets from `offsets()`). The tensor obeys the 8-fold
    /// permutational symmetry `(ij|kl) = (ji|kl) = (ij|lk) = (kl|ij) = …`.
    ///
    /// That symmetry is exploited during the build: only the canonical shell
    /// quartets (`i ≥ j`, `k ≥ l`, pair index `ij ≥ kl`) are evaluated, and each
    /// computed block is scattered to every distinct permutation-equivalent
    /// position. Slots related by a *shell-level* permutation are therefore
    /// bitwise-equal copies of one evaluation; within a block whose bra (or ket)
    /// shells coincide, the usual round-off-level (`~1e-16` relative) asymmetry
    /// of one kernel evaluation remains, exactly as for an unsymmetrized build.
    #[must_use]
    pub fn eri(&self) -> Vec<f64> {
        let engine = Engine::Auto;
        self.eri_with(engine)
    }

    /// Variant of [`Basis::eri`] that evaluates every quartet with the given
    /// [`Engine`] ([`Engine::Auto`] keeps the dispatch policy). Both engines
    /// produce the same tensor to tolerance.
    #[must_use]
    pub fn eri_with(&self, engine: Engine) -> Vec<f64> {
        let shells = self.shells();
        let offsets = self.offsets();
        let nao = self.nao();
        // Per-shell data is hoisted out of the quartet loop. Effective
        // coefficients depend only on the shell, so recomputing them (a
        // `cart_norm` `powf`) for all four shells of every quartet is wasted
        // work. The `c2s` transforms are cached for the same reason: building
        // one runs the Racah-coefficient/normalization machinery, and doing so
        // per quartet dominated the spherical driver (~35% of a cc-pVDZ build).
        let coeffs: Vec<Vec<f64>> = shells.iter().map(effective_coeffs).collect();
        let transforms: Vec<Option<Vec<f64>>> = shells.iter().map(shell_transform).collect();
        let mut tensor = vec![0.0; nao * nao * nao * nao];
        // Canonical s8 enumeration: i ≥ j, k ≥ l, pair index (ij) ≥ (kl). The
        // kernel runs once per *unique* quartet (~8× fewer evaluations than the
        // full shell loop on large bases) and the scatter fills every distinct
        // permutation-equivalent slot.
        for si in 0..shells.len() {
            for sj in 0..=si {
                for sk in 0..=si {
                    // When the ket's leading shell equals the bra's, the ket
                    // pair may not exceed the bra pair; otherwise just k ≥ l.
                    let sl_max = if sk == si { sj } else { sk };
                    for sl in 0..=sl_max {
                        let quartet = [&shells[si], &shells[sj], &shells[sk], &shells[sl]];
                        let cartesian = contract_quartet_cached(
                            engine,
                            quartet[0],
                            &coeffs[si],
                            quartet[1],
                            &coeffs[sj],
                            quartet[2],
                            &coeffs[sk],
                            quartet[3],
                            &coeffs[sl],
                        );
                        let block = to_func_eri_cached(
                            cartesian,
                            quartet,
                            [
                                transforms[si].as_deref(),
                                transforms[sj].as_deref(),
                                transforms[sk].as_deref(),
                                transforms[sl].as_deref(),
                            ],
                        );
                        let dims = [
                            quartet[0].n_func(),
                            quartet[1].n_func(),
                            quartet[2].n_func(),
                            quartet[3].n_func(),
                        ];
                        scatter_eri_block_s8(
                            &mut tensor,
                            nao,
                            [si, sj, sk, sl],
                            &offsets,
                            dims,
                            &block,
                        );
                    }
                }
            }
        }
        tensor
    }

    /// Cauchy–Schwarz shell-pair bound matrix `Q` (Häser–Ahlrichs 1989),
    /// row-major `n_shells × n_shells`, evaluated with the automatic engine
    /// dispatch:
    ///
    /// ```text
    ///   Q[i, j] = sqrt( max_{μ∈i, ν∈j} (μν|μν) ).
    /// ```
    ///
    /// Each diagonal self-repulsion `(μν|μν) ≥ 0` is read from the `(ij|ij)`
    /// shell block, so `Q` bounds every ERI by `|(μν|λσ)| ≤ Q[i,j]·Q[k,l]` for
    /// `μν` in shell pair `(i,j)` and `λσ` in `(k,l)`. The bound is kind-aware:
    /// spherical shells use their `2l+1` components, so `Q` bounds the
    /// spherical integrals directly.
    #[must_use]
    pub fn schwarz_bounds(&self) -> Vec<f64> {
        let engine = Engine::Auto;
        self.schwarz_bounds_with(engine)
    }

    /// Variant of [`Basis::schwarz_bounds`] whose diagonal `(ij|ij)` blocks are
    /// evaluated with the given [`Engine`]. The bound is engine-independent to
    /// tolerance.
    #[must_use]
    pub fn schwarz_bounds_with(&self, engine: Engine) -> Vec<f64> {
        let shells = self.shells();
        let n_shells = shells.len();
        let mut bounds = Vec::with_capacity(n_shells * n_shells);
        // Row-major fill: `i` outer, `j` inner, so entry `i * n_shells + j`
        // is pushed in order.
        for i in 0..n_shells {
            for j in 0..n_shells {
                let ni = shells[i].n_func();
                let nj = shells[j].n_func();
                let block = self.eri_block_with(engine, i, j, i, j);
                // Largest |(μν|μν)| over the block's diagonal elements, i.e.
                // those with component indices (μ, ν, μ, ν).
                let largest = (0..ni)
                    .flat_map(|mu| (0..nj).map(move |nu| (mu, nu)))
                    .map(|(mu, nu)| block[((mu * nj + nu) * ni + mu) * nj + nu].abs())
                    .fold(0.0_f64, f64::max);
                bounds.push(largest.sqrt());
            }
        }
        bounds
    }

    /// Schwarz-screened dense ERI tensor, evaluated with the automatic engine
    /// dispatch.
    ///
    /// The result matches [`Basis::eri`] except that a shell quartet `(ij|kl)`
    /// is **skipped** (left zero) when its Cauchy–Schwarz bound
    /// `Q[i,j]·Q[k,l] < τ` (`tau`). Every element of a skipped block satisfies
    /// `|(μν|λσ)| ≤ Q[i,j]·Q[k,l] < τ`, so screening introduces **no error
    /// above `τ`**. Returns the tensor together with its [`ScreeningStats`].
    ///
    /// `tau` is the screening threshold: a smaller `τ` retains more quartets
    /// (more accurate, slower). A typical production value is `1e-10`–`1e-12`.
    #[must_use]
    pub fn eri_screened(&self, tau: f64) -> (Vec<f64>, ScreeningStats) {
        let engine = Engine::Auto;
        self.eri_screened_with(engine, tau)
    }

    /// Variant of [`Basis::eri_screened`] that uses the given [`Engine`] for
    /// both the Schwarz bounds and the retained quartets.
    #[must_use]
    pub fn eri_screened_with(&self, engine: Engine, tau: f64) -> (Vec<f64>, ScreeningStats) {
        let shells = self.shells();
        let n_shells = shells.len();
        let offsets = self.offsets();
        let nao = self.nao();
        let bounds = self.schwarz_bounds_with(engine);
        let coeffs: Vec<Vec<f64>> = shells.iter().map(effective_coeffs).collect();
        let transforms: Vec<Option<Vec<f64>>> = shells.iter().map(shell_transform).collect();
        let mut tensor = vec![0.0; nao * nao * nao * nao];
        let mut stats = ScreeningStats {
            shell_quartets_total: 0,
            shell_quartets_skipped: 0,
            tau,
        };
        // Canonical s8 enumeration, as in `eri_with`. Skipping a canonical
        // quartet leaves all 8 permutation-equivalent slots zero; since the
        // Schwarz bound is permutation-invariant, the guarantee
        // `|(μν|λσ)| < τ` holds for every one of them.
        for (si, sa) in shells.iter().enumerate() {
            for (sj, sb) in shells.iter().enumerate().take(si + 1) {
                let bra_bound = bounds[si * n_shells + sj];
                for (sk, sc) in shells.iter().enumerate().take(si + 1) {
                    let sl_max = if sk == si { sj } else { sk };
                    for (sl, sd) in shells.iter().enumerate().take(sl_max + 1) {
                        stats.shell_quartets_total += 1;
                        if bra_bound * bounds[sk * n_shells + sl] < tau {
                            stats.shell_quartets_skipped += 1;
                            continue;
                        }
                        let cartesian = contract_quartet_cached(
                            engine,
                            sa,
                            &coeffs[si],
                            sb,
                            &coeffs[sj],
                            sc,
                            &coeffs[sk],
                            sd,
                            &coeffs[sl],
                        );
                        let block = to_func_eri_cached(
                            cartesian,
                            [sa, sb, sc, sd],
                            [
                                transforms[si].as_deref(),
                                transforms[sj].as_deref(),
                                transforms[sk].as_deref(),
                                transforms[sl].as_deref(),
                            ],
                        );
                        scatter_eri_block_s8(
                            &mut tensor,
                            nao,
                            [si, sj, sk, sl],
                            &offsets,
                            [sa.n_func(), sb.n_func(), sc.n_func(), sd.n_func()],
                            &block,
                        );
                    }
                }
            }
        }
        (tensor, stats)
    }
}

/// Outcome of a Schwarz-screened ERI build ([`Basis::eri_screened`]).
///
/// Both counters refer to *canonical* shell quartets, so
/// `shell_quartets_skipped ≤ shell_quartets_total`; see
/// [`ScreeningStats::skipped_fraction`] for their ratio.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct ScreeningStats {
    /// Total *canonical* shell quartets considered (`i ≥ j`, `k ≥ l`, `ij ≥ kl`
    /// — the unique quartets under the 8-fold permutational symmetry).
    pub shell_quartets_total: usize,
    /// Canonical shell quartets skipped by the Schwarz bound.
    pub shell_quartets_skipped: usize,
    /// The screening threshold `τ` used.
    pub tau: f64,
}

impl ScreeningStats {
    /// Share of the considered shell quartets that were skipped, in `[0, 1]`.
    ///
    /// Returns `0.0` when no quartet was considered at all, so the ratio is
    /// never `0 / 0`.
    #[must_use]
    pub fn skipped_fraction(&self) -> f64 {
        match self.shell_quartets_total {
            0 => 0.0,
            total => self.shell_quartets_skipped as f64 / total as f64,
        }
    }
}

/// Contract one shell quartet into a fresh `na·nb·nc·nd` block (row-major over
/// the four Cartesian component indices) with the requested engine;
/// [`Engine::Auto`] is resolved through [`select_engine`].
///
/// The effective coefficients of all four shells are computed here on every
/// call. The dense `O(nao⁴)` builders avoid that by precomputing them once per
/// shell and calling [`contract_quartet_cached`] directly.
fn contract_quartet(engine: Engine, sa: &Shell, sb: &Shell, sc: &Shell, sd: &Shell) -> Vec<f64> {
    let coeffs_a = effective_coeffs(sa);
    let coeffs_b = effective_coeffs(sb);
    let coeffs_c = effective_coeffs(sc);
    let coeffs_d = effective_coeffs(sd);
    contract_quartet_cached(
        engine, sa, &coeffs_a, sb, &coeffs_b, sc, &coeffs_c, sd, &coeffs_d,
    )
}

/// Same contraction as [`contract_quartet`], with each shell's effective
/// coefficients (`ea`..`ed`, in primitive order) supplied by the caller — e.g.
/// precomputed once per shell across a dense build — which avoids the
/// per-quartet `cart_norm`/allocation. Results are identical.
///
/// With [`Engine::Auto`] the engine is chosen by [`select_engine`] from the
/// quartet's summed angular momentum and the product of its primitive counts.
#[allow(clippy::too_many_arguments)]
pub(crate) fn contract_quartet_cached(
    engine: Engine,
    sa: &Shell,
    ea: &[f64],
    sb: &Shell,
    eb: &[f64],
    sc: &Shell,
    ec: &[f64],
    sd: &Shell,
    ed: &[f64],
) -> Vec<f64> {
    let use_os_hgp = match engine {
        Engine::OsHgp => true,
        Engine::Rys => false,
        Engine::Auto => {
            let l_total = sa.l() + sb.l() + sc.l() + sd.l();
            let degree = sa.n_prim() * sb.n_prim() * sc.n_prim() * sd.n_prim();
            // Anything the policy does not resolve to OS/HGP goes to Rys.
            matches!(select_engine(l_total, degree), Engine::OsHgp)
        }
    };
    if use_os_hgp {
        contract_quartet_oshgp(sa, ea, sb, eb, sc, ec, sd, ed)
    } else {
        contract_quartet_rys(sa, ea, sb, eb, sc, ec, sd, ed)
    }
}

/// Effective contraction coefficients `d_i · N(α_i, l)` of a shell, one per
/// primitive in primitive order (the values of `Shell::primitive_coeff`) —
/// the factors both engines multiply into the contracted block.
pub(crate) fn effective_coeffs(s: &Shell) -> Vec<f64> {
    let mut coeffs = Vec::with_capacity(s.n_prim());
    for prim in 0..s.n_prim() {
        coeffs.push(s.primitive_coeff(prim));
    }
    coeffs
}

/// Rys path: run the Coulomb engine (`rys::coulomb_into`) once per primitive
/// quartet, accumulating into a single zero-initialised Cartesian block.
///
/// `ea`..`ed` are the caller-supplied effective coefficients
/// (`e*[p] = d_p · N(α_p, l)`); their product is the scale of each primitive
/// quartet's contribution.
#[allow(clippy::too_many_arguments)]
fn contract_quartet_rys(
    sa: &Shell,
    ea: &[f64],
    sb: &Shell,
    eb: &[f64],
    sc: &Shell,
    ec: &[f64],
    sd: &Shell,
    ed: &[f64],
) -> Vec<f64> {
    let n_cart = sa.n_cart() * sb.n_cart() * sc.n_cart() * sd.n_cart();
    let mut acc = vec![0.0; n_cart];
    for (pa, &ca) in ea.iter().enumerate() {
        for (pb, &cb) in eb.iter().enumerate() {
            // Partial products keep the left-to-right order ((ca·cb)·cc)·cd.
            let c_ab = ca * cb;
            for (pc, &cc) in ec.iter().enumerate() {
                let c_abc = c_ab * cc;
                for (pd, &cd) in ed.iter().enumerate() {
                    rys::coulomb_into(
                        sa.prim(pa),
                        sb.prim(pb),
                        sc.prim(pc),
                        sd.prim(pd),
                        c_abc * cd,
                        &mut acc,
                    );
                }
            }
        }
    }
    acc
}

/// OS/HGP path: a single early-contraction call
/// (`os_eri::coulomb_shell_into`) over the whole shell quartet.
///
/// Each shell is described to the engine by a `ShellRef` carrying its center,
/// angular momentum, exponents, and the caller-supplied effective coefficients
/// (`ea`..`ed`). The result is written into a zero-initialised Cartesian block.
#[allow(clippy::too_many_arguments)]
fn contract_quartet_oshgp(
    sa: &Shell,
    ea: &[f64],
    sb: &Shell,
    eb: &[f64],
    sc: &Shell,
    ec: &[f64],
    sd: &Shell,
    ed: &[f64],
) -> Vec<f64> {
    let ref_a = ShellRef {
        center: sa.center(),
        l: sa.l(),
        exps: sa.exponents(),
        coeffs: ea,
    };
    let ref_b = ShellRef {
        center: sb.center(),
        l: sb.l(),
        exps: sb.exponents(),
        coeffs: eb,
    };
    let ref_c = ShellRef {
        center: sc.center(),
        l: sc.l(),
        exps: sc.exponents(),
        coeffs: ec,
    };
    let ref_d = ShellRef {
        center: sd.center(),
        l: sd.l(),
        exps: sd.exponents(),
        coeffs: ed,
    };
    let n_cart = sa.n_cart() * sb.n_cart() * sc.n_cart() * sd.n_cart();
    let mut acc = vec![0.0; n_cart];
    os_eri::coulomb_shell_into(ref_a, ref_b, ref_c, ref_d, &mut acc);
    acc
}

/// The 8 index permutations of chemists' `(ij|kl)`: bra swap, ket swap, and
/// bra↔ket exchange. `PERMS8[p][q]` is the source axis (`0..4` ⇒ `a,b,c,d`)
/// that lands at output position `q`.
///
/// The per-row labels below name the output slot under that convention, i.e.
/// the shell indices read off in output-position order.
///
/// The single source of truth for the ERI permutational symmetry. The first
/// four rows (`PERMS8[..4]`) are exactly the **bra/ket-internal** permutations
/// (no bra↔ket exchange); the parallel [`crate::EriBuilder`] reuses that half to
/// scatter a canonical bra-pair's writes (see `eri_builder::scatter_4fold`).
pub(crate) const PERMS8: [[usize; 4]; 8] = [
    [0, 1, 2, 3], // (ij|kl)
    [1, 0, 2, 3], // (ji|kl)
    [0, 1, 3, 2], // (ij|lk)
    [1, 0, 3, 2], // (ji|lk)
    [2, 3, 0, 1], // (kl|ij)
    [2, 3, 1, 0], // (kl|ji)
    [3, 2, 0, 1], // (lk|ij)
    [3, 2, 1, 0], // (lk|ji)
];

/// Enumerate the canonical shell pairs `(i, j)` with `i ≥ j` for `nsh` shells —
/// the unique pairs under the bra (and ket) index-swap symmetry — in row-major
/// order (`i` outer, `j` inner). The result has `nsh·(nsh+1)/2` entries.
///
/// This is the same `i ≥ j` enumeration the dense [`Basis::eri`] driver runs in
/// its outer two loops; the parallel [`crate::EriBuilder`] reuses it for both the
/// bra grain and the ket sweep so the canonical-pair definition lives in one place.
pub(crate) fn canonical_shell_pairs(nsh: usize) -> Vec<(usize, usize)> {
    let mut pairs = Vec::with_capacity(nsh * (nsh + 1) / 2);
    pairs.extend((0..nsh).flat_map(|i| (0..=i).map(move |j| (i, j))));
    pairs
}

/// Copy one computed canonical quartet block into every *distinct*
/// permutation-equivalent position of the dense row-major `nao⁴` tensor `out`.
///
/// * `sidx` — the quartet's four shell indices.
/// * `offs` — per-shell AO offsets, indexed by shell index.
/// * `n = [na, nb, nc, nd]` — the block's component dims; `block` is row-major
///   over `(a, b, c, d)`.
///
/// Two permutations in `PERMS8` write the same slot set iff they map the
/// shell-index tuple to the same tuple, so permutations whose permuted tuple
/// was already handled are skipped. Across a canonical build every output slot
/// is thus written exactly once; the identity comes first, so a quartet's own
/// slots always carry its directly computed values.
fn scatter_eri_block_s8(
    out: &mut [f64],
    nao: usize,
    sidx: [usize; 4],
    offs: &[usize],
    n: [usize; 4],
    block: &[f64],
) {
    // Row-major weight of output position q in the tensor: nao^(3−q).
    let position_weight = [nao.pow(3), nao.pow(2), nao, 1];
    let [na, nb, nc, nd] = n;

    let mut handled = [[usize::MAX; 4]; 8];
    let mut n_handled = 0usize;
    for perm in PERMS8.iter() {
        let [p0, p1, p2, p3] = *perm;
        let target = [sidx[p0], sidx[p1], sidx[p2], sidx[p3]];
        if handled[..n_handled].contains(&target) {
            continue;
        }
        handled[n_handled] = target;
        n_handled += 1;

        // Source axis `perm[q]` lands at output position `q`, so it inherits
        // that position's weight and contributes its shell's AO offset there.
        let mut stride = [0usize; 4];
        let mut base = 0usize;
        for (src_axis, weight) in perm.iter().copied().zip(position_weight) {
            stride[src_axis] = weight;
            base += offs[sidx[src_axis]] * weight;
        }

        for a in 0..na {
            for b in 0..nb {
                for c in 0..nc {
                    let src_row = ((a * nb + b) * nc + c) * nd;
                    let dst_row = base + a * stride[0] + b * stride[1] + c * stride[2];
                    for d in 0..nd {
                        out[dst_row + d * stride[3]] = block[src_row + d];
                    }
                }
            }
        }
    }
}