// libcint 0.2.3 - Docs.rs

//! Implementation of integral at lower API level (crafting, col-major).

use crate::prelude::*;

/// Implementation of integral at lower API level (crafting, col-major).
impl CInt {
    #[doc(hidden)]
    pub fn integral_inplace<F>(
        &self,
        integrator: &dyn Integrator,
        out: &mut [F],
        shls_slice: &[[c_int; 2]],
        cint_opt: Option<&CIntOptimizer>,
        aosym: CIntSymm,
    ) -> Result<(), CIntError>
    where
        F: ComplexFloat + Send + Sync,
    {
        match aosym {
            CIntSymm::S1 => self.integral_s1_inplace(integrator, out, shls_slice, cint_opt),
            CIntSymm::S2ij => self.integral_s2ij_inplace(integrator, out, shls_slice, cint_opt),
            CIntSymm::S2kl => self.integral_s2kl_inplace(integrator, out, shls_slice, cint_opt),
            CIntSymm::S4 => self.integral_s4_inplace(integrator, out, shls_slice, cint_opt),
            CIntSymm::S8 => self.integral_s8_inplace(integrator, out, shls_slice, cint_opt),
        }
    }

    /// Evaluates integrals without AO symmetry (`s1`) and writes them into `out`.
    ///
    /// After validating the float type, the shell ranges (as `CIntSymm::S1`) and
    /// the optional optimizer, every combination of shells over the 2, 3 or 4
    /// centers reported by `integrator.n_center()` is visited in parallel. Each
    /// shell block is computed by `integral_block` into a thread-local buffer
    /// and then copied to its offset in `out` by the matching `copy_f_*_s1`
    /// routine.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `check_float_type`, `check_shls_slice`
    /// or (when `cint_opt` is `Some`) `check_optimizer`.
    ///
    /// # Panics
    ///
    /// Reaches `unreachable!()` if the integrator reports a center count other
    /// than 2, 3 or 4.
    #[doc(hidden)]
    pub fn integral_s1_inplace<F>(
        &self,
        integrator: &dyn Integrator,
        out: &mut [F],
        shls_slice: &[[c_int; 2]],
        cint_opt: Option<&CIntOptimizer>,
    ) -> Result<(), CIntError>
    where
        F: ComplexFloat + Send + Sync,
    {
        /* #region validation and setup */

        self.check_float_type::<F>()?;
        self.check_shls_slice(integrator, shls_slice, CIntSymm::S1)?;
        if let Some(optimizer) = cint_opt {
            self.check_optimizer(integrator, optimizer)?;
        }

        // component count of the integrator; spinor integrals report their own
        let comp_count = match self.cint_type {
            Spinor => integrator.n_spinor_comp(),
            Spheric | Cartesian => integrator.n_comp(),
        };
        let center_count = integrator.n_center();
        // AO extents per axis (component axis excluded)
        let ao_shape = self.cgto_shape_s1(shls_slice);
        // per axis: AO offset of every selected shell, plus one trailing end offset
        let ao_locs = self.cgto_locs(shls_slice);

        // per-thread scratch: integral cache and one shell-block tile
        let cache_len = self.max_cache_size(integrator, shls_slice);
        let tile_len = self.max_buffer_size(integrator, shls_slice);
        let make_scratch = || unsafe {
            let cache = aligned_uninitialized_vec::<f64>(cache_len);
            let tile = aligned_uninitialized_vec::<F>(tile_len);
            (cache, tile)
        };

        // number of shells selected along `axis`
        let shell_count = |axis: usize| (shls_slice[axis][1] - shls_slice[axis][0]) as usize;
        // for the `idx`-th selected shell along `axis`:
        // (shell index in the basis set, AO offset, AO count)
        let locate_shell = |axis: usize, idx: usize| {
            let shl = idx as c_int + shls_slice[axis][0];
            let start = ao_locs[axis][idx];
            (shl, start, ao_locs[axis][idx + 1] - start)
        };

        /* #endregion */

        /* #region parallel evaluation */

        let n_i = shell_count(0);
        let n_j = shell_count(1);

        match center_count {
            2 => {
                let out_shape = [ao_shape[0], ao_shape[1], comp_count];
                let layout = [n_i, n_j].f();
                let shell_tuples = IndexedIterLayout::new(&layout, ColMajor).unwrap();

                shell_tuples.into_par_iter().with_min_len(RAYON_PAR_MIN).for_each_init(
                    make_scratch,
                    |(cache, tile), ([idx_i, idx_j], _)| {
                        let (shl_i, loc_i, len_i) = locate_shell(0, idx_i);
                        let (shl_j, loc_j, len_j) = locate_shell(1, idx_j);

                        // evaluate this shell block into the thread-local tile
                        let shls = [shl_i, shl_j];
                        unsafe { self.integral_block(integrator, tile, &shls, &[], cint_opt, cache) };

                        // scatter the tile to its place in the output
                        let dst = unsafe { cast_mut_slice(&*out) };
                        let offsets = [loc_i, loc_j, 0];
                        let tile_shape = [len_i, len_j, comp_count];
                        copy_f_3d_s1(dst, &offsets, &out_shape, tile, &tile_shape);
                    },
                );
            },
            3 => {
                let out_shape = [ao_shape[0], ao_shape[1], ao_shape[2], comp_count];
                let n_k = shell_count(2);
                let layout = [n_i, n_j, n_k].f();
                let shell_tuples = IndexedIterLayout::new(&layout, ColMajor).unwrap();

                shell_tuples.into_par_iter().with_min_len(RAYON_PAR_MIN).for_each_init(
                    make_scratch,
                    |(cache, tile), ([idx_i, idx_j, idx_k], _)| {
                        let (shl_i, loc_i, len_i) = locate_shell(0, idx_i);
                        let (shl_j, loc_j, len_j) = locate_shell(1, idx_j);
                        let (shl_k, loc_k, len_k) = locate_shell(2, idx_k);

                        // evaluate this shell block into the thread-local tile
                        let shls = [shl_i, shl_j, shl_k];
                        unsafe { self.integral_block(integrator, tile, &shls, &[], cint_opt, cache) };

                        // scatter the tile to its place in the output
                        let dst = unsafe { cast_mut_slice(&*out) };
                        let offsets = [loc_i, loc_j, loc_k, 0];
                        let tile_shape = [len_i, len_j, len_k, comp_count];
                        copy_f_4d_s1(dst, &offsets, &out_shape, tile, &tile_shape);
                    },
                );
            },
            4 => {
                let out_shape = [ao_shape[0], ao_shape[1], ao_shape[2], ao_shape[3], comp_count];
                let n_k = shell_count(2);
                let n_l = shell_count(3);
                let layout = [n_i, n_j, n_k, n_l].f();
                let shell_tuples = IndexedIterLayout::new(&layout, ColMajor).unwrap();

                shell_tuples.into_par_iter().with_min_len(RAYON_PAR_MIN).for_each_init(
                    make_scratch,
                    |(cache, tile), ([idx_i, idx_j, idx_k, idx_l], _)| {
                        let (shl_i, loc_i, len_i) = locate_shell(0, idx_i);
                        let (shl_j, loc_j, len_j) = locate_shell(1, idx_j);
                        let (shl_k, loc_k, len_k) = locate_shell(2, idx_k);
                        let (shl_l, loc_l, len_l) = locate_shell(3, idx_l);

                        // evaluate this shell block into the thread-local tile
                        let shls = [shl_i, shl_j, shl_k, shl_l];
                        unsafe { self.integral_block(integrator, tile, &shls, &[], cint_opt, cache) };

                        // scatter the tile to its place in the output
                        let dst = unsafe { cast_mut_slice(&*out) };
                        let offsets = [loc_i, loc_j, loc_k, loc_l, 0];
                        let tile_shape = [len_i, len_j, len_k, len_l, comp_count];
                        copy_f_5d_s1(dst, &offsets, &out_shape, tile, &tile_shape);
                    },
                );
            },
            _ => unreachable!(),
        }

        /* #endregion */

        Ok(())
    }

    /// Evaluates integrals with `s2ij` symmetry and writes them into `out`.
    ///
    /// The first two shell axes are traversed as a single packed pair index
    /// with `n * (n + 1) / 2` values (`n` being the number of shells in the
    /// first range), decoded into `(idx_i, idx_j)` by `unravel_s2_indices`. Any
    /// further axes (third and fourth center) are traversed in full. Each shell
    /// block is computed by `integral_block` into a thread-local buffer and
    /// then copied into `out` by the matching `copy_f_*_s2ij` routine.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `check_float_type`, `check_shls_slice`
    /// (validated as `CIntSymm::S2ij`) or, when `cint_opt` is `Some`,
    /// `check_optimizer`.
    ///
    /// # Panics
    ///
    /// Reaches `unreachable!()` if the integrator reports a center count other
    /// than 2, 3 or 4.
    #[doc(hidden)]
    #[allow(non_snake_case)]
    pub fn integral_s2ij_inplace<F>(
        &self,
        integrator: &dyn Integrator,
        out: &mut [F],
        shls_slice: &[[c_int; 2]],
        cint_opt: Option<&CIntOptimizer>,
    ) -> Result<(), CIntError>
    where
        F: ComplexFloat + Send + Sync,
    {
        /* #region sanity check and preparation */

        self.check_float_type::<F>()?;
        // Validate for S2ij, not S1: the packed loop below sizes the (i, j) pair
        // count from the first shell range alone and applies the decoded indices
        // to both the I and J axes, so the symmetry-specific check must run.
        // NOTE(review): presumably the S2ij check requires the first two shell
        // ranges to agree — confirm in `check_shls_slice`.
        self.check_shls_slice(integrator, shls_slice, CIntSymm::S2ij)?;
        if let Some(cint_opt) = cint_opt {
            self.check_optimizer(integrator, cint_opt)?;
        }

        // dimensions

        // number of components for intor (spinor integrals report their own count)
        let n_comp = match self.cint_type {
            Spheric | Cartesian => integrator.n_comp(),
            Spinor => integrator.n_spinor_comp(),
        };
        let n_center = integrator.n_center(); // atom center number for intor
        let cgto_shape = self.cgto_shape_s2ij(shls_slice); // AO shape, without intor component
        let cgto_locs = self.cgto_locs(shls_slice); // AO relative locations mapped to shells, 0-indexed

        // cache (thread local)
        let cache_size = self.max_cache_size(integrator, shls_slice);
        let buffer_size = self.max_buffer_size(integrator, shls_slice);
        // One (cache, buffer) pair is created per rayon worker initialization.
        // NOTE(review): both vectors are presumably left uninitialized and are
        // expected to be fully written by `integral_block` before being read.
        let thread_init = || {
            let cache = unsafe { aligned_uninitialized_vec::<f64>(cache_size) };
            let buf = unsafe { aligned_uninitialized_vec::<F>(buffer_size) };
            (cache, buf)
        };

        /* #endregion */

        /* #region parallel integration generation */

        const I: usize = 0; // index of first shell
        const J: usize = 1; // index of second shell
        const K: usize = 2; // index of third shell
        const L: usize = 3; // index of fourth shell

        let nidx_i = (shls_slice[I][1] - shls_slice[I][0]) as usize;
        // number of packed (i, j) shell pairs in the triangle
        let nidx_ij = nidx_i * (nidx_i + 1) / 2;

        match n_center {
            2 => {
                let out_shape = [cgto_shape[0], n_comp];

                let iter_par = (0..nidx_ij).into_par_iter().with_min_len(RAYON_PAR_MIN);

                iter_par.for_each_init(thread_init, |(cache, buf), idx_ij| {
                    // idx refers to the index of shell for iteration
                    // shl refers to the index of shell in the basis set (real shell index)
                    let [idx_i, idx_j] = unravel_s2_indices(idx_ij);

                    let shl_i = idx_i as c_int + shls_slice[I][0];
                    let shl_j = idx_j as c_int + shls_slice[J][0];
                    // cgto (ao basis) location for each shell
                    let cgto_loc_i = cgto_locs[I][idx_i];
                    let cgto_loc_j = cgto_locs[J][idx_j];
                    // number of cgto (ao basis) for each shell
                    let cgto_i = cgto_locs[I][idx_i + 1] - cgto_loc_i;
                    let cgto_j = cgto_locs[J][idx_j + 1] - cgto_loc_j;

                    let shls = [shl_i, shl_j];

                    // call integral function
                    unsafe { self.integral_block(integrator, buf, &shls, &[], cint_opt, cache) };

                    // copy buffer to output slice
                    // NOTE(review): a mutable view is taken from a shared borrow so
                    // workers can write concurrently; this presumably relies on
                    // distinct shell blocks mapping to disjoint parts of `out`.
                    let out = unsafe { cast_mut_slice(&*out) };
                    let out_offsets = [cgto_loc_i, cgto_loc_j, 0];
                    let buf_shape = [cgto_i, cgto_j, n_comp];
                    copy_f_3d_s2ij(out, &out_offsets, &out_shape, buf, &buf_shape);
                });
            },
            3 => {
                let out_shape = [cgto_shape[0], cgto_shape[1], n_comp];

                let nidx_k = (shls_slice[K][1] - shls_slice[K][0]) as usize;
                let iter_layout = [nidx_ij, nidx_k].f();
                let iter_indices = IndexedIterLayout::new(&iter_layout, ColMajor).unwrap();
                let iter_par = iter_indices.into_par_iter().with_min_len(RAYON_PAR_MIN);

                iter_par.for_each_init(thread_init, |(cache, buf), ([idx_ij, idx_k], _)| {
                    let [idx_i, idx_j] = unravel_s2_indices(idx_ij);

                    let shl_i = idx_i as c_int + shls_slice[I][0];
                    let shl_j = idx_j as c_int + shls_slice[J][0];
                    let shl_k = idx_k as c_int + shls_slice[K][0];
                    let cgto_loc_i = cgto_locs[I][idx_i];
                    let cgto_loc_j = cgto_locs[J][idx_j];
                    let cgto_loc_k = cgto_locs[K][idx_k];
                    let cgto_i = cgto_locs[I][idx_i + 1] - cgto_loc_i;
                    let cgto_j = cgto_locs[J][idx_j + 1] - cgto_loc_j;
                    let cgto_k = cgto_locs[K][idx_k + 1] - cgto_loc_k;

                    let shls = [shl_i, shl_j, shl_k];

                    // call integral function
                    unsafe { self.integral_block(integrator, buf, &shls, &[], cint_opt, cache) };

                    // copy buffer to output slice
                    let out = unsafe { cast_mut_slice(&*out) };
                    let out_offsets = [cgto_loc_i, cgto_loc_j, cgto_loc_k, 0];
                    let buf_shape = [cgto_i, cgto_j, cgto_k, n_comp];
                    copy_f_4d_s2ij(out, &out_offsets, &out_shape, buf, &buf_shape);
                });
            },
            4 => {
                let out_shape = [cgto_shape[0], cgto_shape[1], cgto_shape[2], n_comp];

                let nidx_k = (shls_slice[K][1] - shls_slice[K][0]) as usize;
                let nidx_l = (shls_slice[L][1] - shls_slice[L][0]) as usize;
                let iter_layout = [nidx_ij, nidx_k, nidx_l].f();
                let iter_indices = IndexedIterLayout::new(&iter_layout, ColMajor).unwrap();
                let iter_par = iter_indices.into_par_iter().with_min_len(RAYON_PAR_MIN);

                iter_par.for_each_init(thread_init, |(cache, buf), ([idx_ij, idx_k, idx_l], _)| {
                    let [idx_i, idx_j] = unravel_s2_indices(idx_ij);

                    let shl_i = idx_i as c_int + shls_slice[I][0];
                    let shl_j = idx_j as c_int + shls_slice[J][0];
                    let shl_k = idx_k as c_int + shls_slice[K][0];
                    let shl_l = idx_l as c_int + shls_slice[L][0];
                    let cgto_loc_i = cgto_locs[I][idx_i];
                    let cgto_loc_j = cgto_locs[J][idx_j];
                    let cgto_loc_k = cgto_locs[K][idx_k];
                    let cgto_loc_l = cgto_locs[L][idx_l];
                    let cgto_i = cgto_locs[I][idx_i + 1] - cgto_loc_i;
                    let cgto_j = cgto_locs[J][idx_j + 1] - cgto_loc_j;
                    let cgto_k = cgto_locs[K][idx_k + 1] - cgto_loc_k;
                    let cgto_l = cgto_locs[L][idx_l + 1] - cgto_loc_l;

                    let shls = [shl_i, shl_j, shl_k, shl_l];

                    // call integral function
                    unsafe { self.integral_block(integrator, buf, &shls, &[], cint_opt, cache) };

                    // copy buffer to output slice
                    let out = unsafe { cast_mut_slice(&*out) };
                    let out_offsets = [cgto_loc_i, cgto_loc_j, cgto_loc_k, cgto_loc_l, 0];
                    let buf_shape = [cgto_i, cgto_j, cgto_k, cgto_l, n_comp];
                    copy_f_5d_s2ij(out, &out_offsets, &out_shape, buf, &buf_shape);
                });
            },
            _ => unreachable!(),
        }

        /* #endregion */

        Ok(())
    }

    /// Evaluates 4-center integrals with `s2kl` symmetry and writes them into `out`.
    ///
    /// The first two shell axes are traversed in full, while the last two are
    /// traversed as one packed pair index with `n * (n + 1) / 2` values (`n`
    /// being the number of shells in the third range), decoded into
    /// `(idx_k, idx_l)` by `unravel_s2_indices`. Each shell block is computed by
    /// `integral_block` into a thread-local buffer and then copied into `out`
    /// by `copy_f_5d_s2kl`.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `check_float_type`, `check_shls_slice`
    /// (validated as `CIntSymm::S2kl`) or, when `cint_opt` is `Some`,
    /// `check_optimizer`.
    #[doc(hidden)]
    pub fn integral_s2kl_inplace<F>(
        &self,
        integrator: &dyn Integrator,
        out: &mut [F],
        shls_slice: &[[c_int; 2]],
        cint_opt: Option<&CIntOptimizer>,
    ) -> Result<(), CIntError>
    where
        F: ComplexFloat + Send + Sync,
    {
        /* #region validation and setup */

        self.check_float_type::<F>()?;
        self.check_shls_slice(integrator, shls_slice, CIntSymm::S2kl)?;
        if let Some(optimizer) = cint_opt {
            self.check_optimizer(integrator, optimizer)?;
        }

        // component count of the integrator; spinor integrals report their own
        let comp_count = match self.cint_type {
            Spinor => integrator.n_spinor_comp(),
            Spheric | Cartesian => integrator.n_comp(),
        };
        // n_center must be 4 for S2kl symmetry, checked in `check_shls_slice`
        let ao_shape = self.cgto_shape_s2kl(shls_slice);
        // per axis: AO offset of every selected shell, plus one trailing end offset
        let ao_locs = self.cgto_locs(shls_slice);
        let out_shape = [ao_shape[0], ao_shape[1], ao_shape[2], comp_count];

        // per-thread scratch: integral cache and one shell-block tile
        let cache_len = self.max_cache_size(integrator, shls_slice);
        let tile_len = self.max_buffer_size(integrator, shls_slice);
        let make_scratch = || unsafe {
            let cache = aligned_uninitialized_vec::<f64>(cache_len);
            let tile = aligned_uninitialized_vec::<F>(tile_len);
            (cache, tile)
        };

        // number of shells selected along `axis`
        let shell_count = |axis: usize| (shls_slice[axis][1] - shls_slice[axis][0]) as usize;
        // for the `idx`-th selected shell along `axis`:
        // (shell index in the basis set, AO offset, AO count)
        let locate_shell = |axis: usize, idx: usize| {
            let shl = idx as c_int + shls_slice[axis][0];
            let start = ao_locs[axis][idx];
            (shl, start, ao_locs[axis][idx + 1] - start)
        };

        /* #endregion */

        /* #region parallel evaluation */

        let n_i = shell_count(0);
        let n_j = shell_count(1);
        let n_k = shell_count(2);
        // number of packed (k, l) shell pairs in the triangle
        let n_kl = n_k * (n_k + 1) / 2;
        let layout = [n_i, n_j, n_kl].f();
        let shell_tuples = IndexedIterLayout::new(&layout, ColMajor).unwrap();

        shell_tuples.into_par_iter().with_min_len(RAYON_PAR_MIN).for_each_init(
            make_scratch,
            |(cache, tile), ([idx_i, idx_j, idx_kl], _)| {
                let [idx_k, idx_l] = unravel_s2_indices(idx_kl);

                let (shl_i, loc_i, len_i) = locate_shell(0, idx_i);
                let (shl_j, loc_j, len_j) = locate_shell(1, idx_j);
                let (shl_k, loc_k, len_k) = locate_shell(2, idx_k);
                let (shl_l, loc_l, len_l) = locate_shell(3, idx_l);

                // evaluate this shell block into the thread-local tile
                let shls = [shl_i, shl_j, shl_k, shl_l];
                unsafe { self.integral_block(integrator, tile, &shls, &[], cint_opt, cache) };

                // scatter the tile to its place in the output
                let dst = unsafe { cast_mut_slice(&*out) };
                let offsets = [loc_i, loc_j, loc_k, loc_l, 0];
                let tile_shape = [len_i, len_j, len_k, len_l, comp_count];
                copy_f_5d_s2kl(dst, &offsets, &out_shape, tile, &tile_shape);
            },
        );

        /* #endregion */

        Ok(())
    }

    /// Evaluates 4-center integrals with `s4` symmetry and writes them into `out`.
    ///
    /// Both the `(i, j)` and the `(k, l)` shell pairs are traversed as packed
    /// pair indices with `n * (n + 1) / 2` values each (`n` taken from the
    /// first and third shell range respectively), decoded by
    /// `unravel_s2_indices`. Each shell block is computed by `integral_block`
    /// into a thread-local buffer and then copied into `out` by `copy_f_5d_s4`.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `check_float_type`, `check_shls_slice`
    /// (validated as `CIntSymm::S4`) or, when `cint_opt` is `Some`,
    /// `check_optimizer`.
    #[doc(hidden)]
    pub fn integral_s4_inplace<F>(
        &self,
        integrator: &dyn Integrator,
        out: &mut [F],
        shls_slice: &[[c_int; 2]],
        cint_opt: Option<&CIntOptimizer>,
    ) -> Result<(), CIntError>
    where
        F: ComplexFloat + Send + Sync,
    {
        /* #region validation and setup */

        self.check_float_type::<F>()?;
        self.check_shls_slice(integrator, shls_slice, CIntSymm::S4)?;
        if let Some(optimizer) = cint_opt {
            self.check_optimizer(integrator, optimizer)?;
        }

        // component count of the integrator; spinor integrals report their own
        let comp_count = match self.cint_type {
            Spinor => integrator.n_spinor_comp(),
            Spheric | Cartesian => integrator.n_comp(),
        };
        // n_center must be 4 for S4 symmetry, checked in `check_shls_slice`
        let ao_shape = self.cgto_shape_s4(shls_slice);
        // per axis: AO offset of every selected shell, plus one trailing end offset
        let ao_locs = self.cgto_locs(shls_slice);
        let out_shape = [ao_shape[0], ao_shape[1], comp_count];

        // per-thread scratch: integral cache and one shell-block tile
        let cache_len = self.max_cache_size(integrator, shls_slice);
        let tile_len = self.max_buffer_size(integrator, shls_slice);
        let make_scratch = || unsafe {
            let cache = aligned_uninitialized_vec::<f64>(cache_len);
            let tile = aligned_uninitialized_vec::<F>(tile_len);
            (cache, tile)
        };

        // number of shells selected along `axis`
        let shell_count = |axis: usize| (shls_slice[axis][1] - shls_slice[axis][0]) as usize;
        // for the `idx`-th selected shell along `axis`:
        // (shell index in the basis set, AO offset, AO count)
        let locate_shell = |axis: usize, idx: usize| {
            let shl = idx as c_int + shls_slice[axis][0];
            let start = ao_locs[axis][idx];
            (shl, start, ao_locs[axis][idx + 1] - start)
        };

        /* #endregion */

        /* #region parallel evaluation */

        let n_i = shell_count(0);
        // number of packed (i, j) shell pairs in the triangle
        let n_ij = n_i * (n_i + 1) / 2;
        let n_k = shell_count(2);
        // number of packed (k, l) shell pairs in the triangle
        let n_kl = n_k * (n_k + 1) / 2;
        let layout = [n_ij, n_kl].f();
        let shell_tuples = IndexedIterLayout::new(&layout, ColMajor).unwrap();

        shell_tuples.into_par_iter().with_min_len(RAYON_PAR_MIN).for_each_init(
            make_scratch,
            |(cache, tile), ([idx_ij, idx_kl], _)| {
                let [idx_i, idx_j] = unravel_s2_indices(idx_ij);
                let [idx_k, idx_l] = unravel_s2_indices(idx_kl);

                let (shl_i, loc_i, len_i) = locate_shell(0, idx_i);
                let (shl_j, loc_j, len_j) = locate_shell(1, idx_j);
                let (shl_k, loc_k, len_k) = locate_shell(2, idx_k);
                let (shl_l, loc_l, len_l) = locate_shell(3, idx_l);

                // evaluate this shell block into the thread-local tile
                let shls = [shl_i, shl_j, shl_k, shl_l];
                unsafe { self.integral_block(integrator, tile, &shls, &[], cint_opt, cache) };

                // scatter the tile to its place in the output
                let dst = unsafe { cast_mut_slice(&*out) };
                let offsets = [loc_i, loc_j, loc_k, loc_l, 0];
                let tile_shape = [len_i, len_j, len_k, len_l, comp_count];
                copy_f_5d_s4(dst, &offsets, &out_shape, tile, &tile_shape);
            },
        );

        /* #endregion */

        Ok(())
    }

    /// Evaluates 4-center integrals with `s8` symmetry and writes them into `out`.
    ///
    /// The `(i, j)` and `(k, l)` shell pairs are each traversed as a packed pair
    /// index with `n * (n + 1) / 2` values, where `n` is the number of shells in
    /// the first range and is reused for the `(k, l)` pair. Pair combinations
    /// with `idx_l < idx_j` are skipped as redundant. Each remaining shell block
    /// is computed by `integral_block` into a thread-local buffer and then
    /// copied into `out` by `copy_f_5d_s8`.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `check_float_type`, `check_shls_slice`
    /// (validated as `CIntSymm::S8`) or, when `cint_opt` is `Some`,
    /// `check_optimizer`.
    #[doc(hidden)]
    pub fn integral_s8_inplace<F>(
        &self,
        integrator: &dyn Integrator,
        out: &mut [F],
        shls_slice: &[[c_int; 2]],
        cint_opt: Option<&CIntOptimizer>,
    ) -> Result<(), CIntError>
    where
        F: ComplexFloat + Send + Sync,
    {
        /* #region sanity check and preparation */

        self.check_float_type::<F>()?;
        // Validate for S8, not S4: the loop below reuses the (i, j) pair count for
        // (k, l), i.e. it assumes the k/l shell ranges match the i/j ranges, so
        // the symmetry-specific check must run.
        // NOTE(review): presumably the S8 check requires all four shell ranges
        // to agree — confirm in `check_shls_slice`.
        self.check_shls_slice(integrator, shls_slice, CIntSymm::S8)?;
        if let Some(cint_opt) = cint_opt {
            self.check_optimizer(integrator, cint_opt)?;
        }

        // dimensions

        // number of components for intor (spinor integrals report their own count)
        let n_comp = match self.cint_type {
            Spheric | Cartesian => integrator.n_comp(),
            Spinor => integrator.n_spinor_comp(),
        };
        // n_center must be 4 for S8 symmetry, checked in `check_shls_slice`
        let cgto_shape = self.cgto_shape_s8(shls_slice); // AO shape, without intor component
        let cgto_locs = self.cgto_locs(shls_slice); // AO relative locations mapped to shells, 0-indexed
        let out_shape = [cgto_shape[0], n_comp];

        // cache (thread local)
        let cache_size = self.max_cache_size(integrator, shls_slice);
        let buffer_size = self.max_buffer_size(integrator, shls_slice);
        // One (cache, buffer) pair is created per rayon worker initialization.
        // NOTE(review): both vectors are presumably left uninitialized and are
        // expected to be fully written by `integral_block` before being read.
        let thread_init = || {
            let cache = unsafe { aligned_uninitialized_vec::<f64>(cache_size) };
            let buf = unsafe { aligned_uninitialized_vec::<F>(buffer_size) };
            (cache, buf)
        };

        /* #endregion */

        /* #region parallel integration generation */

        const I: usize = 0; // index of first shell
        const J: usize = 1; // index of second shell
        const K: usize = 2; // index of third shell
        const L: usize = 3; // index of fourth shell

        // The iteration space below is the full product of two packed triangles,
        // which is larger than the set of unique S8 shell quartets. The required
        // ordering is
        // - l >= k
        // - l >= j >= i
        // The packed pair indices already provide l >= k and j >= i; the
        // remaining constraint l >= j is enforced by skipping inside the loop.
        let nidx_i = (shls_slice[I][1] - shls_slice[I][0]) as usize;
        let nidx_ij = nidx_i * (nidx_i + 1) / 2;
        // the (k, l) triangle has the same size as the (i, j) triangle
        let nidx_kl = nidx_ij;
        let iter_layout = [nidx_ij, nidx_kl].f();
        let iter_indices = IndexedIterLayout::new(&iter_layout, ColMajor).unwrap();
        let iter_par = iter_indices.into_par_iter().with_min_len(RAYON_PAR_MIN);

        iter_par.for_each_init(thread_init, |(cache, buf), ([idx_ij, idx_kl], _)| {
            // idx refers to the index of shell for iteration
            // shl refers to the index of shell in the basis set (real shell index)
            let [idx_i, idx_j] = unravel_s2_indices(idx_ij);
            let [idx_k, idx_l] = unravel_s2_indices(idx_kl);
            if idx_l < idx_j {
                // skip redundant iteration
                return;
            }

            let shl_i = idx_i as c_int + shls_slice[I][0];
            let shl_j = idx_j as c_int + shls_slice[J][0];
            let shl_k = idx_k as c_int + shls_slice[K][0];
            let shl_l = idx_l as c_int + shls_slice[L][0];
            // cgto (ao basis) location for each shell
            let cgto_loc_i = cgto_locs[I][idx_i];
            let cgto_loc_j = cgto_locs[J][idx_j];
            let cgto_loc_k = cgto_locs[K][idx_k];
            let cgto_loc_l = cgto_locs[L][idx_l];
            // number of cgto (ao basis) for each shell
            let cgto_i = cgto_locs[I][idx_i + 1] - cgto_loc_i;
            let cgto_j = cgto_locs[J][idx_j + 1] - cgto_loc_j;
            let cgto_k = cgto_locs[K][idx_k + 1] - cgto_loc_k;
            let cgto_l = cgto_locs[L][idx_l + 1] - cgto_loc_l;

            let shls = [shl_i, shl_j, shl_k, shl_l];

            // call integral function
            unsafe { self.integral_block(integrator, buf, &shls, &[], cint_opt, cache) };

            // copy buffer to output slice
            // NOTE(review): a mutable view is taken from a shared borrow so workers
            // can write concurrently; this presumably relies on distinct shell
            // blocks mapping to disjoint parts of `out`.
            let out = unsafe { cast_mut_slice(&*out) };
            let out_offsets = [cgto_loc_i, cgto_loc_j, cgto_loc_k, cgto_loc_l, 0];
            let buf_shape = [cgto_i, cgto_j, cgto_k, cgto_l, n_comp];
            copy_f_5d_s8(out, &out_offsets, &out_shape, buf, &buf_shape);
        });

        /* #endregion */

        Ok(())
    }
}