#ifndef CEED_HIP_REF_OPERATOR_ASSEMBLE_H
#define CEED_HIP_REF_OPERATOR_ASSEMBLE_H
#include <ceed.h>
#if CEEDSIZE
typedef CeedSize IndexType;
#else
typedef CeedInt IndexType;
#endif
extern "C" __launch_bounds__(BLOCK_SIZE) __global__
void linearAssemble(const CeedScalar *B_in, const CeedScalar *B_out, const CeedScalar *__restrict__ qf_array,
CeedScalar *__restrict__ values_array) {
const int i = threadIdx.x; const int l = threadIdx.y;
const IndexType comp_out_stride = NNODES * NNODES;
const IndexType comp_in_stride = comp_out_stride * NCOMP;
const IndexType e_stride = comp_in_stride * NCOMP;
const IndexType qe_stride = NQPTS;
const IndexType qcomp_out_stride = NELEM * qe_stride;
const IndexType qemode_out_stride = qcomp_out_stride * NCOMP;
const IndexType qcomp_in_stride = qemode_out_stride * NUMEMODEOUT;
const IndexType qemode_in_stride = qcomp_in_stride * NCOMP;
for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < NELEM; e += gridDim.x * blockDim.z) {
for (IndexType comp_in = 0; comp_in < NCOMP; comp_in++) {
for (IndexType comp_out = 0; comp_out < NCOMP; comp_out++) {
CeedScalar result = 0.0;
IndexType qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e;
for (IndexType emode_in = 0; emode_in < NUMEMODEIN; emode_in++) {
IndexType b_in_index = emode_in * NQPTS * NNODES;
for (IndexType emode_out = 0; emode_out < NUMEMODEOUT; emode_out++) {
IndexType b_out_index = emode_out * NQPTS * NNODES;
IndexType qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in;
for (IndexType j = 0; j < NQPTS; j++) {
result += B_out[b_out_index + j * NNODES + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NNODES + l];
}
} } IndexType val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NNODES * i + l;
values_array[val_index] = result;
} } } }
extern "C" __launch_bounds__(BLOCK_SIZE) __global__
void linearAssembleFallback(const CeedScalar *B_in, const CeedScalar *B_out, const CeedScalar *__restrict__ qf_array,
CeedScalar *__restrict__ values_array) {
const int l = threadIdx.x;
const IndexType comp_out_stride = NNODES * NNODES;
const IndexType comp_in_stride = comp_out_stride * NCOMP;
const IndexType e_stride = comp_in_stride * NCOMP;
const IndexType qe_stride = NQPTS;
const IndexType qcomp_out_stride = NELEM * qe_stride;
const IndexType qemode_out_stride = qcomp_out_stride * NCOMP;
const IndexType qcomp_in_stride = qemode_out_stride * NUMEMODEOUT;
const IndexType qemode_in_stride = qcomp_in_stride * NCOMP;
for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < NELEM; e += gridDim.x * blockDim.z) {
for (IndexType comp_in = 0; comp_in < NCOMP; comp_in++) {
for (IndexType comp_out = 0; comp_out < NCOMP; comp_out++) {
for (IndexType i = 0; i < NNODES; i++) {
CeedScalar result = 0.0;
IndexType qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e;
for (IndexType emode_in = 0; emode_in < NUMEMODEIN; emode_in++) {
IndexType b_in_index = emode_in * NQPTS * NNODES;
for (IndexType emode_out = 0; emode_out < NUMEMODEOUT; emode_out++) {
IndexType b_out_index = emode_out * NQPTS * NNODES;
IndexType qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in;
for (IndexType j = 0; j < NQPTS; j++) {
result += B_out[b_out_index + j * NNODES + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NNODES + l];
}
} } IndexType val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NNODES * i + l;
values_array[val_index] = result;
} } } } }
#endif