#ifndef CEED_CUDA_REF_OPERATOR_ASSEMBLE_H
#define CEED_CUDA_REF_OPERATOR_ASSEMBLE_H
#include <ceed.h>
// Index type for all stride/offset arithmetic below.
// USE_CEEDSIZE is defined by the JIT host when total array sizes may exceed
// the range of CeedInt; CeedSize is the wider type (see ceed.h).
#if USE_CEEDSIZE
typedef CeedSize IndexType;
#else
typedef CeedInt IndexType;
#endif
extern "C" __launch_bounds__(BLOCK_SIZE) __global__
void linearAssemble(const CeedScalar *B_in, const CeedScalar *B_out, const CeedScalar *__restrict__ qf_array,
CeedScalar *__restrict__ values_array) {
  // Assemble dense per-element operator blocks:
  //   values_array[e][comp_in][comp_out][i][l] =
  //     sum over (e_mode_in, e_mode_out, q) of B_out[e_mode_out][q][i] * qf * B_in[e_mode_in][q][l]
  // Thread layout: threadIdx.x = node index i, threadIdx.y = node index l
  // (both bounded by NUM_NODES via the launch config — TODO confirm at call site),
  // threadIdx.z = element slot within the block; the grid strides over elements.
  const int node_out = threadIdx.x;
  const int node_in  = threadIdx.y;

  // Strides into the assembled values array (innermost: node pair, outermost: element).
  const IndexType comp_out_stride = NUM_NODES * NUM_NODES;
  const IndexType comp_in_stride  = comp_out_stride * NUM_COMP;
  const IndexType e_stride        = comp_in_stride * NUM_COMP;

  // Strides into the Q-function data array (innermost: quadrature point per element).
  const IndexType q_e_stride          = NUM_QPTS;
  const IndexType q_comp_out_stride   = NUM_ELEM * q_e_stride;
  const IndexType q_e_mode_out_stride = q_comp_out_stride * NUM_COMP;
  const IndexType q_comp_in_stride    = q_e_mode_out_stride * NUM_E_MODE_OUT;
  const IndexType q_e_mode_in_stride  = q_comp_in_stride * NUM_COMP;

  const IndexType first_elem = blockIdx.x * blockDim.z + threadIdx.z;
  const IndexType elem_step  = gridDim.x * blockDim.z;
  for (IndexType elem = first_elem; elem < NUM_ELEM; elem += elem_step) {
    for (IndexType comp_in = 0; comp_in < NUM_COMP; comp_in++) {
      for (IndexType comp_out = 0; comp_out < NUM_COMP; comp_out++) {
        const IndexType qf_base = q_comp_in_stride * comp_in + q_comp_out_stride * comp_out + q_e_stride * elem;
        CeedScalar sum = 0.0;
        // Loop nesting (e_mode_in -> e_mode_out -> q) fixes the accumulation order.
        for (IndexType e_mode_in = 0; e_mode_in < NUM_E_MODE_IN; e_mode_in++) {
          const CeedScalar *b_in_col = B_in + e_mode_in * NUM_QPTS * NUM_NODES + node_in;
          for (IndexType e_mode_out = 0; e_mode_out < NUM_E_MODE_OUT; e_mode_out++) {
            const CeedScalar *b_out_col = B_out + e_mode_out * NUM_QPTS * NUM_NODES + node_out;
            const CeedScalar *qf_ptr =
                qf_array + qf_base + q_e_mode_out_stride * e_mode_out + q_e_mode_in_stride * e_mode_in;
            for (IndexType q = 0; q < NUM_QPTS; q++) {
              sum += b_out_col[q * NUM_NODES] * qf_ptr[q] * b_in_col[q * NUM_NODES];
            }
          }
        }
        values_array[comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * elem +
                     NUM_NODES * node_out + node_in] = sum;
      }
    }
  }
}
extern "C" __launch_bounds__(BLOCK_SIZE) __global__
void linearAssembleFallback(const CeedScalar *B_in, const CeedScalar *B_out, const CeedScalar *__restrict__ qf_array,
CeedScalar *__restrict__ values_array) {
  // Fallback variant of linearAssemble for 1-D thread blocks:
  // only threadIdx.x = node index l is parallel; the node index i is looped serially.
  // threadIdx.z = element slot within the block; the grid strides over elements.
  const int node_in = threadIdx.x;

  // Strides into the assembled values array (innermost: node pair, outermost: element).
  const IndexType comp_out_stride = NUM_NODES * NUM_NODES;
  const IndexType comp_in_stride  = comp_out_stride * NUM_COMP;
  const IndexType e_stride        = comp_in_stride * NUM_COMP;

  // Strides into the Q-function data array (innermost: quadrature point per element).
  const IndexType q_e_stride          = NUM_QPTS;
  const IndexType q_comp_out_stride   = NUM_ELEM * q_e_stride;
  const IndexType q_e_mode_out_stride = q_comp_out_stride * NUM_COMP;
  const IndexType q_comp_in_stride    = q_e_mode_out_stride * NUM_E_MODE_OUT;
  const IndexType q_e_mode_in_stride  = q_comp_in_stride * NUM_COMP;

  const IndexType first_elem = blockIdx.x * blockDim.z + threadIdx.z;
  const IndexType elem_step  = gridDim.x * blockDim.z;
  for (IndexType elem = first_elem; elem < NUM_ELEM; elem += elem_step) {
    for (IndexType comp_in = 0; comp_in < NUM_COMP; comp_in++) {
      for (IndexType comp_out = 0; comp_out < NUM_COMP; comp_out++) {
        const IndexType qf_base = q_comp_in_stride * comp_in + q_comp_out_stride * comp_out + q_e_stride * elem;
        for (IndexType node_out = 0; node_out < NUM_NODES; node_out++) {
          CeedScalar sum = 0.0;
          // Loop nesting (e_mode_in -> e_mode_out -> q) fixes the accumulation order.
          for (IndexType e_mode_in = 0; e_mode_in < NUM_E_MODE_IN; e_mode_in++) {
            const CeedScalar *b_in_col = B_in + e_mode_in * NUM_QPTS * NUM_NODES + node_in;
            for (IndexType e_mode_out = 0; e_mode_out < NUM_E_MODE_OUT; e_mode_out++) {
              const CeedScalar *b_out_col = B_out + e_mode_out * NUM_QPTS * NUM_NODES + node_out;
              const CeedScalar *qf_ptr =
                  qf_array + qf_base + q_e_mode_out_stride * e_mode_out + q_e_mode_in_stride * e_mode_in;
              for (IndexType q = 0; q < NUM_QPTS; q++) {
                sum += b_out_col[q * NUM_NODES] * qf_ptr[q] * b_in_col[q * NUM_NODES];
              }
            }
          }
          values_array[comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * elem +
                       NUM_NODES * node_out + node_in] = sum;
        }
      }
    }
  }
}
#endif