#ifndef CEED_CUDA_REF_BASIS_NONTENSOR_H
#define CEED_CUDA_REF_BASIS_NONTENSOR_H
#include <ceed.h>
extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_B, const CeedScalar *__restrict__ d_U,
CeedScalar *__restrict__ d_V) {
const CeedInt t_id = threadIdx.x;
const CeedScalar *U;
CeedScalar V;
for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
if (transpose) { U = d_U + elem * BASIS_Q + comp * num_elem * BASIS_Q;
V = 0.0;
for (CeedInt i = 0; i < BASIS_Q; i++) V += d_B[t_id + i * BASIS_P] * U[i];
d_V[elem * BASIS_P + comp * num_elem * BASIS_P + t_id] = V;
} else { U = d_U + elem * BASIS_P + comp * num_elem * BASIS_P;
V = 0.0;
for (CeedInt i = 0; i < BASIS_P; i++) V += d_B[i + t_id * BASIS_P] * U[i];
d_V[elem * BASIS_Q + comp * num_elem * BASIS_Q + t_id] = V;
}
}
}
}
extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_G, const CeedScalar *__restrict__ d_U,
CeedScalar *__restrict__ d_V) {
const CeedInt t_id = threadIdx.x;
const CeedScalar *U;
for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
if (transpose) { CeedScalar V = 0.0;
for (CeedInt dim = 0; dim < BASIS_DIM; dim++) {
U = d_U + elem * BASIS_Q + comp * num_elem * BASIS_Q + dim * BASIS_NUM_COMP * num_elem * BASIS_Q;
for (CeedInt i = 0; i < BASIS_Q; i++) V += d_G[t_id + i * BASIS_P + dim * BASIS_P * BASIS_Q] * U[i];
}
d_V[elem * BASIS_P + comp * num_elem * BASIS_P + t_id] = V;
} else { CeedScalar V[BASIS_DIM];
U = d_U + elem * BASIS_P + comp * num_elem * BASIS_P;
for (CeedInt dim = 0; dim < BASIS_DIM; dim++) V[dim] = 0.0;
for (CeedInt i = 0; i < BASIS_P; i++) {
const CeedScalar val = U[i];
for (CeedInt dim = 0; dim < BASIS_DIM; dim++) V[dim] += d_G[i + t_id * BASIS_P + dim * BASIS_P * BASIS_Q] * val;
}
for (CeedInt dim = 0; dim < BASIS_DIM; dim++) {
d_V[elem * BASIS_Q + comp * num_elem * BASIS_Q + dim * BASIS_NUM_COMP * num_elem * BASIS_Q + t_id] = V[dim];
}
}
}
}
}
extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_V) {
const CeedInt t_id = threadIdx.x;
for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
d_V[elem * BASIS_Q + t_id] = q_weight[t_id];
}
}
#endif