libceed-sys 0.12.0

// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED:  http://github.com/ceed

#include "../../kernel-defines.hpp"

const char *occa_tensor_basis_3d_gpu_source = STRINGIFY_SOURCE(

    typedef CeedScalar * dofArray @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount);
    typedef const CeedScalar *const_dofArray @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount);

    typedef CeedScalar * quadArray @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 3);
    typedef const CeedScalar *const_quadArray @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 3);

    typedef CeedScalar * sharedBufferArray @dim(MAX_PQ, MAX_PQ, BASIS_COMPONENT_COUNT); typedef const CeedScalar *quadToDof @dim(P1D, Q1D);
    typedef CeedScalar * elementWeightArray @dim(Q1D, Q1D, Q1D, elementCount);

    //---[ Utility Methods ]----------------
    inline void add(const CeedScalar *U, CeedScalar *V) {
      for (int q = 0; q < Q1D; q++) {
        V[q] += U[q];
      }
    }

    inline void readDofs(const int element, const int component, const int px, const int py, const_dofArray U, CeedScalar *Up) {
      // Zero out extra entries
      for (int pz = 0; pz < P1D; ++pz) {
        Up[pz] = ((px < P1D) && (py < P1D) ? U(px, py, pz, component, element) : 0.0);
      }
      for (int q = P1D; q < Q1D; ++q) {
        Up[q] = 0.0;
      }
    }

    inline void writeDofs(const int element, const int component, const int px, const int py, const CeedScalar *Vp, dofArray V) {
      if ((px < P1D) && (py < P1D)) {
        for (int pz = 0; pz < P1D; ++pz) {
          V(px, py, pz, component, element) = Vp[pz];
        }
      }
    }

    inline void readQuads(const int elementCount, const int element, const int component, const int qx, const int qy, const int dim,
                          const_quadArray U, CeedScalar *Uq) {
      for (int qz = 0; qz < Q1D; ++qz) {
        Uq[qz] = U(qx, qy, qz, element, component, dim);
      }
    }

    inline void writeQuads(const int elementCount, const int element, const int component, const int qx, const int qy, const int dim,
                           const CeedScalar *Vq, quadArray V) {
      for (int qz = 0; qz < Q1D; ++qz) {
        V(qx, qy, qz, element, component, dim) = Vq[qz];
      }
    }

    inline void contractX(const int qx, const int qy, const int component, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar *Uq,
                          CeedScalar *Vp) {
      for (int pz = 0; pz < P1D; ++pz) {
        sharedBuffer(qx, qy, component) = Uq[pz];
        Vp[pz]                          = 0.0;
        @barrier();
        for (int p = 0; p < P1D; ++p) {
          Vp[pz] += B(p, qx) * sharedBuffer(p, qy, component);
        }
        @barrier();
      }
    }

    inline void contractY(const int qx, const int qy, const int component, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar *Uq,
                          CeedScalar *Vp) {
      for (int pz = 0; pz < P1D; ++pz) {
        sharedBuffer(qx, qy, component) = Uq[pz];
        Vp[pz]                          = 0.0;
        @barrier();
        for (int p = 0; p < P1D; ++p) {
          Vp[pz] += B(p, qy) * sharedBuffer(qx, p, component);
        }
        @barrier();
      }
    }

    inline void contractZ(const int qx, const int qy, quadToDof B, const CeedScalar *Up, CeedScalar *Vq) {
      for (int qz = 0; qz < Q1D; ++qz) {
        Vq[qz] = 0.0;
        for (int p = 0; p < P1D; ++p) {
          Vq[qz] += B(p, qz) * Up[p];
        }
      }
    }

    inline void contractTransposeX(const int px, const int py, const int component, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar *Up,
                                   CeedScalar *Vp) {
      for (int pz = 0; pz < P1D; ++pz) {
        sharedBuffer(px, py, component) = Up[pz];
        Vp[pz]                          = 0.0;
        @barrier();
        if (px < P1D) {
          for (int qx = 0; qx < Q1D; ++qx) {
            Vp[pz] += B(px, qx) * sharedBuffer(qx, py, component);
          }
        }
        @barrier();
      }
    }

    inline void contractTransposeY(const int px, const int py, const int component, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar *Up,
                                   CeedScalar *Vp) {
      for (int pz = 0; pz < P1D; ++pz) {
        sharedBuffer(px, py, component) = Up[pz];
        Vp[pz]                          = 0.0;
        @barrier();
        if (py < P1D) {
          for (int qy = 0; qy < Q1D; ++qy) {
            Vp[pz] += B(py, qy) * sharedBuffer(px, qy, component);
          }
        }
        @barrier();
      }
    }

    inline void contractTransposeZ(const int px, const int py, quadToDof B, const CeedScalar *Uq, CeedScalar *Vq) {
      for (int pz = 0; pz < P1D; ++pz) {
        Vq[pz] = 0.0;
        for (int qz = 0; qz < Q1D; ++qz) {
          Vq[pz] += B(pz, qz) * Uq[qz];
        }
      }
    }

    //---[ Kernels ]------------------------
    @kernel void interp(const CeedInt elementCount, quadToDof B, const CeedScalar *U, CeedScalar *V) {
      for (int element = 0; element < elementCount; ++element; @outer) {
        @shared CeedScalar sharedBuffer[MAX_PQ * MAX_PQ * BASIS_COMPONENT_COUNT];

        for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) {
          for (int qy = 0; qy < Q1D; ++qy; @inner) {
            for (int qx = 0; qx < Q1D; ++qx; @inner) {
              if (element < elementCount) {
                CeedScalar r1[MAX_PQ], r2[MAX_PQ];
                for (int q = 0; q < Q1D; ++q) {
                  r1[q] = 0.0;
                  r2[q] = 0.0;
                }

                if (!TRANSPOSE) {
                  readDofs(element, component, qx, qy, U, r1);
                  contractX(qx, qy, component, sharedBuffer, B, r1, r2);
                  contractY(qx, qy, component, sharedBuffer, B, r2, r1);
                  contractZ(qx, qy, B, r1, r2);
                  writeQuads(elementCount, element, component, qx, qy, 0, r2, V);
                } else {
                  readQuads(elementCount, element, component, qx, qy, 0, U, r1);
                  contractTransposeZ(qx, qy, B, r1, r2);
                  contractTransposeY(qx, qy, component, sharedBuffer, B, r2, r1);
                  contractTransposeX(qx, qy, component, sharedBuffer, B, r1, r2);
                  writeDofs(element, component, qx, qy, r2, V);
                }
              }
            }
          }
        }
      }
    }

    @kernel void grad(const CeedInt elementCount, quadToDof B, quadToDof Bx, const CeedScalar *U, CeedScalar *V) {
      for (int element = 0; element < elementCount; ++element; @outer) {
        @shared CeedScalar sharedBuffer[MAX_PQ * MAX_PQ * BASIS_COMPONENT_COUNT];

        for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) {
          for (int qy = 0; qy < Q1D; ++qy; @inner) {
            for (int qx = 0; qx < Q1D; ++qx; @inner) {
              if (element < elementCount) {
                CeedScalar r1[MAX_PQ], r2[MAX_PQ], r3[MAX_PQ];

                if (!TRANSPOSE) {
                  readDofs(element, component, qx, qy, U, r1);
                  // Dx
                  contractX(qx, qy, component, sharedBuffer, Bx, r1, r2);
                  contractY(qx, qy, component, sharedBuffer, B, r2, r3);
                  contractZ(qx, qy, B, r3, r2);
                  writeQuads(elementCount, element, component, qx, qy, 0, r2, V);
                  // Dy
                  contractX(qx, qy, component, sharedBuffer, B, r1, r2);
                  contractY(qx, qy, component, sharedBuffer, Bx, r2, r3);
                  contractZ(qx, qy, B, r3, r2);
                  writeQuads(elementCount, element, component, qx, qy, 1, r2, V);
                  // Dz
                  contractX(qx, qy, component, sharedBuffer, B, r1, r2);
                  contractY(qx, qy, component, sharedBuffer, B, r2, r3);
                  contractZ(qx, qy, Bx, r3, r2);
                  writeQuads(elementCount, element, component, qx, qy, 2, r2, V);
                } else {
                  // Dx
                  readQuads(elementCount, element, component, qx, qy, 0, U, r1);
                  contractTransposeZ(qx, qy, B, r1, r3);
                  contractTransposeY(qx, qy, component, sharedBuffer, B, r3, r1);
                  contractTransposeX(qx, qy, component, sharedBuffer, Bx, r1, r2);
                  // Dy
                  readQuads(elementCount, element, component, qx, qy, 1, U, r1);
                  contractTransposeZ(qx, qy, B, r1, r3);
                  contractTransposeY(qx, qy, component, sharedBuffer, Bx, r3, r1);
                  contractTransposeX(qx, qy, component, sharedBuffer, B, r1, r3);
                  add(r3, r2);
                  // Dz
                  readQuads(elementCount, element, component, qx, qy, 2, U, r1);
                  contractTransposeZ(qx, qy, Bx, r1, r3);
                  contractTransposeY(qx, qy, component, sharedBuffer, B, r3, r1);
                  contractTransposeX(qx, qy, component, sharedBuffer, B, r1, r3);
                  add(r3, r2);
                  writeDofs(element, component, qx, qy, r2, V);
                }
              }
            }
          }
        }
      }
    }

    @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights1D, elementWeightArray W) {
      for (int element = 0; element < elementCount; ++element; @outer) {
        for (int qz = 0; qz < Q1D; ++qz; @inner) {
          for (int qy = 0; qy < Q1D; ++qy; @inner) {
            for (int qx = 0; qx < Q1D; ++qx) {
              if (element < elementCount) {
                W(qx, qy, qz, element) = qWeights1D[qx] * qWeights1D[qy] * qWeights1D[qz];
              }
            }
          }
        }
      }
    }

);