lammps-sys 0.6.0

/***************************************************************************
                                ocl_kernel.h
                             -------------------
                               W. Michael Brown

  Utilities for dealing with OpenCL kernels

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Sun Feb 7 2010
    copyright            : (C) 2010 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

#ifndef OCL_KERNEL
#define OCL_KERNEL

#include "ocl_device.h"
#include <fstream>

namespace ucl_opencl {

class UCL_Texture;
template <class numtyp> class UCL_D_Vec;
template <class numtyp> class UCL_D_Mat;
template <class hosttype, class devtype> class UCL_Vector;
template <class hosttype, class devtype> class UCL_Matrix;
#define UCL_MAX_KERNEL_ARGS 256

/// Class storing 1 or more kernel functions from a single string or file
class UCL_Program {
 public:
  inline UCL_Program() : _init_done(false) {}
  inline UCL_Program(UCL_Device &device) : _init_done(false) { init(device); }
  inline UCL_Program(UCL_Device &device, const void *program,
                     const char *flags="", std::string *log=NULL) :
      _init_done(false) {
    init(device);
    load_string(program,flags,log);
  }

  inline ~UCL_Program() { clear(); }

  /// Initialize the program with a device
  inline void init(UCL_Device &device) {
    clear();
    _device=device.cl_device();
    _context=device.context();
    _cq=device.cq();
    CL_SAFE_CALL(clRetainContext(_context));
    CL_SAFE_CALL(clRetainCommandQueue(_cq));
    _init_done=true;
  }

  /// Clear any data associated with program
  /** \note Must call init() after each clear **/
  inline void clear() {
    if (_init_done) {
      CL_DESTRUCT_CALL(clReleaseProgram(_program));
      CL_DESTRUCT_CALL(clReleaseContext(_context));
      CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
      _init_done=false;
    }
  }

  /// Load a program from a file and compile with flags
  inline int load(const char *filename, const char *flags="",
                  std::string *log=NULL) {
    std::ifstream in(filename);
    if (!in || in.is_open()==false) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not open kernel file: "
                << filename << std::endl;
      UCL_GERYON_EXIT;
      #endif
      return UCL_FILE_NOT_FOUND;
    }

    std::string program((std::istreambuf_iterator<char>(in)),
                        std::istreambuf_iterator<char>());
    in.close();
    return load_string(program.c_str(),flags,log);
  }

  /// Load a program from a string and compile with flags
  inline int load_string(const void *program, const char *flags="",
                         std::string *log=NULL) {
    cl_int error_flag;
    const char *prog=(const char *)program;
    _program=clCreateProgramWithSource(_context,1,&prog,NULL,&error_flag);
    CL_CHECK_ERR(error_flag);
    error_flag = clBuildProgram(_program,1,&_device,flags,NULL,NULL);
    if (error_flag!=-11)
      CL_CHECK_ERR(error_flag);
    cl_build_status build_status;
    CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,
                                       CL_PROGRAM_BUILD_STATUS,
                                       sizeof(cl_build_status),&build_status,
                                       NULL));

    if (build_status != CL_SUCCESS || log!=NULL) {
      size_t ms;
      CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,0,
                                         NULL, &ms));
      char *build_log = new char[ms];
      CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,ms,
                                         build_log, NULL));

      if (log!=NULL)
        *log=std::string(build_log);

      if (build_status != CL_SUCCESS) {
        #ifndef UCL_NO_EXIT
        std::cerr << std::endl
                  << "----------------------------------------------------------\n"
                  << " UCL Error: Error compiling OpenCL Program ("
                  << build_status << ") ...\n"
                  << "----------------------------------------------------------\n";
        std::cerr << build_log << std::endl;
        #endif
        delete[] build_log;
        return UCL_COMPILE_ERROR;
      } else delete[] build_log;
    }

    return UCL_SUCCESS;
  }

  /// Return the default command queue/stream associated with this data
  inline command_queue & cq() { return _cq; }
  /// Change the default command queue associated with matrix
  inline void cq(command_queue &cq_in) { _cq=cq_in; }

  friend class UCL_Kernel;
 private:
  bool _init_done;
  cl_program _program;
  cl_device_id _device;
  cl_context _context;
  cl_command_queue _cq;
};

/// Class for dealing with OpenCL kernels
class UCL_Kernel {
 public:
  UCL_Kernel() : _dimensions(1), _function_set(false), _num_args(0)
    {  _block_size[0]=0; _num_blocks[0]=0; }

  inline UCL_Kernel(UCL_Program &program, const char *function) :
    _dimensions(1), _function_set(false), _num_args(0)
    {  _block_size[0]=0; _num_blocks[0]=0; set_function(program,function); }

  inline ~UCL_Kernel() { clear(); }

  /// Clear any function associated with the kernel
  inline void clear() {
    if (_function_set) {
      clReleaseKernel(_kernel);
      clReleaseProgram(_program);
      clReleaseCommandQueue(_cq);
      _function_set=false;
    }
  }

  /// Get the kernel function from a program
  /** \return UCL_ERROR_FLAG (UCL_SUCCESS, UCL_FILE_NOT_FOUND, UCL_ERROR) **/
  inline int set_function(UCL_Program &program, const char *function);

  /// Set the kernel argument.
  /** If not a device pointer, this must be repeated each time the argument
    * changes **/
  template <class dtype>
  inline void set_arg(const cl_uint index, const dtype * const arg) {
    CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg));
    if (index>_num_args) {
      _num_args=index;
      #ifdef UCL_DEBUG
      if (_num_args>_kernel_info_nargs) {
        std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: "
                  << _kernel_info_name << std::endl;
        assert(0==1);
      }
      #endif
    }
  }

  /// Set a geryon container as a kernel argument.
  template <class numtyp>
  inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
    { set_arg(&arg->begin()); }

  /// Set a geryon container as a kernel argument.
  template <class numtyp>
  inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
    { set_arg(&arg->begin()); }

  /// Set a geryon container as a kernel argument.
  template <class hosttype, class devtype>
  inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
    { set_arg(&arg->device.begin()); }

  /// Set a geryon container as a kernel argument.
  template <class hosttype, class devtype>
  inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
    { set_arg(&arg->device.begin()); }

  /// Add a kernel argument.
  template <class dtype>
  inline void add_arg(const dtype * const arg) {
    CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg));
    _num_args++;
    #ifdef UCL_DEBUG
    if (_num_args>_kernel_info_nargs) {
      std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: "
                << _kernel_info_name << std::endl;
      assert(0==1);
    }
    #endif
  }

  /// Add a geryon container as a kernel argument.
  template <class numtyp>
  inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
    { add_arg(&arg->begin()); }

  /// Add a geryon container as a kernel argument.
  template <class numtyp>
  inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
    { add_arg(&arg->begin()); }

  /// Add a geryon container as a kernel argument.
  template <class hosttype, class devtype>
  inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
    { add_arg(&arg->device.begin()); }

  /// Add a geryon container as a kernel argument.
  template <class hosttype, class devtype>
  inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
    { add_arg(&arg->device.begin()); }

  /// Set the number of thread blocks and the number of threads in each block
  /** \note This should be called before any arguments have been added
      \note The default command queue is used for the kernel execution **/
  inline void set_size(const size_t num_blocks, const size_t block_size) {
    _dimensions=1;
    _num_blocks[0]=num_blocks*block_size;
    _block_size[0]=block_size;
  }

  /// Set the number of thread blocks and the number of threads in each block
  /** \note This should be called before any arguments have been added
      \note The default command queue for the kernel is changed to cq **/
  inline void set_size(const size_t num_blocks, const size_t block_size,
                       command_queue &cq)
    { _cq=cq; set_size(num_blocks,block_size); }

  /// Set the number of thread blocks and the number of threads in each block
  /** \note This should be called before any arguments have been added
      \note The default command queue is used for the kernel execution **/
  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                       const size_t block_size_x, const size_t block_size_y) {
    _dimensions=2;
    _num_blocks[0]=num_blocks_x*block_size_x;
    _block_size[0]=block_size_x;
    _num_blocks[1]=num_blocks_y*block_size_y;
    _block_size[1]=block_size_y;
  }

  /// Set the number of thread blocks and the number of threads in each block
  /** \note This should be called before any arguments have been added
      \note The default command queue for the kernel is changed to cq **/
  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                       const size_t block_size_x, const size_t block_size_y,
                       command_queue &cq)
    {_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}

  /// Set the number of thread blocks and the number of threads in each block
  /** \note This should be called before any arguments have been added
      \note The default command queue is used for the kernel execution **/
  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                       const size_t block_size_x,
                       const size_t block_size_y, const size_t block_size_z) {
    _dimensions=3;
    const size_t num_blocks_z=1;
    _num_blocks[0]=num_blocks_x*block_size_x;
    _block_size[0]=block_size_x;
    _num_blocks[1]=num_blocks_y*block_size_y;
    _block_size[1]=block_size_y;
    _num_blocks[2]=num_blocks_z*block_size_z;
    _block_size[2]=block_size_z;
  }

  /// Set the number of thread blocks and the number of threads in each block
  /** \note This should be called before any arguments have been added
      \note The default command queue is used for the kernel execution **/
  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                       const size_t block_size_x, const size_t block_size_y,
                       const size_t block_size_z, command_queue &cq) {
    _cq=cq;
    set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
             block_size_z);
  }

  /// Run the kernel in the default command queue
  inline void run();

  /// Clear any arguments associated with the kernel
  inline void clear_args() { _num_args=0; }

  /// Return the default command queue/stream associated with this data
  inline command_queue & cq() { return _cq; }
  /// Change the default command queue associated with matrix
  inline void cq(command_queue &cq_in) { _cq=cq_in; }
  #include "ucl_arg_kludge.h"

 private:
  cl_kernel _kernel;
  cl_program _program;
  cl_uint _dimensions;
  size_t _block_size[3];
  size_t _num_blocks[3];
  bool _function_set;

  cl_command_queue _cq;        // The default command queue for this kernel
  unsigned _num_args;

  #ifdef UCL_DEBUG
  std::string _kernel_info_name;
  unsigned _kernel_info_nargs;
  //std::string _kernel_info_args[256];
  #endif
};

inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) {
  clear();
  _function_set=true;
  _cq=program._cq;
  CL_SAFE_CALL(clRetainCommandQueue(_cq));
  _program=program._program;
  CL_SAFE_CALL(clRetainProgram(_program));
  cl_int error_flag;
  _kernel=clCreateKernel(program._program,function,&error_flag);

  if (error_flag!=CL_SUCCESS) {
    #ifndef UCL_NO_EXIT
    std::cerr << "UCL Error: Could not find function: " << function
              << " in program.\n";
    UCL_GERYON_EXIT;
    #endif
    return UCL_FUNCTION_NOT_FOUND;
  }

  #ifdef UCL_DEBUG
  _kernel_info_name=function;
  cl_uint nargs;
  CL_SAFE_CALL(clGetKernelInfo(_kernel,CL_KERNEL_NUM_ARGS,sizeof(cl_uint),
                               &nargs,NULL));
  _kernel_info_nargs=nargs;
  #ifdef NOT_TEST_CL_VERSION_1_2
  char tname[256];
  size_t ret;
  for (cl_uint i=0; i<nargs; i++) {
    CL_SAFE_CALL(clGetKernelArgInfo(_kernel,i,CL_KERNEL_ARG_TYPE_NAME,256,
                                    tname,&ret));
    _kernel_info_args[i]=tname;
  }
  #endif
  #endif

  return UCL_SUCCESS;
}

void UCL_Kernel::run() {
  CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,NULL,
                                      _num_blocks,_block_size,0,NULL,NULL));
}

} // namespace

#endif