#ifndef NVD_KERNEL
#define NVD_KERNEL
#include "nvd_device.h"
#include <fstream>
namespace ucl_cudadr {
class UCL_Texture;
template <class numtyp> class UCL_D_Vec;
template <class numtyp> class UCL_D_Mat;
template <class hosttype, class devtype> class UCL_Vector;
template <class hosttype, class devtype> class UCL_Matrix;
#define UCL_MAX_KERNEL_ARGS 256
/// Wrapper around a CUDA driver-API module (CUmodule) together with the
/// stream taken from the device it was created for.
class UCL_Program {
 public:
  /// Create an empty program; only the device's command queue is recorded.
  inline UCL_Program(UCL_Device &device) { _cq=device.cq(); }

  /// Create a program and immediately load it from an in-memory image.
  /** \param device  Device whose command queue is stored
    * \param program Program image, or a file name when flags is "BINARY"
    * \param flags   "BINARY" selects load_binary(); anything else is a
    *                request to load the image via load_string()
    * \param log     Optional destination for the JIT info log
    * \note The status returned by load_string() is not reported here. */
  inline UCL_Program(UCL_Device &device, const void *program,
                     const char *flags="", std::string *log=NULL) {
    init(device);
    load_string(program,flags,log);
  }

  inline ~UCL_Program() {}

  /// Record the command queue of the given device.
  inline void init(UCL_Device &device) { _cq=device.cq(); }

  /// No-op in this backend; nothing is released here.
  inline void clear() { }

  /// Read an entire text file and hand its contents to load_string().
  /** \return UCL_FILE_NOT_FOUND if the file cannot be opened, otherwise
    *         whatever load_string() returns. */
  inline int load(const char *filename, const char *flags="",
                  std::string *log=NULL) {
    std::ifstream in(filename);
    if (!in || in.is_open()==false) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not open kernel file: "
                << filename << std::endl;
      UCL_GERYON_EXIT;
      #endif
      return UCL_FILE_NOT_FOUND;
    }

    std::string program((std::istreambuf_iterator<char>(in)),
                        std::istreambuf_iterator<char>());
    in.close();
    return load_string(program.c_str(),flags,log);
  }

  /// Load a program from an in-memory image with cuModuleLoadDataEx().
  /** When flags equals "BINARY", program is interpreted as a file name and
    * the call is forwarded to load_binary().
    * \param log If non-NULL, receives the JIT info log text
    * \return UCL_SUCCESS or UCL_COMPILE_ERROR (or load_binary()'s result) */
  inline int load_string(const void *program, const char *flags="",
                         std::string *log=NULL) {
    // Guard against a NULL flags pointer before building a std::string.
    if (flags!=NULL && std::string(flags)=="BINARY")
      return load_binary((const char *)program);

    const unsigned int num_opts=2;
    const unsigned int log_bytes=10240;
    CUjit_option options[num_opts];
    void *values[num_opts];

    // Start with an empty string so the buffer is well defined even if the
    // driver writes nothing into it.
    char clog[log_bytes];
    clog[0]='\0';

    options[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
    values[0] = (void *)(size_t)log_bytes;
    options[1] = CU_JIT_INFO_LOG_BUFFER;
    values[1] = clog;

    CUresult err=cuModuleLoadDataEx(&_module,program,num_opts,
                                    options,(void **)values);
    // Force termination before treating the buffer as a C string.
    clog[log_bytes-1]='\0';

    if (log!=NULL)
      *log=std::string(clog);

    if (err != CUDA_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << std::endl
        << "----------------------------------------------------------\n"
        << " UCL Error: Error compiling PTX Program...\n"
        << "----------------------------------------------------------\n";
      // Print the log text itself; the caller's log pointer may be NULL.
      std::cerr << clog << std::endl;
      #endif
      return UCL_COMPILE_ERROR;
    }

    return UCL_SUCCESS;
  }

  /// Load a program from a binary module file with cuModuleLoad().
  /** The module handle is stored in this object so that kernels can later
    * be looked up in it.
    * \return UCL_SUCCESS, or UCL_FILE_NOT_FOUND on any failure */
  inline int load_binary(const char *filename) {
    CUresult err = cuModuleLoad(&_module,filename);
    // 301 is the numeric value of CUDA_ERROR_FILE_NOT_FOUND.
    if (err==301) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not open binary kernel file: "
                << filename << std::endl;
      UCL_GERYON_EXIT;
      #endif
      return UCL_FILE_NOT_FOUND;
    } else if (err!=CUDA_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Error loading binary kernel file: "
                << filename << std::endl;
      UCL_GERYON_EXIT;
      #endif
      return UCL_FILE_NOT_FOUND;
    }
    return UCL_SUCCESS;
  }

  friend class UCL_Kernel;

 private:
  CUmodule _module;   // handle filled in by load_string()/load_binary()
  CUstream _cq;       // copied from UCL_Device::cq()
  friend class UCL_Texture;
};
/// Handle to a single kernel function inside a UCL_Program, together with
/// its argument list, launch geometry and the stream it is launched on.
class UCL_Kernel {
 public:
  /// Create a kernel object with no function attached and no arguments.
  UCL_Kernel() : _dimensions(1), _num_args(0) {
    #if CUDA_VERSION < 4000
    _param_size=0;
    #endif
    _num_blocks[0]=0;
  }

  /// Create a kernel object and look up `function` in `program`.
  UCL_Kernel(UCL_Program &program, const char *function) :
    _dimensions(1), _num_args(0) {
    #if CUDA_VERSION < 4000
    _param_size=0;
    #endif
    _num_blocks[0]=0;
    set_function(program,function);
    // set_function() only stores the queue on success; store it regardless.
    _cq=program._cq;
  }

  ~UCL_Kernel() {}

  /// No-op in this backend; nothing is released here.
  inline void clear() { }

  /// Look up a kernel by name in the program's module.
  /** On success the program's command queue becomes this kernel's queue.
    * \return UCL_SUCCESS or UCL_FUNCTION_NOT_FOUND */
  inline int set_function(UCL_Program &program, const char *function) {
    CUresult err=cuModuleGetFunction(&_kernel,program._module,function);
    if (err!=CUDA_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not find function: " << function
                << " in program.\n";
      UCL_GERYON_EXIT;
      #endif
      return UCL_FUNCTION_NOT_FOUND;
    }
    _cq=program._cq;
    return UCL_SUCCESS;
  }

  /// Set the kernel argument at position `index`.
  /** index == current argument count appends via add_arg(); a smaller index
    * overwrites the existing slot; a larger index trips an assertion.
    * Only the pointer is stored for CUDA >= 4000. */
  template <class dtype>
  inline void set_arg(const unsigned index, const dtype * const arg) {
    if (index==_num_args) {
      add_arg(arg);
    } else if (index<_num_args) {
      #if CUDA_VERSION >= 4000
      // The argument table holds void*, so constness must be cast away
      // explicitly (same as add_arg()).
      _kernel_args[index]=const_cast<dtype *>(arg);
      #else
      CU_SAFE_CALL(cuParamSetv(_kernel, _offsets[index], (void*)arg,
                               sizeof(dtype)));
      #endif
    } else {
      assert(0==1);
    }
  }

  // NOTE(review): the four overloads below forward to a one-argument
  // set_arg() taking a pointer to whatever begin() returns. No such overload
  // is visible in this class, so they look uninstantiable unless
  // ucl_arg_kludge.h supplies one -- confirm before relying on them.
  template <class numtyp>
  inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
    { set_arg(&arg->begin()); }

  template <class numtyp>
  inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
    { set_arg(&arg->begin()); }

  template <class hosttype, class devtype>
  inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
    { set_arg(&arg->device.begin()); }

  template <class hosttype, class devtype>
  inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
    { set_arg(&arg->device.begin()); }

  /// Append a device pointer as the next kernel argument.
  /** At most UCL_MAX_KERNEL_ARGS arguments are accepted; exceeding the limit
    * asserts and, when assertions are disabled, the argument is ignored
    * rather than written past the end of the argument table. */
  inline void add_arg(const CUdeviceptr* const arg) {
    // Check capacity BEFORE touching the argument table.
    if (_num_args>=UCL_MAX_KERNEL_ARGS) {
      assert(0==1);
      return;
    }
    #if CUDA_VERSION >= 4000
    _kernel_args[_num_args]=(void *)arg;
    #else
    void* ptr = (void*)(size_t)(*arg);
    // Round the running parameter offset up to the alignment of a pointer.
    _param_size = (_param_size + __alignof(ptr) - 1) & ~(__alignof(ptr) - 1);
    CU_SAFE_CALL(cuParamSetv(_kernel, _param_size, &ptr, sizeof(ptr)));
    _offsets.push_back(_param_size);
    _param_size+=sizeof(ptr);
    #endif
    _num_args++;
  }

  /// Append a value (passed by pointer) as the next kernel argument.
  /** Same capacity rule as the CUdeviceptr overload. For CUDA >= 4000 only
    * the pointer is stored. */
  template <class dtype>
  inline void add_arg(const dtype* const arg) {
    // Check capacity BEFORE touching the argument table.
    if (_num_args>=UCL_MAX_KERNEL_ARGS) {
      assert(0==1);
      return;
    }
    #if CUDA_VERSION >= 4000
    _kernel_args[_num_args]=const_cast<dtype * const>(arg);
    #else
    // Round the running parameter offset up to the alignment of dtype.
    _param_size = (_param_size+__alignof(dtype)-1) & ~(__alignof(dtype)-1);
    CU_SAFE_CALL(cuParamSetv(_kernel,_param_size,(void*)arg,sizeof(dtype)));
    _offsets.push_back(_param_size);
    _param_size+=sizeof(dtype);
    #endif
    _num_args++;
  }

  /// Append the result of a UCL_D_Vec's begin() as the next argument.
  template <class numtyp>
  inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
    { add_arg(&arg->begin()); }

  /// Append the result of a UCL_D_Mat's begin() as the next argument.
  template <class numtyp>
  inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
    { add_arg(&arg->begin()); }

  /// Append the device member's begin() of a UCL_Vector as the next argument.
  template <class hosttype, class devtype>
  inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
    { add_arg(&arg->device.begin()); }

  /// Append the device member's begin() of a UCL_Matrix as the next argument.
  template <class hosttype, class devtype>
  inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
    { add_arg(&arg->device.begin()); }

  /// Set a 1D grid of num_blocks blocks, each with block_size threads.
  inline void set_size(const size_t num_blocks, const size_t block_size) {
    _dimensions=1;
    _num_blocks[0]=num_blocks;
    _num_blocks[1]=1;
    _num_blocks[2]=1;
    #if CUDA_VERSION >= 4000
    _block_size[0]=block_size;
    _block_size[1]=1;
    _block_size[2]=1;
    #else
    CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size,1,1));
    #endif
  }

  /// Same as the 1D set_size(), additionally selecting the command queue.
  inline void set_size(const size_t num_blocks, const size_t block_size,
                       command_queue &cq)
    { _cq=cq; set_size(num_blocks,block_size); }

  /// Set a 2D grid of blocks with 2D thread blocks.
  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                       const size_t block_size_x, const size_t block_size_y) {
    _dimensions=2;
    _num_blocks[0]=num_blocks_x;
    _num_blocks[1]=num_blocks_y;
    _num_blocks[2]=1;
    #if CUDA_VERSION >= 4000
    _block_size[0]=block_size_x;
    _block_size[1]=block_size_y;
    _block_size[2]=1;
    #else
    CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,1));
    #endif
  }

  /// Same as the 2D/2D set_size(), additionally selecting the command queue.
  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                       const size_t block_size_x, const size_t block_size_y,
                       command_queue &cq)
    {_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}

  /// Set a 2D grid of blocks with 3D thread blocks.
  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                       const size_t block_size_x,
                       const size_t block_size_y, const size_t block_size_z) {
    // The grid stays two-dimensional; only the block gains a z extent.
    _dimensions=2;
    _num_blocks[0]=num_blocks_x;
    _num_blocks[1]=num_blocks_y;
    _num_blocks[2]=1;
    #if CUDA_VERSION >= 4000
    _block_size[0]=block_size_x;
    _block_size[1]=block_size_y;
    _block_size[2]=block_size_z;
    #else
    CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,
                                     block_size_z));
    #endif
  }

  /// Same as the 2D/3D set_size(), additionally selecting the command queue.
  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                       const size_t block_size_x, const size_t block_size_y,
                       const size_t block_size_z, command_queue &cq) {
    _cq=cq;
    set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
             block_size_z);
  }

  /// Launch the kernel on the stored command queue with the stored
  /// geometry and arguments.
  inline void run() {
    #if CUDA_VERSION >= 4000
    CU_SAFE_CALL(cuLaunchKernel(_kernel,_num_blocks[0],_num_blocks[1],
                                _num_blocks[2],_block_size[0],_block_size[1],
                                _block_size[2],0,_cq,_kernel_args,NULL));
    #else
    CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
    CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq));
    #endif
  }

  /// Forget all arguments so that add_arg() starts again at position 0.
  inline void clear_args() {
    _num_args=0;
    #if CUDA_VERSION < 4000
    _offsets.clear();
    _param_size=0;
    #endif
  }

  /// Return the command queue used by run().
  inline command_queue & cq() { return _cq; }
  /// Change the command queue used by run().
  inline void cq(command_queue &cq_in) { _cq=cq_in; }

  // Additional member declarations are textually included into the class.
  #include "ucl_arg_kludge.h"

 private:
  CUfunction _kernel;
  CUstream _cq;
  unsigned _dimensions;
  unsigned _num_blocks[3];
  unsigned _num_args;
  friend class UCL_Texture;

  #if CUDA_VERSION >= 4000
  unsigned _block_size[3];
  void * _kernel_args[UCL_MAX_KERNEL_ARGS];
  #else
  std::vector<unsigned> _offsets;
  unsigned _param_size;
  #endif
};
}
#endif