#ifndef PAIR_GPU_ATOM_H
#define PAIR_GPU_ATOM_H
#include <cmath>
#include "mpi.h"
#if defined(USE_OPENCL)
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
#include "geryon/ocl_kernel.h"
using namespace ucl_opencl;
#elif defined(USE_CUDART)
#include "geryon/nvc_timer.h"
#include "geryon/nvc_mat.h"
#include "geryon/nvc_kernel.h"
using namespace ucl_cudart;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
#include "geryon/nvd_kernel.h"
using namespace ucl_cudadr;
#endif
#ifdef USE_CUDPP
#include "cudpp.h"
#endif
#include "lal_precision.h"
namespace LAMMPS_AL {
/// Per-atom data container shared between the host and an accelerator device.
/** Owns the position, charge, quaternion and velocity vectors (x, q, quat, v),
  * the timers used to measure their transfers, and the flags recording which
  * of those vectors have already been staged for the current step.
  *   - numtyp is the element type of the per-atom vectors.
  *   - acctyp is not referenced anywhere in this header.
  **/
template <class numtyp, class acctyp>
class Atom {
public:
/// Constructor; defined outside this header.
Atom();
/// Releases everything through clear().
~Atom() { clear(); }
/// Atom count above which resize() triggers a reallocation.
inline int max_atoms() const {
  return _max_atoms;
}

/// Number of atoms currently tracked; the cast_* routines copy this many.
inline int nall() const {
  return _nall;
}

/// Overwrite the tracked atom count without touching any storage.
inline void nall(const int n) {
  _nall=n;
}
/// Defined outside this header.
/** NOTE(review): presumably the device bytes needed per atom - confirm
  * against the implementation. **/
int bytes_per_atom() const;
/// Set up the container for a device; defined outside this header.
/** \param nall     atom count (presumably the initial allocation size - confirm)
  * \param charge   presumably enables the q vector - confirm
  * \param rot      presumably enables the quat vector - confirm
  * \param dev      device to use
  * \param gpu_nbor neighbor-build mode selector (meaning not visible here)
  * \param bonds    bond-data flag (meaning not visible here)
  * \param vel      presumably enables the v vector - confirm
  * \return presumably true on success - confirm **/
bool init(const int nall, const bool charge, const bool rot,
UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false,
const bool vel=false);
/// Record a new atom count and grow the storage when it no longer fits.
/** \param nall    new number of atoms; always stored, even without a regrow
  * \param success in/out flag: left false if already false (no allocation is
  *                attempted in that case), otherwise set to the result of
  *                alloc(nall)
  * \return the resized flag, which stays true from an earlier regrow until
  *         data_unavail() clears it **/
inline bool resize(const int nall, bool &success) {
  _nall=nall;
  const bool must_grow=(_max_atoms<nall);
  if (must_grow) {
    clear_resize();
    // Skip the allocation entirely once an earlier step has failed.
    if (success)
      success=alloc(nall);
    _resized=true;
  }
  return _resized;
}
/// Defined outside this header.
/** NOTE(review): presumably enables additional per-atom fields (charge,
  * quaternion, bonds, velocity) after init() - confirm against the
  * implementation. Parameters mirror those of init(). **/
bool add_fields(const bool charge, const bool rot, const int gpu_nbor,
const bool bonds, const bool vel=false);
/// True when the charge field is enabled (gates the q timer).
bool charge() {
  return _charge;
}

/// True when the rotation field is enabled (gates the quat timer).
bool quaternion() {
  return _rot;
}

/// True when the velocity field is enabled (gates the v timer).
bool velocity() {
  return _vel;
}
/// Defined outside this header; resize() calls it right before alloc().
void clear_resize();
/// Defined outside this header; the destructor calls it.
void clear();
/// Defined outside this header.
/** NOTE(review): presumably host memory in bytes - confirm. **/
double host_memory_usage() const;
/// Defined outside this header.
/** NOTE(review): presumably sorts the cell/particle id arrays used for
  * neighbor builds - confirm. **/
void sort_neighbor(const int num_atoms);
/// Fold the latest measurement of each active transfer timer into its total.
/** The position timer is always accumulated; the charge, quaternion and
  * velocity timers only when the matching field is enabled. **/
inline void acc_timers() {
  time_pos.add_to_total();
  if (_charge) { time_q.add_to_total(); }
  if (_rot) { time_quat.add_to_total(); }
  if (_vel) { time_vel.add_to_total(); }
}
/// Call zero() on each active transfer timer.
/** The position timer is always zeroed; the charge, quaternion and velocity
  * timers only when the matching field is enabled. **/
inline void zero_timers() {
  time_pos.zero();
  if (_charge) { time_q.zero(); }
  if (_rot) { time_quat.zero(); }
  if (_vel) { time_vel.zero(); }
}
/// Total transfer time accumulated by the active timers.
/** Each timer total that is read is zeroed right after, so a second call does
  * not count it again. _time_transfer is added on top, divided by 1000
  * (NOTE(review): presumably stored in milliseconds - confirm); it is not
  * reset here. **/
inline double transfer_time() {
  double seconds=time_pos.total_seconds();
  time_pos.zero_total();

  if (_charge) {
    seconds+=time_q.total_seconds();
    time_q.zero_total();
  }

  if (_rot) {
    seconds+=time_quat.total_seconds();
    time_quat.zero_total();
  }

  if (_vel) {
    seconds+=time_vel.total_seconds();
    time_vel.zero_total();
  }

  const double extra=_time_transfer/1000.0;
  return seconds+extra;
}
/// Return the accumulated host-side cast time and reset it to zero.
inline double cast_time() {
  const double elapsed=_time_cast;
  _time_cast=0.0;
  return elapsed;
}
template <class dev_typ, class t1>
inline void type_pack1(const int n, const int m_size,
UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
t1 **one) {
int ii=0;
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
buffer[ii]=static_cast<numtyp>(one[i][j]);
ii++;
}
ii+=m_size-n;
}
UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
ucl_copy(dev_v,view,false);
}
template <class dev_typ, class t1, class t2>
inline void type_pack2(const int n, const int m_size,
UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
t1 **one, t2 **two) {
int ii=0;
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
buffer[ii*2]=static_cast<numtyp>(one[i][j]);
buffer[ii*2+1]=static_cast<numtyp>(two[i][j]);
ii++;
}
ii+=m_size-n;
}
UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
ucl_copy(dev_v,view,false);
}
template <class dev_typ, class t1, class t2, class t3>
inline void type_pack4(const int n, const int m_size,
UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
t1 **one, t2 **two, t3 **three) {
int ii=0;
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
buffer[ii*4]=static_cast<numtyp>(one[i][j]);
buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
ii++;
}
ii+=m_size-n;
}
UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
ucl_copy(dev_v,view,false);
}
template <class dev_typ, class t1, class t2, class t3, class t4>
inline void type_pack4(const int n, const int m_size,
UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
t1 **one, t2 **two, t3 **three, t4 **four) {
int ii=0;
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
buffer[ii*4]=static_cast<numtyp>(one[i][j]);
buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
buffer[ii*4+3]=static_cast<numtyp>(four[i][j]);
ii++;
}
ii+=m_size-n;
}
UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
ucl_copy(dev_v,view,false);
}
template <class dev_typ, class t1, class t2>
inline void self_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v,
UCL_H_Vec<numtyp> &buffer, t1 **one, t2 **two) {
for (int i=0; i<n; i++) {
buffer[i*2]=static_cast<numtyp>(one[i][i]);
buffer[i*2+1]=static_cast<numtyp>(two[i][i]);
}
UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),n,*dev);
ucl_copy(dev_v,view,false);
}
/// Mark every per-atom field as stale and clear the resized flag.
/** After this call the cast_* and add_* routines copy their data again. **/
inline void data_unavail() {
  _x_avail=false;
  _q_avail=false;
  _quat_avail=false;
  _v_avail=false;
  _resized=false;
}
/// Stage positions and types in host memory ahead of the device upload.
/** Does nothing when the position data is already flagged as available.
  * Otherwise the elapsed wall-clock time is added to the cast timer.
  *   - Without GPU_CAST: writes x[4*i .. 4*i+3] = (x, y, z, type) for each of
  *     the _nall atoms, converting to numtyp on assignment.
  *   - With GPU_CAST: copies the raw doubles and ints into the host side of
  *     x_cast / type_cast, the vectors add_x_data() uploads and hands to the
  *     cast kernel.
  * \param host_ptr  positions, indexed host_ptr[i][0..2]
  * \param host_type atom types, one per atom **/
inline void cast_x_data(double **host_ptr, const int *host_type) {
  if (_x_avail==false) {
    double t=MPI_Wtime();
    #ifdef GPU_CAST
    // The staging vectors declared by this class are x_cast/type_cast; the
    // previous host_x_cast/host_type_cast names are not declared anywhere in
    // the class, so this branch could not compile.
    // NOTE(review): the bulk copy assumes host_ptr[0] points at _nall*3
    // contiguous doubles - confirm against the caller's allocation.
    memcpy(x_cast.host.begin(),host_ptr[0],_nall*3*sizeof(double));
    memcpy(type_cast.host.begin(),host_type,_nall*sizeof(int));
    #else
    int wl=0;
    for (int i=0; i<_nall; i++) {
      x[wl]=host_ptr[i][0];
      x[wl+1]=host_ptr[i][1];
      x[wl+2]=host_ptr[i][2];
      // The type is stored in the fourth component, converted to numtyp.
      x[wl+3]=host_type[i];
      wl+=4;
    }
    #endif
    _time_cast+=MPI_Wtime()-t;
  }
}
/// Upload the staged position data to the device, timed by time_pos.
/** Skips the upload when the data is already flagged as available; the timer
  * is started and stopped either way. Afterwards the data is flagged as
  * available.
  *   - Without GPU_CAST: uploads _nall*4 elements of x.
  *   - With GPU_CAST: uploads x_cast (_nall*3) and type_cast (_nall), then
  *     launches k_cast_x over ceil(_nall/64) blocks of 64 to fill x.
  * Neither parameter is read by this routine. **/
inline void add_x_data(double **host_ptr, int *host_type) {
  time_pos.start();
  if (!_x_avail) {
    #ifdef GPU_CAST
    x_cast.update_device(_nall*3,true);
    type_cast.update_device(_nall,true);
    int block_size=64;
    // Enough blocks to cover every atom, rounding up.
    const double blocks_needed=ceil(static_cast<double>(_nall)/block_size);
    int GX=static_cast<int>(blocks_needed);
    k_cast_x.set_size(GX,block_size);
    k_cast_x.run(&x, &x_cast, &type_cast, &_nall);
    #else
    x.update_device(_nall*4,true);
    #endif
    _x_avail=true;
  }
  time_pos.stop();
}
/// Stage positions and types on the host, then upload them.
/** Convenience wrapper: cast_x_data() followed by add_x_data(), both with the
  * same arguments. **/
inline void cast_copy_x(double **host_ptr, int *host_type) {
  cast_x_data(host_ptr, host_type);
  add_x_data(host_ptr, host_type);
}
/// Stage per-atom charges in host memory ahead of the device upload.
/** Does nothing when the charge data is already flagged as available.
  * Otherwise one of three paths is taken and the elapsed wall-clock time is
  * added to the cast timer:
  *   - host-view mode: q's host and device sides are made to view host_ptr
  *     directly (no copy). NOTE(review): this reinterprets host_ptr as
  *     numtyp*, so it presumably requires cpytyp and numtyp to match -
  *     confirm where _host_view is set.
  *   - both numtyp and cpytyp are double-sized: bulk memcpy of _nall elements.
  *   - otherwise: element-wise converting copy.
  * \param host_ptr charges, one per atom **/
template<class cpytyp>
inline void cast_q_data(cpytyp *host_ptr) {
  if (_q_avail==false) {
    double t=MPI_Wtime();
    if (_host_view) {
      q.host.view((numtyp*)host_ptr,_nall,*dev);
      q.device.view(q.host);
    } else if (sizeof(numtyp)==sizeof(double) &&
               sizeof(cpytyp)==sizeof(double))
      // A byte-wise copy is only valid when source and destination elements
      // have the same width; checking numtyp alone would read past the end
      // of a narrower source (e.g. float) and misinterpret its contents.
      memcpy(q.host.begin(),host_ptr,_nall*sizeof(numtyp));
    else
      for (int i=0; i<_nall; i++) q[i]=host_ptr[i];
    _time_cast+=MPI_Wtime()-t;
  }
}
/// Upload the staged charges to the device, timed by time_q.
/** Skips the upload when the data is already flagged as available; the timer
  * is started and stopped either way. Uploads _nall elements of q and then
  * flags the data as available. **/
inline void add_q_data() {
  time_q.start();
  if (!_q_avail) {
    q.update_device(_nall, true);
    _q_avail = true;
  }
  time_q.stop();
}
/// Stage per-atom quaternions in host memory ahead of the device upload.
/** Does nothing when the quaternion data is already flagged as available.
  * Otherwise one of three paths is taken and the elapsed wall-clock time is
  * added to the cast timer:
  *   - host-view mode: quat's host and device sides are made to view
  *     host_ptr directly (no copy). NOTE(review): this reinterprets host_ptr
  *     as numtyp*, so it presumably requires cpytyp and numtyp to match -
  *     confirm where _host_view is set.
  *   - both numtyp and cpytyp are double-sized: bulk memcpy of _nall*4
  *     elements.
  *   - otherwise: element-wise converting copy.
  * \param host_ptr quaternion components, 4 consecutive values per atom **/
template<class cpytyp>
inline void cast_quat_data(cpytyp *host_ptr) {
  if (_quat_avail==false) {
    double t=MPI_Wtime();
    if (_host_view) {
      quat.host.view((numtyp*)host_ptr,_nall*4,*dev);
      quat.device.view(quat.host);
    } else if (sizeof(numtyp)==sizeof(double) &&
               sizeof(cpytyp)==sizeof(double))
      // A byte-wise copy is only valid when source and destination elements
      // have the same width; checking numtyp alone would read past the end
      // of a narrower source (e.g. float) and misinterpret its contents.
      memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp));
    else
      for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i];
    _time_cast+=MPI_Wtime()-t;
  }
}
/// Upload the staged quaternions to the device, timed by time_quat.
/** Skips the upload when the data is already flagged as available; the timer
  * is started and stopped either way. Uploads _nall*4 elements of quat and
  * then flags the data as available. **/
inline void add_quat_data() {
  time_quat.start();
  if (!_quat_avail) {
    quat.update_device(_nall*4, true);
    _quat_avail = true;
  }
  time_quat.stop();
}
/// Stage velocities and tags in host memory ahead of the device upload.
/** Does nothing when the velocity data is already flagged as available.
  * Otherwise the elapsed wall-clock time is added to the cast timer.
  *   - Without GPU_CAST: writes v[4*i .. 4*i+3] = (vx, vy, vz, tag) for each
  *     of the _nall atoms, converting to numtyp on assignment.
  *   - With GPU_CAST: bulk-copies the raw values into staging buffers.
  * \param host_ptr velocities, indexed host_ptr[i][0..2]
  * \param host_tag atom tags, one per atom **/
inline void cast_v_data(double **host_ptr, const tagint *host_tag) {
  if (!_v_avail) {
    const double start=MPI_Wtime();
    #ifdef GPU_CAST
    // NOTE(review): host_v_cast and host_tag_cast are not declared in this
    // class, and the tag copy sizes its elements as int although host_tag is
    // tagint - this branch needs attention before GPU_CAST builds use it.
    memcpy(host_v_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
    memcpy(host_tag_cast.begin(),host_tag,_nall*sizeof(int));
    #else
    for (int i=0; i<_nall; i++) {
      const int base=4*i;
      v[base]=host_ptr[i][0];
      v[base+1]=host_ptr[i][1];
      v[base+2]=host_ptr[i][2];
      // The tag is stored in the fourth component, converted to numtyp.
      v[base+3]=host_tag[i];
    }
    #endif
    _time_cast+=MPI_Wtime()-start;
  }
}
/// Upload the staged velocity data to the device, timed by time_vel.
/** Skips the upload when the data is already flagged as available; the timer
  * is started and stopped either way. Afterwards the data is flagged as
  * available.
  *   - Without GPU_CAST: uploads _nall*4 elements of v.
  *   - With GPU_CAST: uploads v_cast (_nall*3) and tag_cast (_nall), then
  *     launches k_cast_x (the same kernel used for positions) over
  *     ceil(_nall/64) blocks of 64 to fill v.
  * Neither parameter is read by this routine. **/
inline void add_v_data(double **host_ptr, tagint *host_tag) {
  time_vel.start();
  if (!_v_avail) {
    #ifdef GPU_CAST
    // NOTE(review): v_cast and tag_cast are not declared among the members
    // visible in this header - confirm before enabling GPU_CAST.
    v_cast.update_device(_nall*3,true);
    tag_cast.update_device(_nall,true);
    int block_size=64;
    // Enough blocks to cover every atom, rounding up.
    const double blocks_needed=ceil(static_cast<double>(_nall)/block_size);
    int GX=static_cast<int>(blocks_needed);
    k_cast_x.set_size(GX,block_size);
    k_cast_x.run(&v, &v_cast, &tag_cast, &_nall);
    #else
    v.update_device(_nall*4,true);
    #endif
    _v_avail=true;
  }
  time_vel.stop();
}
/// Stage velocities and tags on the host, then upload them.
/** Convenience wrapper: cast_v_data() followed by add_v_data(), both with the
  * same arguments. **/
inline void cast_copy_v(double **host_ptr, tagint *host_tag) {
  cast_v_data(host_ptr, host_tag);
  add_v_data(host_ptr, host_tag);
}
/// Add externally measured time to the cast total reported by cast_time().
inline void add_cast_time(double t) {
  _time_cast += t;
}

/// Add externally measured time to the extra transfer total.
/** transfer_time() divides this total by 1000 before reporting it
  * (NOTE(review): so t is presumably in milliseconds - confirm). **/
inline void add_transfer_time(double t) {
  _time_transfer += t;
}
/// Return the recorded _max_gpu_bytes value and reset it to zero.
inline double max_gpu_bytes() {
  const double recorded=_max_gpu_bytes;
  _max_gpu_bytes=0.0;
  return recorded;
}
/// True when cast_q_data()/cast_quat_data() view the caller's array directly
/// instead of copying it.
inline bool host_view() {
  return _host_view;
}
// Positions and types: 4 values per atom (x, y, z, type), see cast_x_data().
UCL_Vector<numtyp,numtyp> x;
// Charges: 1 value per atom, see cast_q_data().
UCL_Vector<numtyp,numtyp> q;
// Quaternions: 4 values per atom, see cast_quat_data().
UCL_Vector<numtyp,numtyp> quat;
// Velocities and tags: 4 values per atom (vx, vy, vz, tag), see cast_v_data().
UCL_Vector<numtyp,numtyp> v;
#ifdef GPU_CAST
// Raw double positions (3 per atom) uploaded by add_x_data() for the cast kernel.
UCL_Vector<double,double> x_cast;
// Raw int types (1 per atom) uploaded by add_x_data() for the cast kernel.
UCL_Vector<int,int> type_cast;
#endif
// The id/tag arrays below are not referenced in this header.
// NOTE(review): presumably used by the neighbor sort - confirm.
UCL_D_Vec<unsigned> dev_cell_id;
UCL_D_Vec<int> dev_particle_id;
UCL_D_Vec<tagint> dev_tag;
UCL_H_Vec<int> host_cell_id;
UCL_H_Vec<int> host_particle_id;
// Transfer timers for positions, charges, quaternions and velocities.
UCL_Timer time_pos, time_q, time_quat, time_vel;
// Device handle passed to view() when the packing/cast routines build host views.
UCL_Device *dev;
private:
#ifdef GPU_CAST
// Program object; not referenced in this header.
UCL_Program *atom_program;
// Kernel launched by add_x_data()/add_v_data() to fill x / v on the device.
UCL_Kernel k_cast_x;
// Defined outside this header.
void compile_kernels(UCL_Device &dev);
#endif
// Not referenced in this header.
bool _compiled;
// Per-field "already staged" flags, plus the flag returned by resize();
// all five are cleared by data_unavail().
bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized;
// Defined outside this header; resize() calls it after clear_resize().
bool alloc(const int nall);
// _rot/_charge/_vel gate the quat/q/v timers; _allocated, _bonds and _other
// are not referenced in this header.
bool _allocated, _rot, _charge, _bonds, _vel, _other;
// _max_atoms: regrow threshold used by resize(); _nall: current atom count;
// _gpu_nbor: not referenced in this header.
int _max_atoms, _nall, _gpu_nbor;
// When true, cast_q_data()/cast_quat_data() view the caller's array instead of copying.
bool _host_view;
// Accumulated cast time and extra transfer time (the latter is divided by
// 1000 in transfer_time()).
double _time_cast, _time_transfer;
// Returned and reset by max_gpu_bytes(); not written elsewhere in this header.
double _max_gpu_bytes;
#ifdef USE_CUDPP
// CUDPP sort state; not referenced in this header.
CUDPPConfiguration sort_config;
CUDPPHandle sort_plan;
#endif
};
}
#endif