1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
/***************************************************************************
device.h
-------------------
W. Michael Brown (ORNL)
Class for management of the device where the computations are performed
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_DEVICE_H
#define LAL_DEVICE_H
#include "lal_atom.h"
#include "lal_answer.h"
#include "lal_neighbor.h"
#include "lal_pppm.h"
#include "mpi.h"
#include <sstream>
#include <cstdio>
#include <string>
#include <queue>
namespace LAMMPS_AL {
template <class numtyp, class acctyp,
class grdtyp, class grdtyp4> class PPPM;
template <class numtyp, class acctyp>
class Device {
public:
Device();
~Device();
/// Initialize the device for use by this process
/** Sets up a per-device MPI communicator for load balancing and initializes
* the device (>=first_gpu and <=last_gpu) that this proc will be using
* Returns:
* - 0 if successfull
* - -2 if GPU not found
* - -4 if GPU library not compiled for GPU
* - -6 if GPU could not be initialized for use
* - -7 if accelerator sharing is not currently allowed on system
* - -11 if vendor_string has the wrong number of parameters **/
int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads,
const int t_per_atom, const double cell_size,
char *vendor_string, const int block_pair);
/// Initialize the device for Atom storage
/** \param charge True if charges need to be stored
* \param rot True if quaternions need to be stored
* \param nlocal Total number of local particles to allocate memory for
* \param nall Total number of local+ghost particles
* \param maxspecial Maximum mumber of special bonded atoms per atom
* \param vel True if velocities need to be stored
*
* Returns:
* - 0 if successfull
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(Answer<numtyp,acctyp> &ans, const bool charge, const bool rot,
const int nlocal, const int nall, const int maxspecial,
const bool vel=false);
/// Initialize the device for Atom storage only
/** \param nlocal Total number of local particles to allocate memory for
* \param nall Total number of local+ghost particles
*
* Returns:
* - 0 if successfull
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(Answer<numtyp,acctyp> &ans, const int nlocal, const int nall);
/// Initialize the neighbor list storage
/** \param charge True if charges need to be stored
* \param rot True if quaternions need to be stored
* \param nlocal Total number of local particles to allocate memory for
* \param host_nlocal Initial number of host particles to allocate memory for
* \param nall Total number of local+ghost particles
* \param maxspecial Maximum mumber of special bonded atoms per atom
* \param gpu_host 0 if host will not perform force calculations,
* 1 if gpu_nbor is true, and host needs a half nbor list,
* 2 if gpu_nbor is true, and host needs a full nbor list
* \param max_nbors Initial number of rows in the neighbor matrix
* \param cell_size cutoff+skin
* \param pre_cut True if cutoff test will be performed in separate kernel
* than the force kernel
* \param threads_per_atom value to be used by the neighbor list only
*
* Returns:
* - 0 if successfull
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init_nbor(Neighbor *nbor, const int nlocal,
const int host_nlocal, const int nall,
const int maxspecial, const int gpu_host,
const int max_nbors, const double cell_size,
const bool pre_cut, const int threads_per_atom);
/// Output a message for pair_style acceleration with device stats
void init_message(FILE *screen, const char *name,
const int first_gpu, const int last_gpu);
/// Perform charge assignment asynchronously for PPPM
void set_single_precompute(PPPM<numtyp,acctyp,
float,_lgpu_float4> *pppm);
/// Perform charge assignment asynchronously for PPPM
void set_double_precompute(PPPM<numtyp,acctyp,
double,_lgpu_double4> *pppm);
/// Esimate the overhead from GPU calls from multiple procs
/** \param kernel_calls Number of kernel calls/timestep for timing estimated
* overhead
* \param gpu_overhead Estimated gpu overhead per timestep (sec)
* \param driver_overhead Estimated overhead from driver per timestep (s) **/
void estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead,
double &gpu_driver_overhead);
/// Returns true if double precision is supported on card
inline bool double_precision() { return gpu->double_precision(); }
/// Output a message with timing information
void output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
Neighbor &nbor, const double avg_split,
const double max_bytes, const double gpu_overhead,
const double driver_overhead,
const int threads_per_atom, FILE *screen);
/// Output a message with timing information
void output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out,
UCL_Timer & time_map, UCL_Timer & time_rho,
UCL_Timer &time_interp,
Answer<numtyp,acctyp> &ans,
const double max_bytes, const double cpu_time,
const double cpu_idle_time, FILE *screen);
/// Clear all memory on host and device associated with atom and nbor data
void clear();
/// Clear all memory on host and device
void clear_device();
/// Add an answer object for putting forces, energies, etc from GPU to LAMMPS
inline void add_ans_object(Answer<numtyp,acctyp> *ans)
{ ans_queue.push(ans); }
/// Add "answers" (force,energies,etc.) into LAMMPS structures
inline double fix_gpu(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul) {
atom.data_unavail();
if (ans_queue.empty()==false) {
stop_host_timer();
double evdw=0.0;
while (ans_queue.empty()==false) {
evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul);
ans_queue.pop();
}
return evdw;
}
return 0.0;
}
/// Start timer on host
inline void start_host_timer()
{ _cpu_full=MPI_Wtime(); _host_timer_started=true; }
/// Stop timer on host
inline void stop_host_timer() {
if (_host_timer_started) {
_cpu_full=MPI_Wtime()-_cpu_full;
_host_timer_started=false;
}
}
/// Return host time
inline double host_time() { return _cpu_full; }
/// Return host memory usage in bytes
double host_memory_usage() const;
/// Return the number of procs sharing a device (size of device communicator)
inline int procs_per_gpu() const { return _procs_per_gpu; }
/// Return the number of threads per proc
inline int num_threads() const { return _nthreads; }
/// My rank within all processes
inline int world_me() const { return _world_me; }
/// Total number of processes
inline int world_size() const { return _world_size; }
/// MPI Barrier for world
inline void world_barrier() { MPI_Barrier(_comm_world); }
/// Return the replica MPI communicator
inline MPI_Comm & replica() { return _comm_replica; }
/// My rank within replica communicator
inline int replica_me() const { return _replica_me; }
/// Number of procs in replica communicator
inline int replica_size() const { return _replica_size; }
/// Return the per-GPU MPI communicator
inline MPI_Comm & gpu_comm() { return _comm_gpu; }
/// Return my rank in the device communicator
inline int gpu_rank() const { return _gpu_rank; }
/// MPI Barrier for gpu
inline void gpu_barrier() { MPI_Barrier(_comm_gpu); }
/// Return the 'mode' for acceleration: GPU_FORCE, GPU_NEIGH or GPU_HYB_NEIGH
inline int gpu_mode() const { return _gpu_mode; }
/// Index of first device used by a node
inline int first_device() const { return _first_device; }
/// Index of last device used by a node
inline int last_device() const { return _last_device; }
/// Particle split defined in fix
inline double particle_split() const { return _particle_split; }
/// Return the initialization count for the device
inline int init_count() const { return _init_count; }
/// True if device is being timed
inline bool time_device() const { return _time_device; }
/// Return the number of threads accessing memory simulatenously
inline int num_mem_threads() const { return _num_mem_threads; }
/// Return the number of threads per atom for pair styles
inline int threads_per_atom() const { return _threads_per_atom; }
/// Return the number of threads per atom for pair styles using charge
inline int threads_per_charge() const { return _threads_per_charge; }
/// Return the min of the pair block size or the device max block size
inline int pair_block_size() const { return _block_pair; }
/// Return the maximum number of atom types that can be used with shared mem
inline int max_shared_types() const { return _max_shared_types; }
/// Return the maximum order for PPPM splines
inline int pppm_max_spline() const { return _pppm_max_spline; }
/// Return the block size for PPPM kernels
inline int pppm_block() const { return _pppm_block; }
/// Return the block size for neighbor binning
inline int block_cell_2d() const { return _block_cell_2d; }
/// Return the block size for atom mapping for neighbor builds
inline int block_cell_id() const { return _block_cell_id; }
/// Return the block size for neighbor build kernel
inline int block_nbor_build() const { return _block_nbor_build; }
/// Return the block size for "bio" pair styles
inline int block_bio_pair() const { return _block_bio_pair; }
/// Return the block size for "ellipse" pair styles
inline int block_ellipse() const { return _block_ellipse; }
/// Return the maximum number of atom types for shared mem with "bio" styles
inline int max_bio_shared_types() const { return _max_bio_shared_types; }
/// Architecture gpu code compiled for (returns 0 for OpenCL)
inline double ptx_arch() const { return _ptx_arch; }
/// Number of threads executing concurrently on same multiproc
inline int warp_size() const { return _warp_size; }
// -------------------- SHARED DEVICE ROUTINES --------------------
// Perform asynchronous zero of integer array
void zero(UCL_D_Vec<int> &mem, const int numel) {
int num_blocks=static_cast<int>(ceil(static_cast<double>(numel)/
_block_pair));
k_zero.set_size(num_blocks,_block_pair);
k_zero.run(&mem,&numel);
}
// -------------------------- DEVICE DATA -------------------------
/// Geryon Device
UCL_Device *gpu;
enum{GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH};
// --------------------------- ATOM DATA --------------------------
/// Atom Data
Atom<numtyp,acctyp> atom;
// --------------------------- NBOR SHARED KERNELS ----------------
/// Shared kernels for neighbor lists
NeighborShared _neighbor_shared;
// ------------------------ LONG RANGE DATA -----------------------
// Long Range Data
int _long_range_precompute;
PPPM<numtyp,acctyp,float,_lgpu_float4> *pppm_single;
PPPM<numtyp,acctyp,double,_lgpu_double4> *pppm_double;
/// Precomputations for long range charge assignment (asynchronously)
inline void precompute(const int ago, const int nlocal, const int nall,
double **host_x, int *host_type, bool &success,
double *charge, double *boxlo, double *prd) {
if (_long_range_precompute==1)
pppm_single->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
boxlo,prd);
else if (_long_range_precompute==2)
pppm_double->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
boxlo,prd);
}
inline std::string compile_string() { return _ocl_compile_string; }
private:
std::queue<Answer<numtyp,acctyp> *> ans_queue;
int _init_count;
bool _device_init, _host_timer_started, _time_device;
MPI_Comm _comm_world, _comm_replica, _comm_gpu;
int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
_replica_size;
int _gpu_mode, _first_device, _last_device, _platform_id, _nthreads;
double _particle_split;
double _cpu_full;
double _ptx_arch;
double _cell_size; // -1 if the cutoff is used
int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge;
int _pppm_max_spline, _pppm_block;
int _block_pair, _block_ellipse, _max_shared_types;
int _block_cell_2d, _block_cell_id, _block_nbor_build;
int _block_bio_pair, _max_bio_shared_types;
UCL_Program *dev_program;
UCL_Kernel k_zero, k_info;
bool _compiled;
int compile_kernels();
int _data_in_estimate, _data_out_estimate;
std::string _ocl_vendor_name, _ocl_vendor_string, _ocl_compile_string;
int set_ocl_params(char *);
template <class t>
inline std::string toa(const t& in) {
std::ostringstream o;
o.precision(2);
o << in;
return o.str();
}
};
}
#endif