lammps-sys 0.6.0

Generates bindings to LAMMPS' C interface (with optional builds from source)
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
/* -*- c++ -*- ----------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */

#ifdef FIX_CLASS

FixStyle(INTEL,FixIntel)

#else

#ifndef LMP_FIX_INTEL_H
#define LMP_FIX_INTEL_H

#include "fix.h"
#include "intel_buffers.h"
#include "force.h"
#include "pair.h"
#include "error.h"
#include "update.h"

namespace LAMMPS_NS {

class IntelData;
template <class flt_t, class acc_t> class IntelBuffers;

// Fix that coordinates the USER-INTEL package: it owns the precision-specific
// intermediate buffers, tracks threading/reduction state for force arrays,
// and (when compiled with _LMP_INTEL_OFFLOAD) manages host/coprocessor load
// balancing, timers, and result synchronization.
class FixIntel : public Fix {
 public:
  FixIntel(class LAMMPS *, int, char **);
  virtual ~FixIntel();
  virtual int setmask();
  virtual void init();
  virtual void setup(int);
  inline void min_setup(int in) { setup(in); }
  void setup_pre_reverse(int eflag = 0, int vflag = 0);

  // Init-time compatibility checks invoked by intel pair/bond/kspace styles.
  void pair_init_check(const bool cdmessage=false);
  void bond_init_check();
  void kspace_init_check();

  void pre_reverse(int eflag = 0, int vflag = 0);
  inline void min_pre_reverse(int eflag = 0, int vflag = 0)
    { pre_reverse(eflag, vflag); }

  // Get all forces, calculation results from coprocessor
  void sync_coprocessor();

  double memory_usage();

  // Force triple used when accumulating into LAMMPS' native double arrays.
  typedef struct { double x,y,z; } lmp_ft;

  // Precision modes selected by the "package intel" command.
  enum {PREC_MODE_SINGLE, PREC_MODE_MIXED, PREC_MODE_DOUBLE};

  inline int precision() { return _precision_mode; }
  // Accessors for the per-precision buffer objects (only the one matching
  // _precision_mode is expected to be in active use).
  inline IntelBuffers<float,float> * get_single_buffers()
    { return _single_buffers; }
  inline IntelBuffers<float,double> * get_mixed_buffers()
    { return _mixed_buffers; }
  inline IntelBuffers<double,double> * get_double_buffers()
    { return _double_buffers; }

  inline int nbor_pack_width() const { return _nbor_pack_width; }
  inline void nbor_pack_width(const int w) { _nbor_pack_width = w; }
  inline int three_body_neighbor() { return _three_body_neighbor; }
  inline void three_body_neighbor(const int i) { _three_body_neighbor = i; }

  // Returns 1 if thread `tid` must zero its force accumulation area this
  // step: all non-master threads when no reduction is pending, and the
  // master thread exactly once after _zero_master is raised.
  inline int need_zero(const int tid) {
    if (_need_reduce == 0 && tid > 0) return 1;
    else if (_zero_master && tid == 0) { _zero_master = 0; return 1; }
    else return 0;
  }
  inline void set_reduce_flag() { if (_nthreads > 1) _need_reduce = 1; }
  // LRT (long-range thread) mode is only reported active when pppm/intel is
  // the kspace style and this is a dynamics run (whichflag == 1).
  inline int lrt() {
    if (force->kspace_match("pppm/intel", 0) && update->whichflag == 1)
      return _lrt;
    else return 0;
  }
  inline int pppm_table() {
    if (force->kspace_match("pppm/intel", 0) ||
        force->kspace_match("pppm/disp/intel",0))
      return INTEL_P3M_TABLE;
    else return 0;
  }


 protected:
  IntelBuffers<float,float> *_single_buffers;
  IntelBuffers<float,double> *_mixed_buffers;
  IntelBuffers<double,double> *_double_buffers;

  int _precision_mode, _nthreads, _nbor_pack_width, _three_body_neighbor;
  int _pair_intel_count, _pair_hybrid_flag;
  // These should be removed in subsequent update w/ simpler hybrid arch
  int _pair_hybrid_zero, _hybrid_nonpair, _zero_master;

 public:
  inline int* get_overflow_flag() { return _overflow_flag; }
  inline int* get_off_overflow_flag() { return _off_overflow_flag; }
  // Register per-precision result arrays produced by an intel style; the
  // matching out-of-class definitions below handle host vs. offload storage.
  inline void add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
                               double *ev_in, const int offload,
                               const int eatom = 0, const int vatom = 0,
                               const int rflag = 0);
  inline void add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
                               double *ev_in, const int offload,
                               const int eatom = 0, const int vatom = 0,
                               const int rflag = 0);
  inline void add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
                               float *ev_in, const int offload,
                               const int eatom = 0, const int vatom = 0,
                               const int rflag = 0);
  inline void get_buffern(const int offload, int &nlocal, int &nall,
                          int &minlocal);

  #ifdef _LMP_INTEL_OFFLOAD
  // --- Offload build: host/coprocessor partitioning and timing support ---
  void post_force(int vflag);
  inline int coprocessor_number() { return _cop; }
  inline int full_host_list() { return _full_host_list; }
  void set_offload_affinity();
  inline double offload_balance() { return _offload_balance; }
  inline int offload_end_neighbor();
  inline int offload_end_pair();
  inline int host_start_neighbor()
    { if (_offload_noghost) return 0; else return offload_end_neighbor(); }
  inline int host_start_pair()
    { if (_offload_noghost) return 0; else return offload_end_pair(); }
  inline int offload_nlocal() { return _offload_nlocal; }
  inline int offload_nall() { return _offload_nall; }
  inline int offload_min_ghost() { return _offload_min_ghost; }
  inline int host_min_local() { return _host_min_local; }
  inline int host_min_ghost() { return _host_min_ghost; }
  inline int host_used_local() { return _host_used_local; }
  inline int host_used_ghost() { return _host_used_ghost; }
  inline int host_nall() { return _host_nall; }
  inline int separate_buffers() { return _separate_buffers; }
  inline int offload_noghost() { return _offload_noghost; }
  inline void set_offload_noghost(const int v)
    { if (_offload_ghost < 0) _offload_noghost = v; }
  inline void set_neighbor_host_sizes();

  inline void zero_timers()
    { memset(_timers, 0, sizeof(double) * NUM_ITIMERS); }
  inline void start_watch(const int which) { _stopwatch[which] = MPI_Wtime(); }
  inline double stop_watch(const int which);
  inline double * off_watch_pair() { return _stopwatch_offload_pair; }
  inline double * off_watch_neighbor() { return _stopwatch_offload_neighbor; }
  inline void balance_stamp();
  inline void acc_timers();
  #else
  // --- Non-offload build: no-op stubs so callers need no #ifdefs ---
  inline int offload_end_neighbor() { return 0; }
  inline int offload_end_pair() { return 0; }
  inline int host_start_neighbor() { return 0; }
  inline int host_start_pair() { return 0; }
  inline void zero_timers() {}
  inline void start_watch(const int /*which*/) {}
  inline double stop_watch(const int /*which*/) { return 0.0; }
  double * off_watch_pair() { return NULL; }
  double * off_watch_neighbor() { return NULL; }
  inline void balance_stamp() {}
  inline void acc_timers() {}
  inline int separate_buffers() { return 0; }
  #endif

 protected:
  int _overflow_flag[5];
  _alignvar(int _off_overflow_flag[5],64);
  int _allow_separate_buffers, _offload_ghost, _lrt;

  // Host-side result arrays last registered via add_result_array
  // (suffix: _s single, _m mixed, _d double precision).
  IntelBuffers<float,float>::vec3_acc_t *_force_array_s;
  IntelBuffers<float,double>::vec3_acc_t *_force_array_m;
  IntelBuffers<double,double>::vec3_acc_t *_force_array_d;
  float *_ev_array_s;
  double *_ev_array_d;
  int _results_eatom, _results_vatom;
  int _need_reduce;

  #ifdef _LMP_INTEL_OFFLOAD
  double _balance_pair_time, _balance_other_time;
  int _offload_nlocal, _offload_nall, _offload_min_ghost, _offload_nghost;
  int _host_min_local, _host_min_ghost, _host_nall;
  int _host_used_local, _host_used_ghost, _sync_mode;
  int _separate_buffers, _offload_noghost, _separate_coi;
  bool _setup_time_cleared, _timers_allocated;
  void output_timing_data();
  FILE *_tscreen;

  // Coprocessor-side result arrays (offload counterparts of the above).
  IntelBuffers<float,float>::vec3_acc_t *_off_force_array_s;
  IntelBuffers<float,double>::vec3_acc_t *_off_force_array_m;
  IntelBuffers<double,double>::vec3_acc_t *_off_force_array_d;
  float *_off_ev_array_s;
  double *_off_ev_array_d;
  int _off_results_eatom, _off_results_vatom;
  int _full_host_list, _cop, _ncops;

  int get_ppn(int &);
  int set_host_affinity(const int);
  #endif
  void check_neighbor_intel();

  double _offload_balance, _balance_neighbor, _balance_pair, _balance_fixed;
  double _timers[NUM_ITIMERS];
  double _stopwatch[NUM_ITIMERS];
  _alignvar(double _stopwatch_offload_neighbor[1],64);
  _alignvar(double _stopwatch_offload_pair[1],64);

  void _sync_main_arrays(const int prereverse);

  template <class ft>
  void reduce_results(ft * _noalias const f_in);

  template <class ft, class acc_t>
  inline void add_results(const ft * _noalias const f_in,
                          const acc_t * _noalias const ev_global,
                          const int eatom, const int vatom,
                          const int offload);

  template <class ft, class acc_t>
  inline void add_oresults(const ft * _noalias const f_in,
                           const acc_t * _noalias const ev_global,
                           const int eatom, const int vatom,
                           const int out_offset, const int nall);

  int _offload_affinity_balanced, _offload_threads, _offload_tpc;
  #ifdef _LMP_INTEL_OFFLOAD
  int _max_offload_threads, _offload_cores, _offload_affinity_set;
  int _im_real_space_task;
  MPI_Comm _real_space_comm;
  template <class ft, class acc_t>
  inline void add_off_results(const ft * _noalias const f_in,
                              const acc_t * _noalias const ev_global);
  #endif
};

/* ---------------------------------------------------------------------- */

// Report the atom counts a style should iterate over: nlocal (owned atoms),
// nall (owned + relevant ghosts), and minlocal (index offset into the host
// portion). Outputs depend on whether this call targets the offload or the
// host partition when separate host/coprocessor buffers are in use.
void FixIntel::get_buffern(const int offload, int &nlocal, int &nall,
                           int &minlocal) {
  #ifdef _LMP_INTEL_OFFLOAD
  if (_separate_buffers) {
    if (offload) {
      // Use the counts captured at the last neighbor build unless the list
      // was rebuilt this step (ago == 0), in which case current totals apply.
      if (neighbor->ago != 0) {
        nlocal = _offload_nlocal;
        nall = _offload_nall;
      } else {
        nlocal = atom->nlocal;
        nall = nlocal + atom->nghost;
      }
      minlocal = 0;
    } else {
      nlocal = atom->nlocal;
      nall = _host_nall;
      // With newton on, the host range starts at the first host-owned local
      // atom; otherwise it starts where the offload pair range ends.
      if (force->newton)
        minlocal = _host_min_local;
      else
        minlocal = host_start_pair();
    }
    return;
  }
  // NOTE: the else-branch below deliberately pairs with the assignment after
  // the #endif, so the non-offload build always takes the ghost-inclusive
  // count. Do not reorder across the preprocessor boundary.
  if (_offload_noghost && offload)
    nall = atom->nlocal;
  else
  #endif
    nall = atom->nlocal + atom->nghost;
  nlocal = atom->nlocal;
  minlocal = 0;
}

/* ---------------------------------------------------------------------- */

// Register double-precision results from a pair/bond/kspace intel style.
// Offload results are stashed for later synchronization; host results are
// recorded and, for hybrid runs, synchronized into LAMMPS arrays at once.
void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
                                double *ev_in, const int offload,
                                const int eatom, const int vatom,
                                const int rflag) {
  #ifdef _LMP_INTEL_OFFLOAD
  if (offload) {
    _off_force_array_d = f_in;
    _off_ev_array_d = ev_in;
    _off_results_eatom = eatom;
    _off_results_vatom = vatom;
    // Hybrid pair styles that already set fdotr need results merged now.
    if (_pair_hybrid_flag && force->pair->fdotr_is_set())
       _sync_main_arrays(1);
    return;
  }
  #endif

  _results_eatom = eatom;
  _results_vatom = vatom;
  _force_array_d = f_in;
  _ev_array_d = ev_in;
  #ifndef _LMP_INTEL_OFFLOAD
  // A thread reduction is needed unless the style already reduced (rflag==2).
  if (force->newton && _nthreads > 1 && rflag != 2) _need_reduce = 1;
  #endif

  if (_overflow_flag[LMP_OVERFLOW])
    error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");

  const bool hybrid_sync = _pair_hybrid_flag > 1 ||
    (_pair_hybrid_flag && force->pair->fdotr_is_set());
  if (hybrid_sync) _sync_main_arrays(0);
}

/* ---------------------------------------------------------------------- */

// Register mixed-precision (float forces, double accumulation) results.
// Mirrors the double-precision overload but stores into the _m force slot.
void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
                                double *ev_in, const int offload,
                                const int eatom, const int vatom,
                                const int rflag) {
  #ifdef _LMP_INTEL_OFFLOAD
  if (offload) {
    _off_force_array_m = f_in;
    _off_ev_array_d = ev_in;
    _off_results_eatom = eatom;
    _off_results_vatom = vatom;
    // Hybrid pair styles that already set fdotr need results merged now.
    if (_pair_hybrid_flag && force->pair->fdotr_is_set())
       _sync_main_arrays(1);
    return;
  }
  #endif

  _results_eatom = eatom;
  _results_vatom = vatom;
  _force_array_m = f_in;
  _ev_array_d = ev_in;
  #ifndef _LMP_INTEL_OFFLOAD
  // A thread reduction is needed unless the style already reduced (rflag==2).
  if (force->newton && _nthreads > 1 && rflag != 2) _need_reduce = 1;
  #endif

  if (_overflow_flag[LMP_OVERFLOW])
    error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");

  const bool hybrid_sync = _pair_hybrid_flag > 1 ||
    (_pair_hybrid_flag && force->pair->fdotr_is_set());
  if (hybrid_sync) _sync_main_arrays(0);
}

/* ---------------------------------------------------------------------- */

// Register single-precision results (float forces and float energy/virial).
// Mirrors the other overloads but uses the _s force and ev slots.
void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
                                float *ev_in, const int offload,
                                const int eatom, const int vatom,
                                const int rflag) {
  #ifdef _LMP_INTEL_OFFLOAD
  if (offload) {
    _off_force_array_s = f_in;
    _off_ev_array_s = ev_in;
    _off_results_eatom = eatom;
    _off_results_vatom = vatom;
    // Hybrid pair styles that already set fdotr need results merged now.
    if (_pair_hybrid_flag && force->pair->fdotr_is_set())
       _sync_main_arrays(1);
    return;
  }
  #endif

  _results_eatom = eatom;
  _results_vatom = vatom;
  _force_array_s = f_in;
  _ev_array_s = ev_in;
  #ifndef _LMP_INTEL_OFFLOAD
  // A thread reduction is needed unless the style already reduced (rflag==2).
  if (force->newton && _nthreads > 1 && rflag != 2) _need_reduce = 1;
  #endif

  if (_overflow_flag[LMP_OVERFLOW])
    error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");

  const bool hybrid_sync = _pair_hybrid_flag > 1 ||
    (_pair_hybrid_flag && force->pair->fdotr_is_set());
  if (hybrid_sync) _sync_main_arrays(0);
}

/* ---------------------------------------------------------------------- */

#ifdef _LMP_INTEL_OFFLOAD

/* ---------------------------------------------------------------------- */

// Index of the first host atom for neighbor builds. With dynamic load
// balancing (_offload_balance < 0), first clamp the offload fraction so at
// least one atom lands on each side of the split.
int FixIntel::offload_end_neighbor() {
  const int nlocal = atom->nlocal;
  if (_offload_balance < 0.0) {
    if (nlocal < 2)
      error->one(FLERR,"Too few atoms for load balancing offload");
    const double granularity = 1.0 / nlocal;
    const double upper = 1.0 - granularity;
    if (_balance_neighbor < granularity)
      _balance_neighbor = granularity + 1e-10;
    else if (_balance_neighbor > upper)
      _balance_neighbor = upper + 1e-10;
  }
  return _balance_neighbor * nlocal;
}

// Index of the first host atom for pair computation. On a neighbor-rebuild
// step (ago == 0) the neighbor split fraction applies instead.
int FixIntel::offload_end_pair() {
  const double fraction =
    (neighbor->ago == 0) ? _balance_neighbor : _balance_pair;
  return fraction * atom->nlocal;
}

/* ---------------------------------------------------------------------- */

// Stop the timer started by start_watch(which), accumulate the elapsed
// wall-clock time into _timers, and return the elapsed interval.
double FixIntel::stop_watch(const int which) {
  const double now = MPI_Wtime();
  const double elapsed = now - _stopwatch[which];
  _timers[which] += elapsed;
  return elapsed;
}

/* ---------------------------------------------------------------------- */

// Record timing stamps used by dynamic load balancing; only active when the
// balance is negotiated automatically (_offload_balance < 0).
void FixIntel::balance_stamp() {
  if (_offload_balance >= 0.0) return;
  const double now = MPI_Wtime();
  _balance_pair_time = now - _stopwatch[TIME_HOST_PAIR];
  _balance_other_time = now;
}

/* ---------------------------------------------------------------------- */

// Fold the offload stopwatch values into the accumulated timers. Neighbor
// time is only added on rebuild steps; timers are zeroed once after setup so
// reported totals exclude the first (setup) accumulation.
void FixIntel::acc_timers() {
  _timers[TIME_OFFLOAD_PAIR] += *_stopwatch_offload_pair;
  if (neighbor->ago != 0) return;
  _timers[TIME_OFFLOAD_NEIGHBOR] += *_stopwatch_offload_neighbor;
  if (!_setup_time_cleared) {
    zero_timers();
    _setup_time_cleared = true;
  }
}

/* ---------------------------------------------------------------------- */

void FixIntel::set_neighbor_host_sizes() {
  _host_min_local = _overflow_flag[LMP_LOCAL_MIN];
  _host_min_ghost = _overflow_flag[LMP_GHOST_MIN];
  _host_used_local = atom->nlocal - _host_min_local;
  _host_used_ghost = _overflow_flag[LMP_GHOST_MAX] + 1 - _host_min_ghost;
  if (_host_used_ghost < 0) _host_used_ghost = 0;
  _host_nall = atom->nlocal + _host_used_ghost;
}

/* ---------------------------------------------------------------------- */

#endif

}

#endif
#endif

/* ERROR/WARNING messages:

E: The 'package intel' command is required for /intel styles

Self-explanatory.

W: Could not set host affinity for offload tasks

When using offload to a coprocessor, the application will try to set affinity
for host MPI tasks and OpenMP threads and will generate a warning if unable
to do so successfully. In the unsuccessful case, you might wish to set
affinity outside of the application and performance might suffer if
hyperthreading is disabled on the CPU.

E: Neighbor list overflow, boost neigh_modify one

Increase the value for neigh_modify one to allow for larger allocations for
neighbor list builds. The value required can be different for the Intel
package in order to support offload to a coprocessor.

E: Bad matrix inversion in mldivide3

This error should not occur unless the matrix is badly formed.

E: Illegal package intel command

The format for the package intel command is incorrect. Please see the
documentation.

E: fix intel has to operate on group 'all'

Self-explanatory.

E: Illegal package intel mode requested

The format for the package intel command is incorrect. Please see the
documentation.

E: Currently, neighbor style BIN must be used with Intel package.

This is the only neighbor style that has been implemented for the Intel
package.

E: Currently, cannot use neigh_modify exclude with Intel package offload.

This is a current restriction of the Intel package when built for offload.

W: Unknown Intel Compiler Version

The compiler version used to build LAMMPS has not been tested with
offload to a coprocessor.

W: Unsupported Intel Compiler

The compiler version used to build LAMMPS is not supported when using
offload to a coprocessor. There could be performance or correctness
issues. Please use 14.0.1.106 or 15.1.133 or later.

E: Currently, cannot offload more than one intel style with hybrid.

Currently, when using offload, hybrid pair styles can only use the intel
suffix for one of the pair styles.

E: Cannot yet use hybrid styles with Intel offload.

The hybrid pair style configuration is not yet supported when using offload
within the Intel package. Support is limited to hybrid/overlay or a hybrid
style that does not require a skip list.

W: Leaving a core/node free can improve performance for offload

When each CPU is fully subscribed with MPI tasks and OpenMP threads,
context switching with threads used for offload can sometimes decrease
performance. If you see this warning, try using fewer MPI tasks/OpenMP threads
per node to leave a physical CPU core free on each node.

E: MPI tasks per node must be multiple of offload_cards

For offload to multiple coprocessors on a single node, the Intel package
requires that each coprocessor is used by the same number of MPI tasks.

W: More MPI tasks/OpenMP threads than available cores

Using more MPI tasks/OpenMP threads than available cores will typically
decrease performance.

E: USER-INTEL package requires same setting for newton bond and non-bond.

The newton setting must be the same for both pairwise and bonded forces.

E: Intel styles for bond/angle/dihedral/improper require intel pair style.

You cannot use the USER-INTEL package for bond calculations without a
USER-INTEL supported pair style.

E: Intel styles for kspace require intel pair style.

You cannot use the USER-INTEL package for kspace calculations without a
USER-INTEL supported pair style.

E: Cannot currently get per-atom virials with intel package.

The Intel package does not yet support per-atom virial calculation.

E: Too few atoms for load balancing offload.

When using offload to a coprocessor, each MPI task must have at least 2
atoms throughout the simulation.

E: Intel package requires fdotr virial with newton on.

This error can occur with a hybrid pair style that mixes styles that are
incompatible with the newton pair setting turned on. Try turning the
newton pair setting off.

E: Add -DLMP_INTEL_NBOR_COMPAT to build for special_bond exclusions with Intel

When using a manybody pair style, bonds/angles/dihedrals, and special_bond
exclusions, LAMMPS should be built with the above compile flag for compatible
results.

*/