-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmiluphpc.h
515 lines (459 loc) · 18.7 KB
/
miluphpc.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
/**
* @file miluphpc.h
* @brief Right-hand-side implementation and CUDA kernel execution via wrapper functions.
*
* **Abstract base class for integrator classes implementing the right-hand-side
* via modular functions for different parts of the simulation.**
*
* Since dynamic memory allocation and access to heap objects in GPUs are usually suboptimal,
* an array-based data structure is used to store the (pseudo-)particle information as well as the tree
* and allows for efficient cache alignment. Consequently, array indices are used instead of pointers
* to constitute the tree out of the tree nodes, whereas "-1" represents a null pointer and "-2" is
* used for locking nodes. For this purpose an array with the minimum length \f$ 2^{d} \cdot (M - N) \f$
* with dimensionality \f$ d \f$ and number of cells \f$ (M -N) \f$ is needed to store the children.
* The \f$ i \f$-th child of node \f$ c \f$ can therefore be accessed via index \f$ 2^d \cdot c + i \f$.
*
* \image html images/Parallelization/coarse_flow.png width=50%
*
* @author Michael Staneker
* @bug no known bugs
*/
#ifndef MILUPHPC_MILUPHPC_H
#define MILUPHPC_MILUPHPC_H
#include "particle_handler.h"
#include "subdomain_key_tree/tree_handler.h"
#include "subdomain_key_tree/subdomain_handler.h"
#include "device_rhs.cuh"
#include "cuda_utils/cuda_utilities.cuh"
#include "utils/logger.h"
#include "utils/timer.h"
#include "materials/material_handler.h"
#include "helper_handler.h"
#include "gravity/gravity.cuh"
#include "sph/sph.cuh"
#include "cuda_utils/cuda_runtime.h"
#include "utils/cxxopts.h"
#include "utils/h5profiler.h"
#include "sph/kernel.cuh"
#include "sph/kernel_handler.cuh"
#include "sph/density.cuh"
#include "sph/pressure.cuh"
#include "sph/internal_forces.cuh"
#include "sph/soundspeed.cuh"
#include "simulation_time_handler.h"
#include "processing/kernels.cuh"
#include <iostream>
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <climits> // for ulong_max
#include <algorithm>
#include <cmath>
#include <utility>
#include <set>
#include <fstream>
#include <iomanip>
#include <random>
#include <fstream>
#include <boost/mpi/collectives/all_gatherv.hpp>
#include <highfive/H5File.hpp>
#include <highfive/H5DataSpace.hpp>
#include <highfive/H5DataSet.hpp>
/**
* @brief MilupHPC class
*
* **Abstract base class for integrator classes implementing the right-hand-side
* via modular functions for different parts of the simulation.**
*/
class Miluphpc {
private:
// @todo implement serial versions in order to reduce overhead
// arised from parallelization
//real serial_tree();
//real serial_pseudoParticles();
//real serial_gravity();
//real serial_sph();
/**
* @brief Reset arrays, values, ...
*
* This function embraces resetting the
*
* * pseudo-particles
* * child array
* * the boundaries
* * the domain list nodes
* * the neighbor list
* * and so forth ...
*
* ensuring correct starting conditions.
*
* @return accumulated time of functions within
*/
real reset();
/**
* @brief Calculate bounding boxes/simulation domain.
*
* To derive the bounding boxes of the simulation domain the maximum and minimum value for each coordinate
* axis is determined locally on each process and afterwards reduced to all processes to get the global
* maximum and minimum via `MPI_Allreduce()`.
*
* @return accumulated time of functions within
*/
real boundingBox();
/**
* @brief Parallel version regarding tree-stuff.
*
* @return accumulated time of functions within
*/
real parallel_tree();
/**
* @brief Parallel version regarding computation of pseudo-particles.
*
* @return accumulated time of functions within
*/
real parallel_pseudoParticles();
/**
* @brief Parallel version regarding computation of gravitational stuff.
*
* @return accumulated time of functions within
*/
real parallel_gravity();
/**
* @brief Parallel version regarding computation of SPH-stuff.
*
* @return accumulated time of functions within
*/
real parallel_sph();
// @todo possible to combine sendPartclesEntry and sendParticles
/**
* @brief Send particles/Exchange particles among MPI processes.
*
* @tparam T
* @param sendLengths
* @param receiveLengths
* @param entry
* @param entryBuffer
* @param copyBuffer
* @return
*/
template <typename T>
integer sendParticlesEntry(integer *sendLengths, integer *receiveLengths, T *entry, T *entryBuffer, T *copyBuffer);
//void exchangeParticleEntry(integer *sendLengths, integer *receiveLengths, real *entry);
/**
* @brief Send particles/Exchange particles among MPI processes.
*
* @tparam T
* @param sendBuffer
* @param receiveBuffer
* @param sendLengths
* @param receiveLengths
* @return
*/
template <typename T>
integer sendParticles(T *sendBuffer, T *receiveBuffer, integer *sendLengths, integer *receiveLengths);
/**
* @brief Function to sort an array `entry` in dependence of another array `sortArray`
*
* @tparam U type of the arrays determining the sorting
* @tparam T type of the arrays to be sorted
* @param sortArray array to be sorted (rather sorting behaviour should be determined)
* @param sortedArray sorted array (result of sorting)
* @param entry (relevant) array to be sorted
* @param temp buffer needed for sorting process
* @return accumulated time of functions within
*/
template <typename U, typename T>
real arrangeParticleEntries(U *sortArray, U *sortedArray, T *entry, T *temp);
/**
* @brief Assign particles to correct process in dependence of particle key and ranges.
*
* First an extra array is used to save the information of process assignment for each particle,
* this extra array is used as key for sorting all of the attributes of the Particle class
* and finally the sub-array determined by the process assignment are sent to the corresponding process via MPI.
*
* @return accumulated time of functions within
*/
real assignParticles();
/**
* @brief Calculate the angular momentum for all particles.
*
* @return accumulated time of functions within
*/
real angularMomentum();
/**
* @brief Calculate the total amount of energy.
*
* @return accumulated time of functions within
*/
real energy();
public:
/// current sub-step (there are possibly more sub-steps within a step!)
int subStep;
/// search radius for SPH (MPI-process overarching) neighbor search
real h_searchRadius;
/// total energy
real totalEnergy;
/// total angular momentum
real totalAngularMomentum;
/**
* @brief Remove particles in dependence of some criterion.
*
* Criterion can be specified in the config file.
* Removing particles can be accomplished by marking the particles to be removed via an extra array
* and using this one as key for sorting all the (pseudo-)particle arrays and finally deleting them
* by overwriting those entries with default values.
*
* @return accumulated time of functions within
*/
real removeParticles();
/**
* @brief Load balancing via equidistant ranges.
*/
void fixedLoadBalancing();
/**
* @brief Pre-calculations for `updateRangeApproximately`.
*
* Potential wrapper functions if more *range determination functions* come
* into exist.
*
* @param bins amount of bins the range will be subdivided
*/
void dynamicLoadBalancing(int bins=5000);
/**
* @brief Update the ranges (approximately and dynamically).
*
* A histogram is generated with the amount of bins given by `bins`.
* The possible range is distributed accordingly and particles are sorted
* in dependence of their key into, in order to extract the ranges.
*
* @param aimedParticlesPerProcess aimed amount particles per process
* @param bins amount of bins the range will be subdivided
*/
void updateRangeApproximately(int aimedParticlesPerProcess, int bins=5000);
/**
* @brief Update the range in dependence on number of (MPI) processes and aimed particles per process.
*
* Counting the particles per process, calculating the aimed particles per process as quotient of total number of
* particles and number of processes, determining the particle keys, sorting them and determining the new ranges
* as the indices of the multiplies of the aimed number of particles per process.
*
* @param aimedParticlesPerProcess optimal number of particles per process
*/
void updateRange(int aimedParticlesPerProcess);
/// H5 profiler instance
H5Profiler &profiler = H5Profiler::getInstance("log/performance.h5");
/// Space-filling curve type to be used (Lebesgue or Hilbert)
Curve::Type curveType;
/// Instance to handle the SPH `Kernel` instance on device and host
SPH::KernelHandler kernelHandler;
/// Instance to handle the `SimulationTime` instances on device and host
SimulationTimeHandler *simulationTimeHandler;
/// number of particles (to be allocated)
integer numParticles;
/// (real) number of particles on all processes
integer sumParticles;
/**
* number of particles currently living on the (MPI) process
*
* Temporarily are more particles on the process, e.g. for gravitational
* and SPH (force) calculations.
*/
integer numParticlesLocal;
/// number of nodes (to be allocated)
integer numNodes;
/**
* Instance(s) to handle the `IntegratedParticles` instance(s) on device and host
*
* Memory for this may or may not be allocated within child classes.
* This depends on whether instances are needed for cache particle information
* e.g. for predictor-corrector integrators.
*/
IntegratedParticleHandler *integratedParticles;
integer *d_mutex;
/// Instance to handle the `Particles` instance on device and host
ParticleHandler *particleHandler;
/// Instance to handle the `SubDomainKeyTree` instance on device and host
SubDomainKeyTreeHandler *subDomainKeyTreeHandler;
/// Instance to handle the `Tree` instance on device and host
TreeHandler *treeHandler;
/// Instance to handle the `DomainList` instance on device and host
DomainListHandler *domainListHandler;
/// Instance to handle the (lowest) `DomainList` instance on device and host
DomainListHandler *lowestDomainListHandler;
/// Instance to handle `Materials` instances on device and host
MaterialHandler *materialHandler;
/// @todo revise buffer handling
/// buffer instance
//HelperHandler *helperHandler;
/// buffer instance
HelperHandler *buffer;
// testing
/// buffer (need for revising)
//integer *d_particles2SendIndices;
/// buffer (need for revising)
//integer *d_pseudoParticles2SendIndices;
/// buffer (need for revising)
//integer *d_pseudoParticles2SendLevels;
/// buffer (need for revising)
//integer *d_pseudoParticles2ReceiveLevels;
/// buffer (need for revising)
//integer *d_particles2SendCount;
/// buffer (need for revising)
//integer *d_pseudoParticles2SendCount;
/// buffer (need for revising)
//int *d_particles2removeBuffer;
/// buffer (need for revising)
//int *d_particles2removeVal;
/// buffer (need for revising)
//idInteger *d_idIntegerBuffer;
/// buffer (need for revising)
//idInteger *d_idIntegerCopyBuffer;
// end: testing
/// collected information required to set up the simulation
SimulationParameters simulationParameters;
/**
* @brief Constructor to set up simulation.
*
* @param simulationParameters all the information required to set up simulation
*/
Miluphpc(SimulationParameters simulationParameters);
/**
* @brief Destructor freeing class instances.
*/
~Miluphpc();
/**
* @brief Prepare the simulation, including
*
* * loading the initial conditions
* * copying to device
* * computing the bounding boxes
* * initial ranges (load balancing)
*/
void prepareSimulation();
/**
* @brief Determine amount of particles (`numParticles` and `numParticlesLocal`)
* from initial file/particle distribution file
*
* Since the information of how many particles to be simulated is needed to
* allocate (the right amount of) memory, this function need to be called before
* `distributionFromFile()`
*
* @param filename initial conditions file to be read
*/
void numParticlesFromFile(const std::string& filename);
/**
* @brief Read initial/particle distribution file (in parallel)
*
* @param filename initial conditions file to be read
*/
void distributionFromFile(const std::string& filename);
/**
* @brief Wrapper function for building the tree (and domain tree).
*
* To build the parallel tree it is necessary to determine the domain list or common coarse tree nodes which
* are derived from the ranges. After that, the tree is built locally and finally all the domain list nodes
* or rather those attributes as indices are either saved to an instance of the omainList class
* or if they do not exist after building the tree created and afterwards assigned.
*
* @return accumulated time for functions within
*/
real tree();
/**
* @brief Wrapper function for calculating pseudo-particles
*
* The pseudo-particle computation starts with determining the lowest domain list, this information is stored
* in a separate instance of the DomainList class continues with the local COM computation,
* the resetting of the domain list nodes that are not lowest domain list nodes, the preparation for the
* exchange by copying the information into contiguous memory, subsequent communication via MPI, the updating
* of the lowest domain list nodes and finally ends with the computation of the remaining domain list node
* pseudo-particles.
*
* @return accumulated time for functions within
*/
real pseudoParticles();
/**
* @brief Wrapper function for Gravity-related stuff
*
* The gravitational force computation corresponds to the Barnes-Hut method. First all relevant lowest domain
* list nodes belonging to another process are determined. Then the pseudo-particles and particles to be sent
* can be determined by checking the extended $\theta$-criterion for all nodes and all relevant lowest domain
* list nodes. This is done in an approach where particles are either marked to be sent or not in an extra array
* and afterwards copied to contiguous memory in order to send them via MPI. After receiving, the particles are
* inserted into the local tree, whereas first the pseudo-particles and then the particles are inserted.
* Since complete sub-trees are inserted, no new pseudo-particles are generated during this process. Afterwards,
* the actual force computation can be accomplished locally. Finally the inserted (pseudo-)particles are removed.
*
* \image html images/Parallelization/interactions_gravity.png width=30%
*
* @return accumulated time for functions within
*/
real gravity();
/**
* Wrapper function for SPH-related stuff
*
* Similar to the gravitational force computation is the SPH part. The relevant lowest domain list nodes are
* determined in order to decide which particles need to be sent, which is also done in an analogous approach
* in the gravitational part.
* However, in this case only particles are sent and after
* exchanging those particles, the insertion includes generation of new pseudo-particles similar to building
* or rather extending the local tree. With inserted particles from the other processes the FRNN search can be
* done and subsequent computation of the density, speed of sound and pressure realized.
* The corresponding particle properties are
* sent again, so that the actual force computation can be performed. Finally, the inserted and generated
* (pseudo-)particles are removed.
*
* \image html images/SPH_concept.png width=50%
* \image html images/Parallelization/interactions_sph.png width=30%
*
* @return accumulated time for functions within
*/
real sph();
/**
* Right Hand Side - all of the simulation without integration itself.
*
* This method is used/called by the different integrators in order to
* integrate or execute the simulation.
* The force or acceleration computation and necessary steps to calculate them are encapsulated in this function
* whose individual components and interoperation are depicted in the following.
* This right-hand-side is called once or several times within a (sub-)integration step in dependence of the
* integration scheme. In addition, the function depends on whether the simulation runs single-node or multi-node,
* gravity and/or SPH are included into the simulation and some other conditions.
*
* \image html images/Parallelization/rhs_flow.png width=50%
*
* @param step simulation step, regarding output
* @param selfGravity apply self-gravity y/n (needed since gravity is decoupled)
* @param assignParticlesToProcess assign particles to correct process y/n
* @return accumulated time for functions within
*/
real rhs(int step, bool selfGravity=true, bool assignParticlesToProcess=true);
/**
* Abstract method for integration. Implemented by integration classes,
* calling `rhs()` different times
* @param step
*/
virtual void integrate(int step = 0) = 0;
/**
* Function to be called after successful integration (step)
*/
void afterIntegrationStep();
/**
* Write all of the information/particle distribution to H5 file
*
* @param step simulation step, used to name file
* @return accumulated time for functions within
*/
real particles2file(int step);
/**
* Write particles or rather particle's information/attributes to file. Particles to be outputted
* determined by `particleIndices`
*
* @param filename file name of output H5 file
* @param particleIndices particle's indices to be outputted
* @param length amount of particles to be outputted
* @return accumulated time for functions within
*/
real particles2file(const std::string& filename, int *particleIndices, int length);
void getMemoryInfo();
};
#endif //MILUPHPC_MILUPHPC_H