-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhamr_cuda_launch.h
108 lines (95 loc) · 3.87 KB
/
hamr_cuda_launch.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#ifndef hamr_cuda_launch_h
#define hamr_cuda_launch_h
/// @file
#include "hamr_config.h"
#include <deque>
#include <cuda.h>
#include <cuda_runtime.h>
/// heterogeneous accelerator memory resource
namespace hamr
{
/** A flat array is broken into blocks of number of threads where each adjacent
* thread accesses adjacent memory locations. To accomplish this we might need
* a large number of blocks. If the number of blocks exceeds the max block
* dimension in the first and or second block grid dimension then we need to
* use a 2d or 3d block grid.
*
* ::partition_thread_blocks - decides on a partitioning of the data based on
* warps_per_block parameter. The resulting decomposition will be either 1,2,
* or 3D as needed to accommodate the number of fixed sized blocks. It can
* happen that max grid dimensions are hit, in which case you'll need to
* increase the number of warps per block.
*
* ::thread_id_to_array_index - given a thread and block id gets the
* array index to update. _this may be out of bounds so be sure
* to validate before using it.
*
* ::index_is_valid - test an index for validity.
*/
/// @name CUDA indexing scheme
///@{
/// query properties for the named CUDA device. retruns non-zero on error
HAMR_EXPORT
int get_launch_props(int device_id,
int *block_grid_max, int &warp_size,
int &max_warps_per_block);
/** convert a CUDA index into a flat array index using the partitioning scheme
* defined in partition_thread_blocks
*/
inline
__device__
unsigned long thread_id_to_array_index()
{
return threadIdx.x + blockDim.x*(blockIdx.x + blockIdx.y * gridDim.x
+ blockIdx.z * gridDim.x * gridDim.y);
}
/// bounds check the flat index
inline
__device__
int index_is_valid(unsigned long index, unsigned long max_index)
{
return index < max_index;
}
/** Calculate CUDA launch parameters for an arbitrarily large flat array.
*
* @param[in] device_id the CUDA device to use. Default values for
* warps_per_block and block_grid_max are determined by
* querying the capabilities of the device. If -1 is
* passed then the currently active device is used.
* @param[in] array_size the length of the array being processed
* @param[in] warps_per_block number of warps to use per block (your choice).
* Using a larger number here will result in fewer
* blocks being processed concurrently.
*
* @param[out] block_grid block dimension kernel launch control
* @param[out] n_blocks number of blocks
* @param[out] thread_grid thread dimension kernel launch control
*
* @returns zero if successful and non-zero if an error occurred
*/
HAMR_EXPORT
int partition_thread_blocks(int device_id, size_t array_size,
int warps_per_block, dim3 &block_grid, int &n_blocks,
dim3 &thread_grid);
/** Calculate CUDA launch parameters for an arbitrarily large flat array. See
* ::get_launch_props for determining the correct values for warp_size and
* block_grid_max.
*
* @param[in] array_size The length of the array being processed
* @param[in] warp_size The number of threads per warp supported on the device
* @param[in] warps_per_block The number of warps to use per block (your choice)
* @param[in] block_grid_max The maximum number of blocks, in 3-dimensions,
* supported by the device
* @param[out] block_grid The block grid dimension kernel launch control parameter
* @param[out] n_blocks The total number of blocks that will be launched
* @param[out] thread_grid The thread grid dimension kernel launch control parameter
*
* @returns zero if successful and non-zero if an error occurred
*/
HAMR_EXPORT
int partition_thread_blocks(size_t array_size,
int warps_per_block, int warp_size, int *block_grid_max,
dim3 &block_grid, int &n_blocks, dim3 &thread_grid);
}
///@}
#endif