-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhamr_cuda_kernels.h
174 lines (139 loc) · 4.41 KB
/
hamr_cuda_kernels.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#ifndef hamr_cuda_kernels_h
#define hamr_cuda_kernels_h
#include "hamr_config.h"
#include "hamr_env.h"
#include "hamr_cuda_launch.h"
namespace hamr
{
namespace cuda_kernels
{
/// helpers to get the printf code given a POD type
template <typename T> struct printf_tt {};
#define declare_printf_tt(cpp_t, print_t, code, len)\
/** printf code wrapper for cpp_t */ \
template <> struct printf_tt<cpp_t> \
{ \
/** cast from cpp_t to print_t */ \
__device__ \
static print_t get_value(cpp_t v) \
{ return v; } \
\
/** returns the printf code for cpp_t */ \
__device__ \
static const char *get_code() \
{ return code; } \
\
/** copies the printf code */ \
__device__ \
static void copy_code(char *dest) \
{ \
for (int i = 0; i < len; ++i) \
dest[i] = get_code()[i]; \
} \
\
/** returns the length of the printf code */ \
__device__ \
static int get_code_len() \
{ return len; } \
};
declare_printf_tt(char, int, "%d", 2)
declare_printf_tt(signed char, int, "%d", 2)
declare_printf_tt(unsigned char, unsigned int, "%u", 2)
declare_printf_tt(short, short, "%hd", 3)
declare_printf_tt(unsigned short, unsigned short, "%hu", 3)
declare_printf_tt(int, int, "%d", 2)
declare_printf_tt(unsigned int, unsigned int, "%u", 2)
declare_printf_tt(long, long, "%ld", 3)
declare_printf_tt(unsigned long, unsigned long, "%lu", 3)
declare_printf_tt(long long, long long, "%lld", 4)
declare_printf_tt(unsigned long long, unsigned long long, "%llu", 4)
declare_printf_tt(float, float, "%g", 2)
declare_printf_tt(double, double, "%g", 2)
/// send an array to the stderr stream on the GPU using CUDA
template <typename T>
__global__
void print(const T *vals, size_t n_elem)
{
unsigned long i = hamr::thread_id_to_array_index();
if (i >= n_elem)
return;
int cl = printf_tt<T>::get_code_len();
char fmt[] = "vals[%lu] = XXXXXXXXX"; // <-- 20
printf_tt<T>::copy_code(fmt + 12);
fmt[12 + cl] = '\n';
fmt[13 + cl] = '\0';
printf(fmt, i, printf_tt<T>::get_value(vals[i]));
}
/// copy an array on the GPU using CUDA
template <typename T, typename U>
__global__
void copy(T *dest, const U *src, size_t n_elem)
{
unsigned long i = hamr::thread_id_to_array_index();
if (i >= n_elem)
return;
dest[i] = static_cast<T>(src[i]);
}
/// default construct on the GPU
template <typename T>
__global__
void construct(T *dest, size_t n_elem)
{
unsigned long i = hamr::thread_id_to_array_index();
if (i >= n_elem)
return;
new (&dest[i]) T();
}
/// copy construct on the GPU
template <typename T, typename U>
__global__
void construct(T *dest, size_t n_elem, U val)
{
unsigned long i = hamr::thread_id_to_array_index();
if (i >= n_elem)
return;
new (&dest[i]) T(val);
}
/// copy construct on the GPU
template <typename T, typename U>
__global__
void construct(T *dest, size_t n_elem, const U *vals)
{
unsigned long i = hamr::thread_id_to_array_index();
if (i >= n_elem)
return;
new (&dest[i]) T(vals[i]);
}
/// destruct on the GPU
template <typename T>
__global__
void destruct(T *dest, size_t n_elem)
{
unsigned long i = hamr::thread_id_to_array_index();
if (i >= n_elem)
return;
dest[i].~T();
}
/// initialize an array on the GPU
template <typename T, typename U>
__global__
void fill(T *dest, size_t n_elem, U val)
{
unsigned long i = hamr::thread_id_to_array_index();
if (i >= n_elem)
return;
dest[i] = val;
}
/// initialize an array on the GPU
template <typename T, typename U>
__global__
void fill(T *dest, size_t n_elem, const U *vals)
{
unsigned long i = hamr::thread_id_to_array_index();
if (i >= n_elem)
return;
dest[i] = vals[i];
}
}
}
#endif