-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathAlignedCpuArray.h
113 lines (93 loc) · 2.72 KB
/
AlignedCpuArray.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
/*
* AlignedCpuArray.h
*
* Created on: Feb 1, 2021
* Author: tugrul
*/
#ifndef ALIGNEDCPUARRAY_H_
#define ALIGNEDCPUARRAY_H_
#include <cstdlib>
#include <iostream>
#include <new>
#include <stdexcept>

#include <CL/cl.h>
// Storage for an active page.
// A pinned array is faster for data copying (the Page class uses this for all active pages for performance).
// Without pinning (page-locking), it still allocates with a high alignment value (4096) to keep some of that performance.
// This class is meant to be used inside the Page class, within a smart pointer.
// Owning host-side buffer of `size` elements of type T.
//
// Two storage modes, selected at construction:
//  - pinned:   an OpenCL buffer created with CL_MEM_ALLOC_HOST_PTR and mapped
//              for read/write; the mapped pointer is exposed through getArray().
//  - unpinned: plain host memory obtained from the platform's aligned allocator.
//
// The object owns its storage and releases it in the destructor, so it is
// non-copyable (a copy would release the same storage twice).
template<typename T>
class AlignedCpuArray
{
public:
    // ctxP:      OpenCL context used to create the buffer (pinned mode only).
    // cqP:       OpenCL command queue used to map/unmap the buffer (pinned mode only).
    // sizeP:     number of elements of type T.
    // alignment: byte alignment of the allocation in unpinned mode; must be a
    //            positive power of two. Ignored in pinned mode.
    // pinArray:  true selects the OpenCL-backed (CL_MEM_ALLOC_HOST_PTR) storage.
    //
    // Throws std::invalid_argument if the byte size overflows size_t, if an
    // OpenCL call fails, or if the alignment is invalid.
    // Throws std::bad_alloc if the unpinned allocation fails.
    AlignedCpuArray(cl_context ctxP, cl_command_queue cqP,size_t sizeP, int alignment=4096, bool pinArray=false)
        : size(sizeP), pinned(pinArray), ctx(ctxP), cq(cqP), mem(nullptr), arr(nullptr)
    {
        // Guard the multiplication below: a wrapped byte count would allocate
        // a buffer smaller than the caller believes it has.
        if(size > static_cast<size_t>(-1)/sizeof(T))
        {
            throw std::invalid_argument("error: array size overflow");
        }
        const size_t bytes = size*sizeof(T);

        if(pinned)
        {
            // OpenCL-backed storage: create the buffer, then map it (blocking).
            cl_int err;
            mem=clCreateBuffer(ctx,CL_MEM_READ_WRITE|CL_MEM_ALLOC_HOST_PTR,bytes,nullptr,&err);
            if(CL_SUCCESS!=err)
            {
                throw std::invalid_argument("error: mem alloc host ptr");
            }

            arr=static_cast<T *>(clEnqueueMapBuffer(cq,mem,CL_TRUE,CL_MAP_READ|CL_MAP_WRITE,0,bytes,0,nullptr,nullptr,&err));
            if(CL_SUCCESS!=err)
            {
                // The destructor does not run when the constructor throws,
                // so the buffer created above must be released here.
                clReleaseMemObject(mem);
                mem=nullptr;
                arr=nullptr;
                throw std::invalid_argument("error: map");
            }
        }
        else
        {
            if(alignment<=0 || (alignment & (alignment-1))!=0)
            {
                throw std::invalid_argument("error: alignment must be a positive power of two");
            }
            const size_t align = static_cast<size_t>(alignment);

#if (defined(WIN32) || defined(_WIN32) || defined(__WIN32)) && !defined(__CYGWIN__)
            // windows
            arr = static_cast<T *>(_aligned_malloc(bytes,align));
#else
            // linux
            // C11 aligned_alloc expects the size to be a multiple of the
            // alignment (some C libraries return NULL otherwise), so round up.
            if(bytes > static_cast<size_t>(-1) - (align-1))
            {
                throw std::bad_alloc();
            }
            const size_t paddedBytes = ((bytes + align - 1)/align)*align;
            arr = static_cast<T *>(aligned_alloc(align,paddedBytes));
#endif
            // A zero-byte request may legitimately yield a null pointer.
            if(arr==nullptr && bytes!=0)
            {
                throw std::bad_alloc();
            }
        }
    }

    // Storage is released exactly once by the destructor; copying is disabled
    // to prevent a double unmap/release/free.
    AlignedCpuArray(const AlignedCpuArray &) = delete;
    AlignedCpuArray & operator=(const AlignedCpuArray &) = delete;

    // Returns the pointer to the first element for get/set access
    // (scalar access only; for vectorization, T itself needs to contain multiple data).
    T * getArray() const noexcept { return arr; }

    // Releases the storage. Failures of the OpenCL calls are reported on
    // std::cout instead of being thrown.
    ~AlignedCpuArray()
    {
        if(pinned)
        {
            // opencl unpin: unmap the host pointer, then drop the buffer
            if(CL_SUCCESS!=clEnqueueUnmapMemObject(cq,mem,arr,0,nullptr,nullptr))
            {
                std::cout<<"error: mem unmap"<<std::endl;
            }

            if(CL_SUCCESS!=clReleaseMemObject(mem))
            {
                std::cout<<"error: release mem"<<std::endl;
            }
        }
        else
        {
#if (defined(WIN32) || defined(_WIN32) || defined(__WIN32)) && !defined(__CYGWIN__)
            // windows
            if(arr!=nullptr)
                _aligned_free(arr);
#else
            // linux
            if(arr!=nullptr)
                free(arr);
#endif
        }
    }

private:
    const size_t size;      // element count (not bytes)
    bool pinned;            // true: OpenCL-backed storage, false: aligned host allocation
    cl_context ctx;         // context used for clCreateBuffer (pinned mode)
    cl_command_queue cq;    // queue used for map/unmap (pinned mode)
    cl_mem mem;             // OpenCL buffer handle; nullptr in unpinned mode
    T * arr;                // mapped pointer (pinned) or aligned allocation (unpinned)
};
#endif /* ALIGNEDCPUARRAY_H_ */