mat_trans_global.cpp
#include <iostream>
#include <hip/hip_runtime.h>
#include <stdio.h>
#include <cstring>
#include <ctime>
#include <omp.h>
/* Use Matrix Class! */
#include "mat.h"
#include "submat.h"
#define BLOCK_SIZE 32
__global__ void MatTransGlobalKernel(const Matrix A, Matrix transA)
{
    // Each thread transposes a single element: transA[col][row] = A[row][col].
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < A.height && col < A.width)
    {
        transA.elements[col * transA.width + row] = A.elements[row * A.width + col];
    }
}
void MatTransGlobal(const Matrix A, Matrix transA)
{
    int Gpu = 1, toDev = 1, fromDev = 2;

    // Copy A to device memory and allocate device memory for transA
    Matrix d_A(A.width, A.height, 0, Gpu);
    d_A.load(A, toDev);
    Matrix d_transA(transA.width, transA.height, 0, Gpu);

    // Round the grid up so matrix sizes that are not multiples of BLOCK_SIZE are covered
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid((A.width + dimBlock.x - 1) / dimBlock.x,
                 (A.height + dimBlock.y - 1) / dimBlock.y);

    // Time the kernel with HIP events (hipEventElapsedTime reports milliseconds)
    hipEvent_t start, stop;
    float elapsed_ms;
    hipEventCreate(&start);
    hipEventCreate(&stop);
    hipEventRecord(start, 0);
    MatTransGlobalKernel<<<dimGrid, dimBlock>>>(d_A, d_transA);
    hipEventRecord(stop, 0);
    hipEventSynchronize(stop);
    hipEventElapsedTime(&elapsed_ms, start, stop);
    std::cout << " Naive GPU MatTrans Time = " << elapsed_ms << "ms" << std::endl;

    // Read transA from device memory
    transA.load(d_transA, fromDev);

    // Free device memory
    d_A.dealloc(Gpu);
    d_transA.dealloc(Gpu);
}
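
The naive kernel above reads A with coalesced accesses but writes transA with a stride of transA.width, so the writes are uncoalesced. A common follow-up is to stage a BLOCK_SIZE x BLOCK_SIZE tile in shared memory so that both the global read and the global write are coalesced. The kernel below is a sketch of that idea, not part of the original file; it assumes only what the code above already uses (a row-major Matrix with width, height, and elements members, plus the BLOCK_SIZE macro), and the name MatTransSharedKernel is chosen here purely for illustration.

// Sketch of a shared-memory tiled transpose (assumed variant, not from the original file).
__global__ void MatTransSharedKernel(const Matrix A, Matrix transA)
{
    // Stage one BLOCK_SIZE x BLOCK_SIZE tile in shared memory; the +1 column
    // of padding avoids shared-memory bank conflicts on the transposed read.
    __shared__ float tile[BLOCK_SIZE][BLOCK_SIZE + 1];

    int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;   // column in A
    int row = blockIdx.y * BLOCK_SIZE + threadIdx.y;   // row in A
    if (row < A.height && col < A.width)
        tile[threadIdx.y][threadIdx.x] = A.elements[row * A.width + col];

    __syncthreads();

    // Swap the block coordinates for the output so that consecutive threads
    // write consecutive addresses of transA (coalesced writes).
    int outCol = blockIdx.y * BLOCK_SIZE + threadIdx.x;   // column in transA
    int outRow = blockIdx.x * BLOCK_SIZE + threadIdx.y;   // row in transA
    if (outRow < transA.height && outCol < transA.width)
        transA.elements[outRow * transA.width + outCol] = tile[threadIdx.x][threadIdx.y];
}

This variant can be launched with the same dimGrid and dimBlock computed in MatTransGlobal; on the host side only the kernel name in the launch would change.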