-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtranspose02.s
108 lines (92 loc) · 2.7 KB
/
transpose02.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
//
// Transform n c h w to c n h w
//
// Pass in two pointers
/*
template<int n, int c, int h, int w>
__global__ void Kernel(float4_t* output, float4_t* input) {
int in_id = threadIdx.x + blockIdx.y * h * w + blockIdx.z * c * h * w;
float4_t val = input[in_id];
int out_id1 = blockIdx.y * n * h * w + blockIdx.z * h * w + threadIdx.x * c;
int out_id2 = out_id1 + n * h * w;
int out_id3 = out_id2 + n * h * w;
int out_id4 = out_id3 + n * h * w;
float* output_scalar_ptr = reinterpret_cast<float*>(output);
output_scalar_ptr[out_id1] = val.x;
output_scalar_ptr[out_id2] = val.y;
output_scalar_ptr[out_id3] = val.z;
output_scalar_ptr[out_id4] = val.w;
}
*/
// 1024 float4s per workgroup
// h = 64, w = 64
// h * w = 4096
// c = 128
// n = 64
.hsa_code_object_version 2,1
.hsa_code_object_isa 9,0,0,"AMD","AMDGPU"
.p2align 8
.type hello_world,@function
.amdgpu_hsa_kernel hello_world
hello_world:
.Lfunc_begin0:
.amd_kernel_code_t
amd_code_version_major = 1
amd_code_version_minor = 1
amd_machine_kind = 1
amd_machine_version_major = 9
amd_machine_version_minor = 0
amd_machine_version_stepping = 0
granulated_workitem_vgpr_count = 5
granulated_wavefront_sgpr_count = 5
enable_sgpr_kernarg_segment_ptr = 1
float_mode = 192
enable_ieee_mode = 1
enable_trap_handler = 1
is_ptr64 = 1
compute_pgm_rsrc2_user_sgpr = 2
kernarg_segment_byte_size = 16
wavefront_sgpr_count = 10
workitem_vgpr_count = 20
enable_sgpr_workgroup_id_x = 1
.end_amd_kernel_code_t
s_load_dwordx2 s[2:3], s[0:1] 0x0
s_mov_b32 s5, s2
s_mov_b32 s6, s3
s_mov_b32 s7, s4
s_load_dwordx2 s[0:1], s[0:1] 0x8
// threadIdx.x + blockIdx.y * h * w + blockIdx.z * c * h * w
s_mul_i32 s8, 64*64, s6
s_mul_i32 s9, 128*64*64, s7
s_add_i32 s8, s8, s9
v_add_u32 v1, s8, v0
v_lshlrev_b32 v1, 4, v1
// threadIdx.x + blockIdx.y * n * h * w + blockIdx.z * h * w
s_mul_i32 s8, s6, 64*64*64
s_mul_i32 s9, s7, 64*64
s_add_i32 s9, s8, s9
v_add_u32 v2, v0, s9
v_lshlrev_b32 v2, 2, v2
s_waitcnt lgkmcnt(1)
v_mov_b32 v4, s3
v_add_co_u32 v3, vcc, v1, s2
v_addc_co_u32 v4, vcc, v4, 0, vcc
flat_load_dwordx4 v[5:5+3], v[3:4]
s_waitcnt lgkmcnt(0)
v_mov_b32 v10, s1
v_add_co_u32 v9, vcc, v2, s0
v_addc_co_u32 v10, vcc, v10, 0, vcc
v_add_co_u32 v11, vcc, v9, 64*64*64
v_addc_co_u32 v12, vcc, v10, 0, vcc
v_add_co_u32 v13, vcc, v11, 64*64*64
v_addc_co_u32 v14, vcc, v12, 0, vcc
v_add_co_u32 v15, vcc, v13, 64*64*64
v_addc_co_u32 v16, vcc, v14, 0, vcc
s_waitcnt vmcnt(0)
flat_store_dword v5, v[9:10]
flat_store_dword v6, v[11:12]
flat_store_dword v7, v[13:14]
flat_store_dword v8, v[15:16]
s_endpgm
.Lfunc_end0:
.size hello_world, .Lfunc_end0-hello_world