diff --git a/src/force/force.cu b/src/force/force.cu index 3313e9257..2329ebb24 100644 --- a/src/force/force.cu +++ b/src/force/force.cu @@ -227,18 +227,12 @@ static __global__ void gpu_sum_force(int N, double* g_fx, double* g_fy, double* __syncthreads(); #pragma unroll - for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) { + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_f[tid] += s_f[tid + offset]; } __syncthreads(); } - for (int offset = 32; offset > 0; offset >>= 1) { - if (tid < offset) { - s_f[tid] += s_f[tid + offset]; - } - __syncwarp(); - } if (tid == 0) { g_f[bid] = s_f[0]; @@ -654,18 +648,12 @@ static __global__ void gpu_sum_tensor(int N, double* g_tensor, double* g_sum_ten __syncthreads(); #pragma unroll - for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) { + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_t[tid] += s_t[tid + offset]; } __syncthreads(); } - for (int offset = 32; offset > 0; offset >>= 1) { - if (tid < offset) { - s_t[tid] += s_t[tid + offset]; - } - __syncwarp(); - } if (tid == 0) { g_sum_tensor[bid] = s_t[0]; diff --git a/src/integrate/ensemble_pimd.cu b/src/integrate/ensemble_pimd.cu index d6114f77b..7e67a18bc 100644 --- a/src/integrate/ensemble_pimd.cu +++ b/src/integrate/ensemble_pimd.cu @@ -371,7 +371,7 @@ gpu_find_momentum_beads(const int number_of_atoms, const double* g_mass, double* } __syncthreads(); - for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) { + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { for (int d = 0; d < 4; ++d) { s_momentum[d][tid] += s_momentum[d][tid + offset]; @@ -379,14 +379,6 @@ gpu_find_momentum_beads(const int number_of_atoms, const double* g_mass, double* } __syncthreads(); } - for (int offset = 32; offset > 0; offset >>= 1) { - if (tid < offset) { - for (int d = 0; d < 4; ++d) { - s_momentum[d][tid] += s_momentum[d][tid + offset]; - } - } - __syncwarp(); - } if (tid == 0) { for (int d = 0; d < 4; ++d) { diff --git a/src/integrate/langevin_utilities.cuh b/src/integrate/langevin_utilities.cuh index b1875b282..dd186d87d 100644 --- a/src/integrate/langevin_utilities.cuh +++ b/src/integrate/langevin_utilities.cuh @@ -97,18 +97,12 @@ static __global__ void gpu_find_momentum( s_momentum[tid] = momentum; __syncthreads(); - for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) { + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_momentum[tid] += s_momentum[tid + offset]; } __syncthreads(); } - for (int offset = 32; offset > 0; offset >>= 1) { - if (tid < offset) { - s_momentum[tid] += s_momentum[tid + offset]; - } - __syncwarp(); - } if (tid == 0) { device_momentum[bid] = s_momentum[0]; diff --git a/src/main_gpumd/add_random_force.cu b/src/main_gpumd/add_random_force.cu index 662d1d242..d3672a2e3 100644 --- a/src/main_gpumd/add_random_force.cu +++ b/src/main_gpumd/add_random_force.cu @@ -89,18 +89,12 @@ static __global__ void gpu_sum_force(int N, double* g_fx, double* g_fy, double* __syncthreads(); #pragma unroll - for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) { + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_f[tid] += s_f[tid + offset]; } __syncthreads(); } - for (int offset = 32; offset > 0; offset >>= 1) { - if (tid < offset) { - s_f[tid] += s_f[tid + offset]; - } - __syncwarp(); - } if (tid == 0) { device_total_force[bid] = s_f[0]; diff --git a/src/main_gpumd/electron_stop.cu b/src/main_gpumd/electron_stop.cu index 487f0d6a4..7e12e5000 100644 --- a/src/main_gpumd/electron_stop.cu +++ b/src/main_gpumd/electron_stop.cu @@ -94,18 +94,12 @@ static __global__ void find_force_average(int num_atoms, double* g_force) s_f[tid] = f; __syncthreads(); - for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) { + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_f[tid] += s_f[tid + offset]; } __syncthreads(); } - for (int offset = 32; offset > 0; offset >>= 1) { - if (tid < offset) { - s_f[tid] += s_f[tid + offset]; - } - __syncwarp(); - } if (tid == 0) { device_force_average[bid] = s_f[0] / num_atoms; diff --git a/src/main_nep/dataset.cu b/src/main_nep/dataset.cu index 4a33849cd..06ded508e 100644 --- a/src/main_nep/dataset.cu +++ b/src/main_nep/dataset.cu @@ -380,20 +380,13 @@ static __global__ void gpu_sum_force_error( } __syncthreads(); - for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) { + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_error[tid] += s_error[tid + offset]; } __syncthreads(); } - for (int offset = 32; offset > 0; offset >>= 1) { - if (tid < offset) { - s_error[tid] += s_error[tid + offset]; - } - __syncwarp(); - } - if (tid == 0) { error_gpu[bid] = s_error[0]; } @@ -456,20 +449,13 @@ gpu_get_energy_shift(int* g_Na, int* g_Na_sum, float* g_pe, float* g_pe_ref, flo } __syncthreads(); - for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) { + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_pe[tid] += s_pe[tid + offset]; } __syncthreads(); } - for (int offset = 32; offset > 0; offset >>= 1) { - if (tid < offset) { - s_pe[tid] += s_pe[tid + offset]; - } - __syncwarp(); - } - if (tid == 0) { float diff = s_pe[0] / Na - g_pe_ref[bid]; g_energy_shift[bid] = diff; @@ -492,20 +478,13 @@ static __global__ void gpu_sum_pe_error( } __syncthreads(); - for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) { + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_pe[tid] += s_pe[tid + offset]; } __syncthreads(); } - for (int offset = 32; offset > 0; offset >>= 1) { - if (tid < offset) { - s_pe[tid] += s_pe[tid + offset]; - } - __syncwarp(); - } - if (tid == 0) { float diff = s_pe[0] / Na - g_pe_ref[bid] - energy_shift; error_gpu[bid] = diff * diff; @@ -589,7 +568,7 @@ static __global__ void gpu_sum_virial_error( } __syncthreads(); - for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) { + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { for (int d = 0; d < 6; ++d) { s_virial[d * blockDim.x + tid] += s_virial[d * blockDim.x + tid + offset]; @@ -598,15 +577,6 @@ static __global__ void gpu_sum_virial_error( __syncthreads(); } - for (int offset = 32; offset > 0; offset >>= 1) { - if (tid < offset) { - for (int d = 0; d < 6; ++d) { - s_virial[d * blockDim.x + tid] += s_virial[d * blockDim.x + tid + offset]; - } - } - __syncwarp(); - } - if (tid == 0) { float error_sum = 0.0f; for (int d = 0; d < 6; ++d) { diff --git a/src/main_nep/snes.cu b/src/main_nep/snes.cu index 31f9c61f8..75a04df93 100644 --- a/src/main_nep/snes.cu +++ b/src/main_nep/snes.cu @@ -330,7 +330,7 @@ static __global__ void gpu_find_L1_L2_NEP4( } __syncthreads(); - for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) { + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_cost_L1reg[tid] += s_cost_L1reg[tid + offset]; s_cost_L2reg[tid] += s_cost_L2reg[tid + offset]; @@ -338,14 +338,6 @@ static __global__ void gpu_find_L1_L2_NEP4( __syncthreads(); } - for (int offset = 32; offset > 0; offset >>= 1) { - if (tid < offset) { - s_cost_L1reg[tid] += s_cost_L1reg[tid + offset]; - s_cost_L2reg[tid] += s_cost_L2reg[tid + offset]; - } - __syncwarp(); - } - if (tid == 0) { gpu_cost_L1reg[bid] = s_cost_L1reg[0]; gpu_cost_L2reg[bid] = s_cost_L2reg[0]; @@ -406,7 +398,7 @@ static __global__ void gpu_find_L1_L2( } __syncthreads(); - for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) { + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_cost_L1reg[tid] += s_cost_L1reg[tid + offset]; s_cost_L2reg[tid] += s_cost_L2reg[tid + offset]; @@ -414,14 +406,6 @@ static __global__ void gpu_find_L1_L2( __syncthreads(); } - for (int offset = 32; offset > 0; offset >>= 1) { - if (tid < offset) { - s_cost_L1reg[tid] += s_cost_L1reg[tid + offset]; - s_cost_L2reg[tid] += s_cost_L2reg[tid + offset]; - } - __syncwarp(); - } - if (tid == 0) { gpu_cost_L1reg[bid] = s_cost_L1reg[0]; gpu_cost_L2reg[bid] = s_cost_L2reg[0]; diff --git a/src/measure/dump_dipole.cu b/src/measure/dump_dipole.cu index 85b44efd3..a9f29f126 100644 --- a/src/measure/dump_dipole.cu +++ b/src/measure/dump_dipole.cu @@ -52,18 +52,12 @@ static __global__ void sum_dipole( // aggregate the patches in parallel #pragma unroll - for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) { + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_d[tid] += s_d[tid + offset]; } __syncthreads(); } - for (int offset = 32; offset > 0; offset >>= 1) { - if (tid < offset) { - s_d[tid] += s_d[tid + offset]; - } - __syncwarp(); - } // save the final value if (tid == 0) { diff --git a/src/measure/dump_polarizability.cu b/src/measure/dump_polarizability.cu index cb708ff1e..a601b180b 100644 --- a/src/measure/dump_polarizability.cu +++ b/src/measure/dump_polarizability.cu @@ -56,18 +56,12 @@ static __global__ void sum_polarizability( // aggregate the patches in parallel #pragma unroll - for (int offset = blockDim.x >> 1; offset > 32; offset >>= 1) { + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { if (tid < offset) { s_p[tid] += s_p[tid + offset]; } __syncthreads(); } - for (int offset = 32; offset > 0; offset >>= 1) { - if (tid < offset) { - s_p[tid] += s_p[tid + offset]; - } - __syncwarp(); - } // save the final value if (tid == 0) { diff --git a/tests/gpumd/carbon/thermo1.out b/tests/gpumd/carbon/thermo1.out index 61a0da2cf..05c89c9a8 100644 --- a/tests/gpumd/carbon/thermo1.out +++ b/tests/gpumd/carbon/thermo1.out @@ -1,10 +1,10 @@ - 3.0070239595e+02 2.4876054593e+03 -1.6374740626e+05 -4.9616458347e-01 -1.6912919950e-01 3.5673276051e-03 2.4386945299e-03 -2.6912030640e-01 1.0069715166e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01 - 3.0028361711e+02 2.4841410488e+03 -1.6374398830e+05 -5.5494401174e-01 -8.7591048145e-02 -5.0432348237e-03 -2.3679409769e-02 -2.4354585861e-01 1.9885808946e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01 - 3.0090760001e+02 2.4893030406e+03 -1.6374911269e+05 -4.8069691755e-01 -9.9976336074e-02 6.1171437172e-03 5.0698276414e-02 -2.6632055902e-01 1.7746335386e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01 - 2.9928310328e+02 2.4758641489e+03 -1.6373553821e+05 -5.3300657578e-01 -8.9159184288e-02 1.7190277772e-02 -1.7661720635e-02 -2.8009973914e-01 1.3277642913e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01 - 3.0056896760e+02 2.4865016534e+03 -1.6374628179e+05 -4.6676187670e-01 -1.9398202792e-01 6.0657253169e-02 2.4843680762e-02 -2.6412937631e-01 1.2556339662e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01 - 3.0177745121e+02 2.4964990145e+03 -1.6375641970e+05 -5.7855570392e-01 -1.3026857442e-01 -2.4558633045e-02 3.1861800608e-02 -3.1819298120e-01 1.3576836171e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01 - 2.9932973117e+02 2.4762498851e+03 -1.6373603799e+05 -5.4164328240e-01 -1.2870624839e-01 7.3976663441e-02 -3.7364902470e-03 -2.4645646405e-01 1.1570002894e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01 - 3.0019241526e+02 2.4833865679e+03 -1.6374314826e+05 -5.7517205747e-01 -1.9081967489e-01 3.0024095278e-02 3.6496184974e-03 -2.9928554061e-01 1.2112666925e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01 - 3.0102978417e+02 2.4903138273e+03 -1.6375013168e+05 -5.4688206625e-01 -1.4423854074e-01 -2.9289295474e-02 -6.9361765237e-02 -2.9886508948e-01 1.1227931829e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01 - 3.0053797977e+02 2.4862453020e+03 -1.6374602890e+05 -4.7482065484e-01 -7.5619589181e-02 2.2007463507e-02 -5.4715753719e-02 -3.1145417781e-01 1.2338681735e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01 + 3.0070239696e+02 2.4876054677e+03 -1.6374740681e+05 -4.9616458721e-01 -1.6912905599e-01 3.5674571088e-03 2.4386255848e-03 -2.6912030874e-01 1.0069714895e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01 + 3.0028361454e+02 2.4841410276e+03 -1.6374398954e+05 -5.5494394538e-01 -8.7591274243e-02 -5.0432026470e-03 -2.3679457451e-02 -2.4354571568e-01 1.9885801505e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01 + 3.0090759673e+02 2.4893030134e+03 -1.6374911288e+05 -4.8069681517e-01 -9.9975851141e-02 6.1175898070e-03 5.0698137114e-02 -2.6632069655e-01 1.7746338551e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01 + 2.9928310219e+02 2.4758641398e+03 -1.6373553882e+05 -5.3300646757e-01 -8.9159458064e-02 1.7190636806e-02 -1.7661811588e-02 -2.8009998150e-01 1.3277622094e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01 + 3.0056897132e+02 2.4865016842e+03 -1.6374628142e+05 -4.6676182293e-01 -1.9398195233e-01 6.0657145328e-02 2.4843735679e-02 -2.6412948829e-01 1.2556328772e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01 + 3.0177745152e+02 2.4964990170e+03 -1.6375641977e+05 -5.7855531452e-01 -1.3026849337e-01 -2.4558613392e-02 3.1861445090e-02 -3.1819302526e-01 1.3576828877e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01 + 2.9932973734e+02 2.4762499361e+03 -1.6373603818e+05 -5.4164297247e-01 -1.2870619654e-01 7.3976383615e-02 -3.7365440902e-03 -2.4645624599e-01 1.1570049569e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01 + 3.0019240941e+02 2.4833865196e+03 -1.6374314724e+05 -5.7517215884e-01 -1.9081958194e-01 3.0024138347e-02 3.6494197887e-03 -2.9928541140e-01 1.2112686234e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01 + 3.0102978268e+02 2.4903138149e+03 -1.6375013226e+05 -5.4688259664e-01 -1.4423825322e-01 -2.9289226679e-02 -6.9361762204e-02 -2.9886511551e-01 1.1227907599e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01 + 3.0053797919e+02 2.4862452971e+03 -1.6374602908e+05 -4.7481977198e-01 -7.5619344922e-02 2.2007601622e-02 -5.4715715225e-02 -3.1145404686e-01 1.2338681910e-01 7.5200000000e+01 7.5200000000e+01 7.5200000000e+01