From 7685f3c4746e88f2243b4d8610804f44a767237b Mon Sep 17 00:00:00 2001 From: Tri Dao Date: Sat, 14 Sep 2019 12:31:38 -0700 Subject: [PATCH] Delete old CUDA benchmark code --- .../butterfly_multiply_cuda.cu | 183 ------------------ 1 file changed, 183 deletions(-) diff --git a/butterfly/factor_multiply_fast/butterfly_multiply_cuda.cu b/butterfly/factor_multiply_fast/butterfly_multiply_cuda.cu index b6336a6..c8dc18c 100644 --- a/butterfly/factor_multiply_fast/butterfly_multiply_cuda.cu +++ b/butterfly/factor_multiply_fast/butterfly_multiply_cuda.cu @@ -1480,189 +1480,6 @@ void butterfly_odo_multiply_untied_forward_backward_fast_cuda(const at::Tensor & } #if BFLY_BENCHMARK -template -void butterfly_multiply_untied_forward_fast_cuda_benchmark_logn_ipt(int min_blocks_per_mp, - dim3 block, - dim3 grid, - const CudaAcsr twiddle_a, - InputReader input_reader, - OutputWriter output_writer, - int batch_size) { - auto stream = at::cuda::getCurrentCUDAStream(); - switch (min_blocks_per_mp) - { - #define CASE(x) case x: \ - butterfly_multiply_untied_forward_fast_cuda_kernel \ - <<>>(twiddle_a, input_reader, output_writer, batch_size); break; - MAP(CASE, 1, 2, 3, 4) - #undef CASE - } - TORCH_CHECK(cudaGetLastError() == cudaSuccess, - "butterfly_multiply_untied_forward_fast_cuda failed with error code ", - cudaGetLastError()); -} - -template -void butterfly_multiply_untied_forward_fast_cuda_benchmark_logn(int items_per_thread, - int min_blocks_per_mp, - dim3 block, - dim3 grid, - const CudaAcsr twiddle_a, - InputReader input_reader, - OutputWriter output_writer, - int batch_size) { - switch (items_per_thread) - { - case 1: - butterfly_multiply_untied_forward_fast_cuda_benchmark_logn_ipt - (min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - case 2: - butterfly_multiply_untied_forward_fast_cuda_benchmark_logn_ipt - (min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - // case 3: - // butterfly_multiply_untied_forward_fast_cuda_benchmark_logn_ipt - // (min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - case 4: - butterfly_multiply_untied_forward_fast_cuda_benchmark_logn_ipt - (min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - case 5: - butterfly_multiply_untied_forward_fast_cuda_benchmark_logn_ipt - (min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - case 6: - butterfly_multiply_untied_forward_fast_cuda_benchmark_logn_ipt - (min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - case 7: - butterfly_multiply_untied_forward_fast_cuda_benchmark_logn_ipt - (min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - case 8: - butterfly_multiply_untied_forward_fast_cuda_benchmark_logn_ipt - (min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - case 9: - butterfly_multiply_untied_forward_fast_cuda_benchmark_logn_ipt - (min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - case 10: - butterfly_multiply_untied_forward_fast_cuda_benchmark_logn_ipt - (min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - case 11: - butterfly_multiply_untied_forward_fast_cuda_benchmark_logn_ipt - (min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - case 12: - butterfly_multiply_untied_forward_fast_cuda_benchmark_logn_ipt - (min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - case 13: - butterfly_multiply_untied_forward_fast_cuda_benchmark_logn_ipt - (min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - case 14: - butterfly_multiply_untied_forward_fast_cuda_benchmark_logn_ipt - (min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - case 15: - butterfly_multiply_untied_forward_fast_cuda_benchmark_logn_ipt - (min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - case 16: - butterfly_multiply_untied_forward_fast_cuda_benchmark_logn_ipt - (min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - } -} - -void butterfly_multiply_untied_forward_fast_cuda_benchmark(const at::Tensor &twiddle, - const at::Tensor &input, - at::Tensor &output, - bool increasing_stride) { - int batch_size = input.size(0); - const int nstack = input.size(1); - const int n = input.size(2); - const int log_n = int(log2((double) n)); - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "butterfly_multiply_untied_forward_fast_cuda", [&] { - using accscalar_t = at::acc_type; - const auto twiddle_a = twiddle.packed_accessor(); - const InputReader input_reader(input); - OutputWriter output_writer(output); - dim3 block(min(n, MAX_BLOCK_SIZE)); - auto stream = at::cuda::getCurrentCUDAStream(); - switch (log_n) - { - // case 1: - // increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<1, true>(block, grid, twiddle_a, input_reader, output_writer, batch_size) - // : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<1, false>(block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - // case 2: - // increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<2, true>(block, grid, twiddle_a, input_reader, output_writer, batch_size) - // : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<2, false>(block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - // case 3: - // increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<3, true>(block, grid, twiddle_a, input_reader, output_writer, batch_size) - // : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<3, false>(block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - // case 4: - // increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<4, true>(block, grid, twiddle_a, input_reader, output_writer, batch_size) - // : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<4, false>(block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - // case 5: - // increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<5, true>(block, grid, twiddle_a, input_reader, output_writer, batch_size) - // : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<5, false>(block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - // case 6: - // increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<6, true>(block, grid, twiddle_a, input_reader, output_writer, batch_size) - // : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<6, false>(block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - // case 7: - // increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<7, true>(block, grid, twiddle_a, input_reader, output_writer, batch_size) - // : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<7, false>(block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - // case 8: - // for (int items_per_thread: {1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}) { - // dim3 grid(div_up(batch_size, items_per_thread), 1, nstack); - // for (int min_blocks_per_mp: {1, 2}) { - // increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<8, true>(items_per_thread, min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size) - // : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<8, false>(items_per_thread, min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size); - // } - // } - // break; - // case 9: - // for (int items_per_thread: {1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}) { - // dim3 grid(div_up(batch_size, items_per_thread), 1, nstack); - // for (int min_blocks_per_mp: {1, 2}) { - // increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<9, true>(items_per_thread, min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size) - // : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<9, false>(items_per_thread, min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size); - // } - // } - // break; - // case 10: - // for (int items_per_thread: {1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}) { - // dim3 grid(div_up(batch_size, items_per_thread), 1, nstack); - // for (int min_blocks_per_mp: {1, 2, 3, 4}) { - // increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<10, true>(items_per_thread, min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size) - // : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<10, false>(items_per_thread, min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size); - // } - // } - // break; - // case 11: - // for (int items_per_thread: {1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}) { - // dim3 grid(div_up(batch_size, items_per_thread), 1, nstack); - // for (int min_blocks_per_mp: {1, 2}) { - // increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<11, true>(items_per_thread, min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size) - // : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<11, false>(items_per_thread, min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size); - // } - // } - // break; - // case 12: - // for (int items_per_thread: {1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}) { - // dim3 grid(div_up(batch_size, items_per_thread), 1, nstack); - // for (int min_blocks_per_mp: {1, 2}) { - // increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<12, true>(items_per_thread, min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size) - // : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<12, false>(items_per_thread, min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size); - // } - // } - // break; - // case 11: - // increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<11, true>(block, grid, twiddle_a, input_reader, output_writer, batch_size) - // : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<11, false>(block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - // case 12: - // increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<12, true>(block, grid, twiddle_a, input_reader, output_writer, batch_size) - // : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<12, false>(block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - // case 13: - // increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<13, true>(block, grid, twiddle_a, input_reader, output_writer, batch_size) - // : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<13, false>(block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - // case 14: - // increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<14, true>(block, grid, twiddle_a, input_reader, output_writer, batch_size) - // : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<14, false>(block, grid, twiddle_a, input_reader, output_writer, batch_size); break; - } - }); -} - void butterfly_odo_multiply_untied_forward_fast_cuda_benchmark(const at::Tensor &twiddle_cos, const at::Tensor &twiddle_sin, const at::Tensor &diagonal,