Skip to content

Commit

Permalink
Delete old CUDA benchmark code
Browse files Browse the repository at this point in the history
  • Loading branch information
tridao committed Sep 14, 2019
1 parent 9883865 commit 7685f3c
Showing 1 changed file with 0 additions and 183 deletions.
183 changes: 0 additions & 183 deletions butterfly/factor_multiply_fast/butterfly_multiply_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1480,189 +1480,6 @@ void butterfly_odo_multiply_untied_forward_backward_fast_cuda(const at::Tensor &
}

#if BFLY_BENCHMARK
// Launches the untied-forward butterfly kernel specialized for a runtime
// min_blocks_per_mp in {1, 2, 3, 4} (values outside that set silently launch
// nothing — this is benchmark-only sweep code). All other tuning knobs
// (log_n, increasing_stride, items_per_thread) are compile-time template
// parameters supplied by the dispatcher below. Runs on the current ATen
// CUDA stream; grid/block are chosen by the caller.
template <int log_n, bool increasing_stride, int items_per_thread, typename scalar_t>
void butterfly_multiply_untied_forward_fast_cuda_benchmark_logn_ipt(int min_blocks_per_mp,
                                                                    dim3 block,
                                                                    dim3 grid,
                                                                    const CudaAcsr<scalar_t, 4> twiddle_a,
                                                                    InputReader<scalar_t> input_reader,
                                                                    OutputWriter<scalar_t> output_writer,
                                                                    int batch_size) {
  auto stream = at::cuda::getCurrentCUDAStream();
  switch (min_blocks_per_mp)
  {
    #define CASE(x) case x: \
      butterfly_multiply_untied_forward_fast_cuda_kernel<log_n, increasing_stride, items_per_thread, x> \
        <<<grid, block, 0, stream>>>(twiddle_a, input_reader, output_writer, batch_size); break;
    MAP(CASE, 1, 2, 3, 4)
    #undef CASE
  }
  // Capture the launch status exactly once: cudaGetLastError() CLEARS the
  // sticky error, so the original pattern of calling it again inside the
  // failure message always printed cudaSuccess instead of the real error.
  const cudaError_t status = cudaGetLastError();
  TORCH_CHECK(status == cudaSuccess,
              "butterfly_multiply_untied_forward_fast_cuda failed with error code ",
              cudaGetErrorString(status));
}

// Maps the runtime items_per_thread value onto the matching compile-time
// specialization of the launcher above. Handles 1, 2, and 4..16; the value
// 3 is intentionally excluded from the sweep, and any unhandled value falls
// out of the switch without launching anything (benchmark-only code).
template <int log_n, bool increasing_stride, typename scalar_t>
void butterfly_multiply_untied_forward_fast_cuda_benchmark_logn(int items_per_thread,
                                                                int min_blocks_per_mp,
                                                                dim3 block,
                                                                dim3 grid,
                                                                const CudaAcsr<scalar_t, 4> twiddle_a,
                                                                InputReader<scalar_t> input_reader,
                                                                OutputWriter<scalar_t> output_writer,
                                                                int batch_size) {
  // Same CASE/MAP-style macro idiom used by the min_blocks_per_mp dispatcher.
  #define IPT_CASE(x) case x: \
    butterfly_multiply_untied_forward_fast_cuda_benchmark_logn_ipt<log_n, increasing_stride, x> \
      (min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size); break;
  switch (items_per_thread)
  {
    IPT_CASE(1)
    IPT_CASE(2)
    // IPT_CASE(3)  // excluded from the tuning sweep on purpose
    IPT_CASE(4)
    IPT_CASE(5)
    IPT_CASE(6)
    IPT_CASE(7)
    IPT_CASE(8)
    IPT_CASE(9)
    IPT_CASE(10)
    IPT_CASE(11)
    IPT_CASE(12)
    IPT_CASE(13)
    IPT_CASE(14)
    IPT_CASE(15)
    IPT_CASE(16)
  }
  #undef IPT_CASE
}

// Benchmark sweep driver for the untied-forward butterfly kernel.
// NOTE(review): every case of the switch(log_n) below is commented out, so
// this function currently performs NO kernel launches — it only computes the
// dispatch parameters. The commented cases preserve the shape of the sweep
// (per-log_n loops over items_per_thread and min_blocks_per_mp) for when it
// is re-enabled. Expects input of shape (batch, nstack, n) with n a power
// of two — presumably; TODO confirm against callers.
void butterfly_multiply_untied_forward_fast_cuda_benchmark(const at::Tensor &twiddle,
const at::Tensor &input,
at::Tensor &output,
bool increasing_stride) {
int batch_size = input.size(0);
const int nstack = input.size(1);  // currently unused while the sweep is disabled
const int n = input.size(2);
const int log_n = int(log2((double) n));
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "butterfly_multiply_untied_forward_fast_cuda", [&] {
using accscalar_t = at::acc_type<scalar_t, true>;  // unused leftover of the sweep
const auto twiddle_a = twiddle.packed_accessor<scalar_t, 4, at::RestrictPtrTraits, int32_t>();
const InputReader<scalar_t> input_reader(input);
OutputWriter<scalar_t> output_writer(output);
// One thread per butterfly element, capped at the hardware block-size limit.
dim3 block(min(n, MAX_BLOCK_SIZE));
auto stream = at::cuda::getCurrentCUDAStream();  // unused while cases are disabled
switch (log_n)
{
// case 1:
// increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<1, true>(block, grid, twiddle_a, input_reader, output_writer, batch_size)
// : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<1, false>(block, grid, twiddle_a, input_reader, output_writer, batch_size); break;
// case 2:
// increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<2, true>(block, grid, twiddle_a, input_reader, output_writer, batch_size)
// : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<2, false>(block, grid, twiddle_a, input_reader, output_writer, batch_size); break;
// case 3:
// increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<3, true>(block, grid, twiddle_a, input_reader, output_writer, batch_size)
// : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<3, false>(block, grid, twiddle_a, input_reader, output_writer, batch_size); break;
// case 4:
// increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<4, true>(block, grid, twiddle_a, input_reader, output_writer, batch_size)
// : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<4, false>(block, grid, twiddle_a, input_reader, output_writer, batch_size); break;
// case 5:
// increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<5, true>(block, grid, twiddle_a, input_reader, output_writer, batch_size)
// : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<5, false>(block, grid, twiddle_a, input_reader, output_writer, batch_size); break;
// case 6:
// increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<6, true>(block, grid, twiddle_a, input_reader, output_writer, batch_size)
// : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<6, false>(block, grid, twiddle_a, input_reader, output_writer, batch_size); break;
// case 7:
// increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<7, true>(block, grid, twiddle_a, input_reader, output_writer, batch_size)
// : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<7, false>(block, grid, twiddle_a, input_reader, output_writer, batch_size); break;
// case 8:
// for (int items_per_thread: {1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}) {
// dim3 grid(div_up(batch_size, items_per_thread), 1, nstack);
// for (int min_blocks_per_mp: {1, 2}) {
// increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<8, true>(items_per_thread, min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size)
// : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<8, false>(items_per_thread, min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size);
// }
// }
// break;
// case 9:
// for (int items_per_thread: {1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}) {
// dim3 grid(div_up(batch_size, items_per_thread), 1, nstack);
// for (int min_blocks_per_mp: {1, 2}) {
// increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<9, true>(items_per_thread, min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size)
// : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<9, false>(items_per_thread, min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size);
// }
// }
// break;
// case 10:
// for (int items_per_thread: {1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}) {
// dim3 grid(div_up(batch_size, items_per_thread), 1, nstack);
// for (int min_blocks_per_mp: {1, 2, 3, 4}) {
// increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<10, true>(items_per_thread, min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size)
// : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<10, false>(items_per_thread, min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size);
// }
// }
// break;
// case 11:
// for (int items_per_thread: {1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}) {
// dim3 grid(div_up(batch_size, items_per_thread), 1, nstack);
// for (int min_blocks_per_mp: {1, 2}) {
// increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<11, true>(items_per_thread, min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size)
// : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<11, false>(items_per_thread, min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size);
// }
// }
// break;
// case 12:
// for (int items_per_thread: {1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}) {
// dim3 grid(div_up(batch_size, items_per_thread), 1, nstack);
// for (int min_blocks_per_mp: {1, 2}) {
// increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<12, true>(items_per_thread, min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size)
// : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<12, false>(items_per_thread, min_blocks_per_mp, block, grid, twiddle_a, input_reader, output_writer, batch_size);
// }
// }
// break;
// NOTE(review): duplicate case labels 11/12 below use an older (pre-sweep)
// call signature; kept as-is since the whole switch is disabled.
// case 11:
// increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<11, true>(block, grid, twiddle_a, input_reader, output_writer, batch_size)
// : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<11, false>(block, grid, twiddle_a, input_reader, output_writer, batch_size); break;
// case 12:
// increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<12, true>(block, grid, twiddle_a, input_reader, output_writer, batch_size)
// : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<12, false>(block, grid, twiddle_a, input_reader, output_writer, batch_size); break;
// case 13:
// increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<13, true>(block, grid, twiddle_a, input_reader, output_writer, batch_size)
// : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<13, false>(block, grid, twiddle_a, input_reader, output_writer, batch_size); break;
// case 14:
// increasing_stride ? butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<14, true>(block, grid, twiddle_a, input_reader, output_writer, batch_size)
// : butterfly_multiply_untied_forward_fast_cuda_benchmark_logn<14, false>(block, grid, twiddle_a, input_reader, output_writer, batch_size); break;
}
});
}

void butterfly_odo_multiply_untied_forward_fast_cuda_benchmark(const at::Tensor &twiddle_cos,
const at::Tensor &twiddle_sin,
const at::Tensor &diagonal,
Expand Down

0 comments on commit 7685f3c

Please sign in to comment.