ROCm: wip: clean up nearly all of the warnings when compiling for AMD (issue #99)
stephanecharette committed Jan 13, 2025
1 parent 580b785 commit bf93109
Showing 17 changed files with 90 additions and 64 deletions.
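
Most of the fixes below follow a handful of patterns: CUDA runtime calls whose cudaError_t result was silently dropped are now wrapped in the project's CHECK_CUDA macro, switches over ACTIVATION get an explicit default, and locals that are computed but never used are commented out. As a hedged sketch of what a CHECK_CUDA-style wrapper does (the real macro is defined alongside src-lib/dark_cuda.cpp and forwards to the project's error handler, so details differ):

	// Hedged sketch only; the project's actual CHECK_CUDA macro differs in detail.
	#include <cstdio>
	#include <cstdlib>
	#include <cuda_runtime.h>

	#define CHECK_CUDA_SKETCH(call)                                                 \
		do {                                                                        \
			const cudaError_t status_ = (call);                                     \
			if (status_ != cudaSuccess)                                             \
			{                                                                       \
				std::fprintf(stderr, "CUDA error \"%s\" at %s:%d\n",                \
						cudaGetErrorString(status_), __FILE__, __LINE__);           \
				std::abort();                                                       \
			}                                                                       \
		} while (0)

	// usage: CHECK_CUDA_SKETCH(cudaStreamSynchronize(get_cuda_stream()));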
6 changes: 5 additions & 1 deletion CM_source.cmake
@@ -50,9 +50,13 @@ IF (UNIX)

ADD_COMPILE_OPTIONS (-Wall) # enable "all" warnings
ADD_COMPILE_OPTIONS (-Wextra) # enable even more warnings
ADD_COMPILE_OPTIONS (-Wshadow=local) # enable warnings when duplicate variables are created with the same name
ADD_COMPILE_OPTIONS (-Wno-unused-parameter) # don't report this error

IF (NOT DARKNET_USE_ROCM)
# The compiler we switch to when using ROCm/HIP complains about this option.
ADD_COMPILE_OPTIONS (-Wshadow=local) # enable warnings when duplicate variables are created with the same name
ENDIF ()
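
The =local spelling of -Wshadow is a GCC extension; the clang-based hipcc used for ROCm builds complains about it, hence the guard above. For reference, a hypothetical example of the kind of code -Wshadow=local reports:

	// Hypothetical example; -Wshadow=local (GCC spelling) warns on the inner 'total'.
	int sum_of_squares(int n)
	{
		int total = 0;
		for (int i = 0; i < n; ++i)
		{
			int total = i * i;	// warning: declaration of 'total' shadows a previous local
			(void)total;
		}
		return total;	// still the outer 'total', which the loop never updated
	}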

IF (CMAKE_BUILD_TYPE MATCHES DEBUG OR
CMAKE_BUILD_TYPE MATCHES Debug OR
CMAKE_BUILD_TYPE MATCHES debug)
14 changes: 8 additions & 6 deletions src-lib/activation_kernels.cu
@@ -118,6 +118,7 @@ __device__ float activate_kernel(float x, ACTIVATION a)
return hardtan_activate_kernel(x);
case LHTAN:
return lhtan_activate_kernel(x);
default: break;
}
return 0;
}
@@ -159,6 +160,7 @@ __device__ float gradient_kernel(float x, ACTIVATION a)
return hardtan_gradient_kernel(x);
case LHTAN:
return lhtan_gradient_kernel(x);
default: break;
}
return 0;
}
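
These default: break; additions cover the ACTIVATION enumerators the switch deliberately ignores, which is what the compiler's switch warnings were flagging. A minimal illustration with a hypothetical enum:

	// Minimal illustration, hypothetical enum: without the default case,
	// -Wall/-Wextra switch warnings list every enumerator the switch omits.
	enum SKETCH_ACTIVATION { SKETCH_RELU, SKETCH_LHTAN, SKETCH_GELU };

	float activate_sketch(float x, SKETCH_ACTIVATION a)
	{
		switch (a)
		{
			case SKETCH_RELU:	return x > 0.0f ? x : 0.0f;
			case SKETCH_LHTAN:	return x;	// placeholder body
			default:			break;		// SKETCH_GELU and any future value fall through
		}
		return 0;
	}

The trade-off is that a default case also silences the warning for enumerators added later, so new activations have to be wired in by hand.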
@@ -513,7 +515,7 @@ void activate_array_swish_ongpu(float *x, int n, float *output_sigmoid_gpu, floa
{
TAT(TATPARMS);

const int num_blocks = get_number_of_blocks(n, BLOCK);
// const int num_blocks = get_number_of_blocks(n, BLOCK);
activate_array_swish_kernel <<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>>(x, n, output_sigmoid_gpu, output_gpu);
CHECK_CUDA(cudaPeekAtLastError());
}
@@ -522,7 +524,7 @@ void activate_array_mish_ongpu(float *x, int n, float *activation_input_gpu, flo
{
TAT(TATPARMS);

const int num_blocks = get_number_of_blocks(n, BLOCK);
// const int num_blocks = get_number_of_blocks(n, BLOCK);
activate_array_mish_kernel <<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>>(x, n, activation_input_gpu, output_gpu);
CHECK_CUDA(cudaPeekAtLastError());
}
@@ -531,7 +533,7 @@ void activate_array_hard_mish_ongpu(float *x, int n, float *activation_input_gpu
{
TAT(TATPARMS);

const int num_blocks = get_number_of_blocks(n, BLOCK);
// const int num_blocks = get_number_of_blocks(n, BLOCK);
activate_array_hard_mish_kernel <<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>>(x, n, activation_input_gpu, output_gpu);
CHECK_CUDA(cudaPeekAtLastError());
}
@@ -566,7 +568,7 @@ void gradient_array_swish_ongpu(float *x, int n, float *sigmoid_gpu, float *delt
{
TAT(TATPARMS);

const int num_blocks = get_number_of_blocks(n, BLOCK);
// const int num_blocks = get_number_of_blocks(n, BLOCK);
gradient_array_swish_kernel <<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>> (x, n, sigmoid_gpu, delta);
CHECK_CUDA(cudaPeekAtLastError());
}
@@ -575,7 +577,7 @@ void gradient_array_mish_ongpu(int n, float *activation_input_gpu, float *delta)
{
TAT(TATPARMS);

const int num_blocks = get_number_of_blocks(n, BLOCK);
// const int num_blocks = get_number_of_blocks(n, BLOCK);
gradient_array_mish_kernel <<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>> (n, activation_input_gpu, delta);
CHECK_CUDA(cudaPeekAtLastError());
}
@@ -584,7 +586,7 @@ void gradient_array_hard_mish_ongpu(int n, float *activation_input_gpu, float *d
{
TAT(TATPARMS);

const int num_blocks = get_number_of_blocks(n, BLOCK);
// const int num_blocks = get_number_of_blocks(n, BLOCK);
gradient_array_hard_mish_kernel <<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>> (n, activation_input_gpu, delta);
CHECK_CUDA(cudaPeekAtLastError());
}
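
The commented-out num_blocks locals above (and the dropout_sum and cur_scale values in the files below) were computed but never used; the launches go through cuda_gridsize(n) instead, so hipcc flags them as unused variables. Commenting the line out is the simplest fix; an alternative sketch, assuming the toolchain accepts C++17 attributes here, keeps the line alive without the warning:

	// Alternative (hypothetical) to commenting the line out:
	[[maybe_unused]] const int num_blocks = get_number_of_blocks(n, BLOCK);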
20 changes: 10 additions & 10 deletions src-lib/blas_kernels.cu
@@ -1927,7 +1927,7 @@ __global__ void stretch_weights_kernel(const float *src_weight_gpu, float *weig
const int x_c = kernel_size / 2;
const int y_c = kernel_size / 2;

float dropout_sum = 0;
// float dropout_sum = 0;

for (int y = 0; y < kernel_size; ++y) {
for (int x = 0; x < kernel_size; ++x) {
@@ -1951,16 +1951,16 @@ __global__ void stretch_weights_kernel(const float *src_weight_gpu, float *weig

float val = 0;
if (x_0 >= 0 && x_0 < kernel_size && y_0 >= 0 && y_0 < kernel_size) val += src_weight_gpu[x_0 + y_0*kernel_size + i] * c_x_0 * c_y_0;
else dropout_sum += c_x_0 * c_y_0;
// else dropout_sum += c_x_0 * c_y_0;

if (x_1 >= 0 && x_1 < kernel_size && y_0 >= 0 && y_0 < kernel_size) val += src_weight_gpu[x_1 + y_0*kernel_size + i] * c_x_1 * c_y_0;
else dropout_sum += c_x_1 * c_y_0;
// else dropout_sum += c_x_1 * c_y_0;

if (x_0 >= 0 && x_0 < kernel_size && y_1 >= 0 && y_1 < kernel_size) val += src_weight_gpu[x_0 + y_1*kernel_size + i] * c_x_0 * c_y_1;
else dropout_sum += c_x_0 * c_y_1;
// else dropout_sum += c_x_0 * c_y_1;

if (x_1 >= 0 && x_1 < kernel_size && y_1 >= 0 && y_1 < kernel_size) val += src_weight_gpu[x_1 + y_1*kernel_size + i] * c_x_1 * c_y_1;
else dropout_sum += c_x_1 * c_y_1;
// else dropout_sum += c_x_1 * c_y_1;

weight_deform_gpu[x + y*kernel_size + i] = val;
}
@@ -2216,7 +2216,7 @@ __global__ void stretch_sway_flip_weights_kernel(const float *src_weight_gpu, f
const int x_c = kernel_size / 2;
const int y_c = kernel_size / 2;

float dropout_sum = 0;
// float dropout_sum = 0;

for (int y = 0; y < kernel_size; ++y) {
for (int x = 0; x < kernel_size; ++x) {
@@ -2240,16 +2240,16 @@ __global__ void stretch_sway_flip_weights_kernel(const float *src_weight_gpu, f

float val = 0;
if (x_0 >= 0 && x_0 < kernel_size && y_0 >= 0 && y_0 < kernel_size) val += src_weight_gpu[x_0 + y_0*kernel_size + i] * c_x_0 * c_y_0;
else dropout_sum += c_x_0 * c_y_0;
// else dropout_sum += c_x_0 * c_y_0;

if (x_1 >= 0 && x_1 < kernel_size && y_0 >= 0 && y_0 < kernel_size) val += src_weight_gpu[x_1 + y_0*kernel_size + i] * c_x_1 * c_y_0;
else dropout_sum += c_x_1 * c_y_0;
// else dropout_sum += c_x_1 * c_y_0;

if (x_0 >= 0 && x_0 < kernel_size && y_1 >= 0 && y_1 < kernel_size) val += src_weight_gpu[x_0 + y_1*kernel_size + i] * c_x_0 * c_y_1;
else dropout_sum += c_x_0 * c_y_1;
// else dropout_sum += c_x_0 * c_y_1;

if (x_1 >= 0 && x_1 < kernel_size && y_1 >= 0 && y_1 < kernel_size) val += src_weight_gpu[x_1 + y_1*kernel_size + i] * c_x_1 * c_y_1;
else dropout_sum += c_x_1 * c_y_1;
// else dropout_sum += c_x_1 * c_y_1;

weight_deform_gpu[x + y*kernel_size + i] = val;
}
6 changes: 3 additions & 3 deletions src-lib/convolutional_kernels.cu
@@ -1207,8 +1207,8 @@ void assisted_excitation_forward_gpu(Darknet::Layer & l, Darknet::NetworkState s
if (0) // visualize ground truth
{
cuda_pull_array(l.output_gpu, l.output, l.outputs * l.batch);
cudaStreamSynchronize(get_cuda_stream());
CHECK_CUDA(cudaPeekAtLastError());
CHECK_CUDA(cudaStreamSynchronize(get_cuda_stream()));
// CHECK_CUDA(cudaPeekAtLastError());

for (b = 0; b < l.batch; ++b)
{
@@ -1262,7 +1262,7 @@ void pull_convolutional_layer(Darknet::Layer & l)
cuda_pull_array_async(l.v_gpu, l.v, l.nweights);
}
CHECK_CUDA(cudaPeekAtLastError());
cudaStreamSynchronize(get_cuda_stream());
CHECK_CUDA(cudaStreamSynchronize(get_cuda_stream()));
}

void push_convolutional_layer(Darknet::Layer & l)
5 changes: 4 additions & 1 deletion src-lib/crnn_layer.cpp
@@ -124,7 +124,10 @@ void resize_crnn_layer(Darknet::Layer *l, int w, int h)
l->state = (float*)xrealloc(l->state, l->batch*l->hidden*(l->steps + 1)*sizeof(float));

#ifdef DARKNET_GPU
if (l->state_gpu) cudaFree(l->state_gpu);
if (l->state_gpu)
{
CHECK_CUDA(cudaFree(l->state_gpu));
}
l->state_gpu = cuda_make_array(l->state, l->batch*l->hidden*(l->steps + 1));

l->output_gpu = l->output_layer->output_gpu;
6 changes: 3 additions & 3 deletions src-lib/dark_cuda.cpp
@@ -258,7 +258,7 @@ void cublas_check_error(cublasStatus_t status)
#endif
if (cuda_debug_sync)
{
cudaDeviceSynchronize();
CHECK_CUDA(cudaDeviceSynchronize());
}
if (status != CUBLAS_STATUS_SUCCESS)
{
@@ -703,7 +703,7 @@ void cuda_pull_array(float *x_gpu, float *x, size_t n)
//printf("cuda_pull_array - get_cuda_stream() = %d \n", get_cuda_stream());
cudaError_t status = cudaMemcpyAsync(x, x_gpu, size, cudaMemcpyDeviceToHost, get_cuda_stream());
CHECK_CUDA(status);
cudaStreamSynchronize(get_cuda_stream());
CHECK_CUDA(cudaStreamSynchronize(get_cuda_stream()));
}

void cuda_pull_array_async(float *x_gpu, float *x, size_t n)
@@ -806,7 +806,7 @@ void show_cuda_cudnn_info()
for (int idx = 0; idx < device_count; idx ++)
{
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, idx);
CHECK_CUDA(cudaGetDeviceProperties(&prop, idx));
std::cout
<< "=> " << idx << ": " << Darknet::in_colour(Darknet::EColour::kBrightGreen, prop.name)
<< " [#" << prop.major << "." << prop.minor << "]"
2 changes: 1 addition & 1 deletion src-lib/darknet_cfg.cpp
@@ -1151,7 +1151,7 @@ Darknet::Network & Darknet::CfgFile::create_network(int batch, int time_steps)
}
else
{
cudaGetLastError(); // reset CUDA-error
std::ignore = cudaGetLastError(); // reset CUDA-error
net.input_pinned_cpu = (float*)xcalloc(size, sizeof(float));
}
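
Assigning to std::ignore is the idiomatic way to discard a return value on purpose; cudaGetLastError() is called here only to clear CUDA's sticky error state, not for its result. A minimal self-contained sketch:

	// Minimal sketch: deliberately discard a return value without a warning.
	#include <tuple>	// std::ignore
	#include <cuda_runtime.h>

	void clear_sticky_cuda_error()
	{
		std::ignore = cudaGetLastError();	// resets the sticky error state
	}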

4 changes: 2 additions & 2 deletions src-lib/darknet_network.cpp
@@ -618,7 +618,7 @@ int resize_network(Darknet::Network * net, int w, int h)
{
if (net->input_pinned_cpu_flag)
{
cudaFreeHost(net->input_pinned_cpu);
CHECK_CUDA(cudaFreeHost(net->input_pinned_cpu));
}
else
{
@@ -1623,7 +1623,7 @@ void free_network(Darknet::Network & net)
// CPU
if (net.input_pinned_cpu_flag)
{
cudaFreeHost(net.input_pinned_cpu);
CHECK_CUDA(cudaFreeHost(net.input_pinned_cpu));
}
else
{
5 changes: 3 additions & 2 deletions src-lib/dropout_layer.cpp
@@ -67,8 +67,9 @@ void resize_dropout_layer(Darknet::Layer *l, int inputs)
cuda_free(l->rand_gpu);
l->rand_gpu = cuda_make_array(l->rand, l->inputs*l->batch);

if (l->dropblock) {
cudaFreeHost(l->drop_blocks_scale);
if (l->dropblock)
{
CHECK_CUDA(cudaFreeHost(l->drop_blocks_scale));
l->drop_blocks_scale = cuda_make_array_pinned(l->rand, l->batch);

cuda_free(l->drop_blocks_scale_gpu);
13 changes: 7 additions & 6 deletions src-lib/dropout_layer_kernels.cu
@@ -128,11 +128,12 @@ void forward_dropout_layer_gpu(Darknet::Layer & l, Darknet::NetworkState state)
multiplier = (iteration_num / (float)(state.net.max_batches*0.85));

// dropblock
if (l.dropblock) {
if (l.dropblock)
{
//l.probability = 1 / keep_prob
//const int max_blocks_per_channel = 10;
const float cur_prob = l.probability * multiplier;
const float cur_scale = 1.f / (1.f - cur_prob);
// const float cur_scale = 1.f / (1.f - cur_prob);

int block_width = l.dropblock_size_abs *multiplier;
int block_height = l.dropblock_size_abs *multiplier;
@@ -238,8 +239,8 @@ void backward_dropout_layer_gpu(Darknet::Layer & l, Darknet::NetworkState state)
if (iteration_num < (state.net.max_batches*0.85))
multiplier = (iteration_num / (float)(state.net.max_batches*0.85));

const float cur_prob = l.probability * multiplier;
const float cur_scale = 1.f / (1.f - cur_prob);
// const float cur_prob = l.probability * multiplier;
// const float cur_scale = 1.f / (1.f - cur_prob);

int block_width = l.dropblock_size_abs * multiplier;
int block_height = l.dropblock_size_abs * multiplier;
@@ -255,8 +256,8 @@ void backward_dropout_layer_gpu(Darknet::Layer & l, Darknet::NetworkState state)
block_width = min_val_cmp(l.w, block_width);
block_height = min_val_cmp(l.h, block_height);

const int block_size = min_val_cmp(block_width, block_height);
const float block_prob = cur_prob / (block_size*block_size);
// const int block_size = min_val_cmp(block_width, block_height);
// const float block_prob = cur_prob / (block_size*block_size);

//fill_ongpu(l.outputs * l.batch, 1, state.delta, 1); // remove!!!

8 changes: 4 additions & 4 deletions src-lib/gaussian_yolo_layer.cpp
@@ -414,14 +414,14 @@ Darknet::Layer make_gaussian_yolo_layer(int batch, int w, int h, int n, int tota
free(l.output);
if (cudaSuccess == cudaHostAlloc((void**)&l.output, batch*l.outputs * sizeof(float), cudaHostRegisterMapped)) l.output_pinned = 1;
else {
cudaGetLastError(); // reset CUDA-error
std::ignore = cudaGetLastError(); // reset CUDA-error
l.output = (float*)calloc(batch * l.outputs, sizeof(float));
}

free(l.delta);
if (cudaSuccess == cudaHostAlloc((void**)&l.delta, batch*l.outputs * sizeof(float), cudaHostRegisterMapped)) l.delta_pinned = 1;
else {
cudaGetLastError(); // reset CUDA-error
std::ignore = cudaGetLastError(); // reset CUDA-error
l.delta = (float*)calloc(batch * l.outputs, sizeof(float));
}

@@ -455,7 +455,7 @@ void resize_gaussian_yolo_layer(Darknet::Layer *l, int w, int h)
if (l->output_pinned) {
CHECK_CUDA(cudaFreeHost(l->output));
if (cudaSuccess != cudaHostAlloc((void**)&l->output, l->batch*l->outputs * sizeof(float), cudaHostRegisterMapped)) {
cudaGetLastError(); // reset CUDA-error
std::ignore = cudaGetLastError(); // reset CUDA-error
l->output = (float*)calloc(l->batch * l->outputs, sizeof(float));
l->output_pinned = 0;
}
@@ -464,7 +464,7 @@ void resize_gaussian_yolo_layer(Darknet::Layer *l, int w, int h)
if (l->delta_pinned) {
CHECK_CUDA(cudaFreeHost(l->delta));
if (cudaSuccess != cudaHostAlloc((void**)&l->delta, l->batch*l->outputs * sizeof(float), cudaHostRegisterMapped)) {
cudaGetLastError(); // reset CUDA-error
std::ignore = cudaGetLastError(); // reset CUDA-error
l->delta = (float*)calloc(l->batch * l->outputs, sizeof(float));
l->delta_pinned = 0;
}
5 changes: 3 additions & 2 deletions src-lib/gemm.cpp
@@ -2670,9 +2670,10 @@ void time_ongpu(int TA, int TB, int m, int k, int n)

int i;
clock_t start = clock(), end;
for(i = 0; i<iter; ++i){
for (i = 0; i<iter; ++i)
{
gemm_ongpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n);
cudaDeviceSynchronize();
CHECK_CUDA(cudaDeviceSynchronize());
}
double flop = ((double)m)*n*(2.*k + 2.)*iter;
double gflop = flop/pow(10., 9);
2 changes: 2 additions & 0 deletions src-lib/im2col_kernels.cu
@@ -62,6 +62,7 @@ __global__ void im2col_gpu_kernel(const int n, const float* data_im,
//data_im[(channel_in * height + h_in) * width + w_in + i * width + j];
//(*data_col_ptr) = data_im_ptr[ii * width + jj];

/// @todo V3 should this be "*data_col_ptr" instead? Otherwise, what exactly are we incrementing here? (answer: the pointer...but why?)
data_col_ptr += height_col * width_col;
}
}
@@ -126,6 +127,7 @@ __global__ void im2col_align_gpu_kernel(const int n, const float* data_im,
int out_index = (channel_out + i*ksize + j) * bit_align + pre_out_index;// h_out * width_col + w_out;
data_col[out_index] = val;

/// @todo V3 should this be "*data_col_ptr" instead? Otherwise, what exactly are we incrementing here? (answer: the pointer...but why?)
data_col_ptr += bit_align;
}
}
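
For context on the two @todo notes above: in the first kernel, advancing data_col_ptr itself appears intentional. Under the assumed im2col layout, each kernel-window position (i, j) owns a whole height_col * width_col plane of the column buffer, so after writing one element the pointer jumps a full plane to reach the same output pixel in the next (i, j) slot. In the second kernel the store goes through a separately computed out_index, so the pointer increment there may simply be vestigial. A rough sketch of the stride pattern, under that layout assumption:

	// Rough sketch, assuming one plane per kernel-window element (i, j):
	// the pointer strides by a whole plane between writes.
	void fill_column_sketch(float * data_col, int ksize, int height_col, int width_col)
	{
		float * col_ptr = data_col;
		for (int i = 0; i < ksize; ++i)
		{
			for (int j = 0; j < ksize; ++j)
			{
				*col_ptr = 0.0f;					// the value for this (i, j)
				col_ptr += height_col * width_col;	// jump to the next (i, j) plane
			}
		}
	}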
4 changes: 2 additions & 2 deletions src-lib/layer.cpp
@@ -189,13 +189,13 @@ void free_layer_custom(Darknet::Layer & l, int keep_cudnn_desc)
#ifdef DARKNET_GPU
if (l.delta && l.delta_pinned)
{
cudaFreeHost(l.delta);
CHECK_CUDA(cudaFreeHost(l.delta));
l.delta = nullptr;
}

if (l.output && l.output_pinned)
{
cudaFreeHost(l.output);
CHECK_CUDA(cudaFreeHost(l.output));
l.output = nullptr;
}
#endif // DARKNET_GPU
4 changes: 2 additions & 2 deletions src-lib/network_kernels.cu
@@ -85,7 +85,7 @@ void forward_network_gpu(Darknet::Network & net, Darknet::NetworkState state)

if(net.wait_stream)
{
cudaStreamSynchronize(get_cuda_stream());
CHECK_CUDA(cudaStreamSynchronize(get_cuda_stream()));
}
state.input = l.output_gpu;
//cudaDeviceSynchronize();
@@ -149,7 +149,7 @@ void backward_network_gpu(Darknet::Network & net, Darknet::NetworkState state)
avg_time_per_layer = (time_benchmark_layers *)calloc(net.n, sizeof(time_benchmark_layers));
sorted_avg_time_per_layer = (time_benchmark_layers *)calloc(net.n, sizeof(time_benchmark_layers));
}
cudaDeviceSynchronize();
CHECK_CUDA(cudaDeviceSynchronize());
}

state.workspace = net.workspace;
8 changes: 4 additions & 4 deletions src-lib/yolo_layer.cpp
@@ -443,7 +443,7 @@ Darknet::Layer make_yolo_layer(int batch, int w, int h, int n, int total, int *m
}
else
{
cudaGetLastError(); // reset CUDA-error
std::ignore = cudaGetLastError(); // reset CUDA-error
l.output = (float*)xcalloc(batch * l.outputs, sizeof(float));
}

@@ -454,7 +454,7 @@ Darknet::Layer make_yolo_layer(int batch, int w, int h, int n, int total, int *m
}
else
{
cudaGetLastError(); // reset CUDA-error
std::ignore = cudaGetLastError(); // reset CUDA-error
l.delta = (float*)xcalloc(batch * l.outputs, sizeof(float));
}
#endif
@@ -483,7 +483,7 @@ void resize_yolo_layer(Darknet::Layer * l, int w, int h)
if (l->output_pinned) {
CHECK_CUDA(cudaFreeHost(l->output));
if (cudaSuccess != cudaHostAlloc((void**)&l->output, l->batch*l->outputs * sizeof(float), cudaHostRegisterMapped)) {
cudaGetLastError(); // reset CUDA-error
std::ignore = cudaGetLastError(); // reset CUDA-error
l->output = (float*)xcalloc(l->batch * l->outputs, sizeof(float));
l->output_pinned = 0;
}
@@ -492,7 +492,7 @@ void resize_yolo_layer(Darknet::Layer * l, int w, int h)
if (l->delta_pinned) {
CHECK_CUDA(cudaFreeHost(l->delta));
if (cudaSuccess != cudaHostAlloc((void**)&l->delta, l->batch*l->outputs * sizeof(float), cudaHostRegisterMapped)) {
cudaGetLastError(); // reset CUDA-error
std::ignore = cudaGetLastError(); // reset CUDA-error
l->delta = (float*)xcalloc(l->batch * l->outputs, sizeof(float));
l->delta_pinned = 0;
}
42 changes: 27 additions & 15 deletions src-lib/yolo_v2_class.cpp
@@ -71,7 +71,7 @@ int get_device_count()

#ifdef DARKNET_GPU
int count = 0;
cudaGetDeviceCount(&count);
CHECK_CUDA(cudaGetDeviceCount(&count));
return count;
#else
return -1;
@@ -114,7 +114,7 @@ int get_device_name(int gpu, char* deviceName)

#ifdef DARKNET_GPU
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, gpu);
CHECK_CUDA(cudaGetDeviceProperties(&prop, gpu));
std::string result = prop.name;
std::copy(result.begin(), result.end(), deviceName);
return 1;
@@ -213,14 +213,14 @@ Detector::~Detector()

#ifdef DARKNET_GPU
int old_gpu_index;
cudaGetDevice(&old_gpu_index);
CHECK_CUDA(cudaGetDevice(&old_gpu_index));
cuda_set_device(detector_gpu.net.gpu_index);
#endif

free_network(detector_gpu.net);

#ifdef DARKNET_GPU
cudaSetDevice(old_gpu_index);
CHECK_CUDA(cudaSetDevice(old_gpu_index));
#endif
}

@@ -291,9 +291,11 @@ std::vector<bbox_t> Detector::detect(image_t img, float thresh, bool use_mean)
Darknet::Network & net = detector_gpu.net;
#ifdef DARKNET_GPU
int old_gpu_index;
cudaGetDevice(&old_gpu_index);
if(cur_gpu_id != old_gpu_index)
cudaSetDevice(net.gpu_index);
CHECK_CUDA(cudaGetDevice(&old_gpu_index));
if (cur_gpu_id != old_gpu_index)
{
CHECK_CUDA(cudaSetDevice(net.gpu_index));
}

net.wait_stream = wait_stream; // 1 - wait CUDA-stream, 0 - not to wait
#endif
@@ -370,7 +372,9 @@ std::vector<bbox_t> Detector::detect(image_t img, float thresh, bool use_mean)

#ifdef DARKNET_GPU
if (cur_gpu_id != old_gpu_index)
cudaSetDevice(old_gpu_index);
{
CHECK_CUDA(cudaSetDevice(old_gpu_index));
}
#endif

return bbox_vec;
@@ -384,9 +388,11 @@ std::vector<std::vector<bbox_t>> Detector::detectBatch(image_t img, int batch_si
Darknet::Network net = detector_gpu.net;
#ifdef DARKNET_GPU
int old_gpu_index;
cudaGetDevice(&old_gpu_index);
if(cur_gpu_id != old_gpu_index)
cudaSetDevice(net.gpu_index);
CHECK_CUDA(cudaGetDevice(&old_gpu_index));
if (cur_gpu_id != old_gpu_index)
{
CHECK_CUDA(cudaSetDevice(net.gpu_index));
}

net.wait_stream = wait_stream; // 1 - wait CUDA-stream, 0 - not to wait
#endif
@@ -440,7 +446,9 @@ std::vector<std::vector<bbox_t>> Detector::detectBatch(image_t img, int batch_si

#ifdef DARKNET_GPU
if (cur_gpu_id != old_gpu_index)
cudaSetDevice(old_gpu_index);
{
CHECK_CUDA(cudaSetDevice(old_gpu_index));
}
#endif

return bbox_vec;
@@ -513,14 +521,18 @@ void *Detector::get_cuda_context()

#ifdef DARKNET_GPU
int old_gpu_index;
cudaGetDevice(&old_gpu_index);
CHECK_CUDA(cudaGetDevice(&old_gpu_index));
if (cur_gpu_id != old_gpu_index)
cudaSetDevice(cur_gpu_id);
{
CHECK_CUDA(cudaSetDevice(cur_gpu_id));
}

void *cuda_context = cuda_get_context();

if (cur_gpu_id != old_gpu_index)
cudaSetDevice(old_gpu_index);
{
CHECK_CUDA(cudaSetDevice(old_gpu_index));
}

return cuda_context;
#else // DARKNET_GPU
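
The checked save-device / restore-device sequence now repeats in several methods in this file. A possible follow-up (not part of this commit) is a small RAII guard; a hypothetical sketch, assuming the project's CHECK_CUDA macro is visible:

	// Hypothetical RAII guard, not in this commit: saves the current device,
	// switches if needed, and restores the original device on scope exit.
	#include <tuple>	// std::ignore
	#include <cuda_runtime.h>

	class CudaDeviceGuard final
	{
		public:

			explicit CudaDeviceGuard(int new_device)
			{
				CHECK_CUDA(cudaGetDevice(&old_device));
				if (new_device != old_device)
				{
					CHECK_CUDA(cudaSetDevice(new_device));
				}
			}

			~CudaDeviceGuard()
			{
				std::ignore = cudaSetDevice(old_device);	// best effort; never throw from a destructor
			}

		private:

			int old_device = -1;
	};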
