ROCm: wip: clean up nearly all of the warnings when compiling for AMD (issue #99)
stephanecharette committed Jan 13, 2025
1 parent 580b785 commit bf93109
Showing 17 changed files with 90 additions and 64 deletions.
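
Most of the fixes below follow a handful of patterns: CUDA runtime calls whose cudaError_t result was silently dropped are now wrapped in the project's CHECK_CUDA macro, switches over ACTIVATION get an explicit default, and locals that are computed but never used are commented out. As a hedged sketch of what a CHECK_CUDA-style wrapper does (the real macro is defined alongside src-lib/dark_cuda.cpp and forwards to the project's error handler, so details differ):

	// Hedged sketch only; the project's actual CHECK_CUDA macro differs in detail.
	#include <cstdio>
	#include <cstdlib>
	#include <cuda_runtime.h>

	#define CHECK_CUDA_SKETCH(call)                                                 \
		do {                                                                        \
			const cudaError_t status_ = (call);                                     \
			if (status_ != cudaSuccess)                                             \
			{                                                                       \
				std::fprintf(stderr, "CUDA error \"%s\" at %s:%d\n",                \
						cudaGetErrorString(status_), __FILE__, __LINE__);           \
				std::abort();                                                       \
			}                                                                       \
		} while (0)

	// usage: CHECK_CUDA_SKETCH(cudaStreamSynchronize(get_cuda_stream()));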
6 changes: 5 additions & 1 deletion CM_source.cmake
@@ -50,9 +50,13 @@ IF (UNIX)

ADD_COMPILE_OPTIONS (-Wall) # enable "all" warnings
ADD_COMPILE_OPTIONS (-Wextra) # enable even more warnings
ADD_COMPILE_OPTIONS (-Wshadow=local) # enable warnings when duplicate variables are created with the same name
ADD_COMPILE_OPTIONS (-Wno-unused-parameter) # don't report this error

IF (NOT DARKNET_USE_ROCM)
# The compiler we switch to when using ROCm/HIP complains about this option.
ADD_COMPILE_OPTIONS (-Wshadow=local) # enable warnings when duplicate variables are created with the same name
ENDIF ()
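
The =local spelling of -Wshadow is a GCC extension; the clang-based hipcc used for ROCm builds complains about it, hence the guard above. For reference, a hypothetical example of the kind of code -Wshadow=local reports:

	// Hypothetical example; -Wshadow=local (GCC spelling) warns on the inner 'total'.
	int sum_of_squares(int n)
	{
		int total = 0;
		for (int i = 0; i < n; ++i)
		{
			int total = i * i;	// warning: declaration of 'total' shadows a previous local
			(void)total;
		}
		return total;	// still the outer 'total', which the loop never updated
	}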

IF (CMAKE_BUILD_TYPE MATCHES DEBUG OR
CMAKE_BUILD_TYPE MATCHES Debug OR
CMAKE_BUILD_TYPE MATCHES debug)
14 changes: 8 additions & 6 deletions src-lib/activation_kernels.cu
@@ -118,6 +118,7 @@ __device__ float activate_kernel(float x, ACTIVATION a)
return hardtan_activate_kernel(x);
case LHTAN:
return lhtan_activate_kernel(x);
default: break;
}
return 0;
}
@@ -159,6 +160,7 @@ __device__ float gradient_kernel(float x, ACTIVATION a)
return hardtan_gradient_kernel(x);
case LHTAN:
return lhtan_gradient_kernel(x);
default: break;
}
return 0;
}
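
These default: break; additions cover the ACTIVATION enumerators the switch deliberately ignores, which is what the compiler's switch warnings were flagging. A minimal illustration with a hypothetical enum:

	// Minimal illustration, hypothetical enum: without the default case,
	// -Wall/-Wextra switch warnings list every enumerator the switch omits.
	enum SKETCH_ACTIVATION { SKETCH_RELU, SKETCH_LHTAN, SKETCH_GELU };

	float activate_sketch(float x, SKETCH_ACTIVATION a)
	{
		switch (a)
		{
			case SKETCH_RELU:	return x > 0.0f ? x : 0.0f;
			case SKETCH_LHTAN:	return x;	// placeholder body
			default:			break;		// SKETCH_GELU and any future value fall through
		}
		return 0;
	}

The trade-off is that a default case also silences the warning for enumerators added later, so new activations have to be wired in by hand.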
@@ -513,7 +515,7 @@ void activate_array_swish_ongpu(float *x, int n, float *output_sigmoid_gpu, floa
{
TAT(TATPARMS);

const int num_blocks = get_number_of_blocks(n, BLOCK);
// const int num_blocks = get_number_of_blocks(n, BLOCK);
activate_array_swish_kernel <<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>>(x, n, output_sigmoid_gpu, output_gpu);
CHECK_CUDA(cudaPeekAtLastError());
}
@@ -522,7 +524,7 @@ void activate_array_mish_ongpu(float *x, int n, float *activation_input_gpu, flo
{
TAT(TATPARMS);

const int num_blocks = get_number_of_blocks(n, BLOCK);
// const int num_blocks = get_number_of_blocks(n, BLOCK);
activate_array_mish_kernel <<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>>(x, n, activation_input_gpu, output_gpu);
CHECK_CUDA(cudaPeekAtLastError());
}
@@ -531,7 +533,7 @@ void activate_array_hard_mish_ongpu(float *x, int n, float *activation_input_gpu
{
TAT(TATPARMS);

const int num_blocks = get_number_of_blocks(n, BLOCK);
// const int num_blocks = get_number_of_blocks(n, BLOCK);
activate_array_hard_mish_kernel <<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>>(x, n, activation_input_gpu, output_gpu);
CHECK_CUDA(cudaPeekAtLastError());
}
@@ -566,7 +568,7 @@ void gradient_array_swish_ongpu(float *x, int n, float *sigmoid_gpu, float *delt
{
TAT(TATPARMS);

const int num_blocks = get_number_of_blocks(n, BLOCK);
// const int num_blocks = get_number_of_blocks(n, BLOCK);
gradient_array_swish_kernel <<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>> (x, n, sigmoid_gpu, delta);
CHECK_CUDA(cudaPeekAtLastError());
}
@@ -575,7 +577,7 @@ void gradient_array_mish_ongpu(int n, float *activation_input_gpu, float *delta)
{
TAT(TATPARMS);

const int num_blocks = get_number_of_blocks(n, BLOCK);
// const int num_blocks = get_number_of_blocks(n, BLOCK);
gradient_array_mish_kernel <<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>> (n, activation_input_gpu, delta);
CHECK_CUDA(cudaPeekAtLastError());
}
@@ -584,7 +586,7 @@ void gradient_array_hard_mish_ongpu(int n, float *activation_input_gpu, float *d
{
TAT(TATPARMS);

const int num_blocks = get_number_of_blocks(n, BLOCK);
// const int num_blocks = get_number_of_blocks(n, BLOCK);
gradient_array_hard_mish_kernel <<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>> (n, activation_input_gpu, delta);
CHECK_CUDA(cudaPeekAtLastError());
}
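
The commented-out num_blocks locals above (and the dropout_sum and cur_scale values in the files below) were computed but never used; the launches go through cuda_gridsize(n) instead, so hipcc flags them as unused variables. Commenting the line out is the simplest fix; an alternative sketch, assuming the toolchain accepts C++17 attributes here, keeps the line alive without the warning:

	// Alternative (hypothetical) to commenting the line out:
	[[maybe_unused]] const int num_blocks = get_number_of_blocks(n, BLOCK);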
20 changes: 10 additions & 10 deletions src-lib/blas_kernels.cu
@@ -1927,7 +1927,7 @@ __global__ void stretch_weights_kernel(const float *src_weight_gpu, float *weig
const int x_c = kernel_size / 2;
const int y_c = kernel_size / 2;

float dropout_sum = 0;
// float dropout_sum = 0;

for (int y = 0; y < kernel_size; ++y) {
for (int x = 0; x < kernel_size; ++x) {
@@ -1951,16 +1951,16 @@ __global__ void stretch_weights_kernel(const float *src_weight_gpu, float *weig

float val = 0;
if (x_0 >= 0 && x_0 < kernel_size && y_0 >= 0 && y_0 < kernel_size) val += src_weight_gpu[x_0 + y_0*kernel_size + i] * c_x_0 * c_y_0;
else dropout_sum += c_x_0 * c_y_0;
// else dropout_sum += c_x_0 * c_y_0;

if (x_1 >= 0 && x_1 < kernel_size && y_0 >= 0 && y_0 < kernel_size) val += src_weight_gpu[x_1 + y_0*kernel_size + i] * c_x_1 * c_y_0;
else dropout_sum += c_x_1 * c_y_0;
// else dropout_sum += c_x_1 * c_y_0;

if (x_0 >= 0 && x_0 < kernel_size && y_1 >= 0 && y_1 < kernel_size) val += src_weight_gpu[x_0 + y_1*kernel_size + i] * c_x_0 * c_y_1;
else dropout_sum += c_x_0 * c_y_1;
// else dropout_sum += c_x_0 * c_y_1;

if (x_1 >= 0 && x_1 < kernel_size && y_1 >= 0 && y_1 < kernel_size) val += src_weight_gpu[x_1 + y_1*kernel_size + i] * c_x_1 * c_y_1;
else dropout_sum += c_x_1 * c_y_1;
// else dropout_sum += c_x_1 * c_y_1;

weight_deform_gpu[x + y*kernel_size + i] = val;
}
@@ -2216,7 +2216,7 @@ __global__ void stretch_sway_flip_weights_kernel(const float *src_weight_gpu, f
const int x_c = kernel_size / 2;
const int y_c = kernel_size / 2;

float dropout_sum = 0;
// float dropout_sum = 0;

for (int y = 0; y < kernel_size; ++y) {
for (int x = 0; x < kernel_size; ++x) {
@@ -2240,16 +2240,16 @@ __global__ void stretch_sway_flip_weights_kernel(const float *src_weight_gpu, f

float val = 0;
if (x_0 >= 0 && x_0 < kernel_size && y_0 >= 0 && y_0 < kernel_size) val += src_weight_gpu[x_0 + y_0*kernel_size + i] * c_x_0 * c_y_0;
else dropout_sum += c_x_0 * c_y_0;
// else dropout_sum += c_x_0 * c_y_0;

if (x_1 >= 0 && x_1 < kernel_size && y_0 >= 0 && y_0 < kernel_size) val += src_weight_gpu[x_1 + y_0*kernel_size + i] * c_x_1 * c_y_0;
else dropout_sum += c_x_1 * c_y_0;
// else dropout_sum += c_x_1 * c_y_0;

if (x_0 >= 0 && x_0 < kernel_size && y_1 >= 0 && y_1 < kernel_size) val += src_weight_gpu[x_0 + y_1*kernel_size + i] * c_x_0 * c_y_1;
else dropout_sum += c_x_0 * c_y_1;
// else dropout_sum += c_x_0 * c_y_1;

if (x_1 >= 0 && x_1 < kernel_size && y_1 >= 0 && y_1 < kernel_size) val += src_weight_gpu[x_1 + y_1*kernel_size + i] * c_x_1 * c_y_1;
else dropout_sum += c_x_1 * c_y_1;
// else dropout_sum += c_x_1 * c_y_1;

weight_deform_gpu[x + y*kernel_size + i] = val;
}
6 changes: 3 additions & 3 deletions src-lib/convolutional_kernels.cu
@@ -1207,8 +1207,8 @@ void assisted_excitation_forward_gpu(Darknet::Layer & l, Darknet::NetworkState s
if (0) // visualize ground truth
{
cuda_pull_array(l.output_gpu, l.output, l.outputs * l.batch);
cudaStreamSynchronize(get_cuda_stream());
CHECK_CUDA(cudaPeekAtLastError());
CHECK_CUDA(cudaStreamSynchronize(get_cuda_stream()));
// CHECK_CUDA(cudaPeekAtLastError());

for (b = 0; b < l.batch; ++b)
{
@@ -1262,7 +1262,7 @@ void pull_convolutional_layer(Darknet::Layer & l)
cuda_pull_array_async(l.v_gpu, l.v, l.nweights);
}
CHECK_CUDA(cudaPeekAtLastError());
cudaStreamSynchronize(get_cuda_stream());
CHECK_CUDA(cudaStreamSynchronize(get_cuda_stream()));
}

void push_convolutional_layer(Darknet::Layer & l)
5 changes: 4 additions & 1 deletion src-lib/crnn_layer.cpp
@@ -124,7 +124,10 @@ void resize_crnn_layer(Darknet::Layer *l, int w, int h)
l->state = (float*)xrealloc(l->state, l->batch*l->hidden*(l->steps + 1)*sizeof(float));

#ifdef DARKNET_GPU
if (l->state_gpu) cudaFree(l->state_gpu);
if (l->state_gpu)
{
CHECK_CUDA(cudaFree(l->state_gpu));
}
l->state_gpu = cuda_make_array(l->state, l->batch*l->hidden*(l->steps + 1));

l->output_gpu = l->output_layer->output_gpu;
6 changes: 3 additions & 3 deletions src-lib/dark_cuda.cpp
@@ -258,7 +258,7 @@ void cublas_check_error(cublasStatus_t status)
#endif
if (cuda_debug_sync)
{
cudaDeviceSynchronize();
CHECK_CUDA(cudaDeviceSynchronize());
}
if (status != CUBLAS_STATUS_SUCCESS)
{
@@ -703,7 +703,7 @@ void cuda_pull_array(float *x_gpu, float *x, size_t n)
//printf("cuda_pull_array - get_cuda_stream() = %d \n", get_cuda_stream());
cudaError_t status = cudaMemcpyAsync(x, x_gpu, size, cudaMemcpyDeviceToHost, get_cuda_stream());
CHECK_CUDA(status);
cudaStreamSynchronize(get_cuda_stream());
CHECK_CUDA(cudaStreamSynchronize(get_cuda_stream()));
}

void cuda_pull_array_async(float *x_gpu, float *x, size_t n)
@@ -806,7 +806,7 @@ void show_cuda_cudnn_info()
for (int idx = 0; idx < device_count; idx ++)
{
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, idx);
CHECK_CUDA(cudaGetDeviceProperties(&prop, idx));
std::cout
<< "=> " << idx << ": " << Darknet::in_colour(Darknet::EColour::kBrightGreen, prop.name)
<< " [#" << prop.major << "." << prop.minor << "]"
2 changes: 1 addition & 1 deletion src-lib/darknet_cfg.cpp
@@ -1151,7 +1151,7 @@ Darknet::Network & Darknet::CfgFile::create_network(int batch, int time_steps)
}
else
{
cudaGetLastError(); // reset CUDA-error
std::ignore = cudaGetLastError(); // reset CUDA-error
net.input_pinned_cpu = (float*)xcalloc(size, sizeof(float));
}
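
Assigning to std::ignore is the idiomatic way to discard a return value on purpose; cudaGetLastError() is called here only to clear CUDA's sticky error state, not for its result. A minimal self-contained sketch:

	// Minimal sketch: deliberately discard a return value without a warning.
	#include <tuple>	// std::ignore
	#include <cuda_runtime.h>

	void clear_sticky_cuda_error()
	{
		std::ignore = cudaGetLastError();	// resets the sticky error state
	}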

4 changes: 2 additions & 2 deletions src-lib/darknet_network.cpp
@@ -618,7 +618,7 @@ int resize_network(Darknet::Network * net, int w, int h)
{
if (net->input_pinned_cpu_flag)
{
cudaFreeHost(net->input_pinned_cpu);
CHECK_CUDA(cudaFreeHost(net->input_pinned_cpu));
}
else
{
@@ -1623,7 +1623,7 @@ void free_network(Darknet::Network & net)
// CPU
if (net.input_pinned_cpu_flag)
{
cudaFreeHost(net.input_pinned_cpu);
CHECK_CUDA(cudaFreeHost(net.input_pinned_cpu));
}
else
{
5 changes: 3 additions & 2 deletions src-lib/dropout_layer.cpp
@@ -67,8 +67,9 @@ void resize_dropout_layer(Darknet::Layer *l, int inputs)
cuda_free(l->rand_gpu);
l->rand_gpu = cuda_make_array(l->rand, l->inputs*l->batch);

if (l->dropblock) {
cudaFreeHost(l->drop_blocks_scale);
if (l->dropblock)
{
CHECK_CUDA(cudaFreeHost(l->drop_blocks_scale));
l->drop_blocks_scale = cuda_make_array_pinned(l->rand, l->batch);

cuda_free(l->drop_blocks_scale_gpu);
13 changes: 7 additions & 6 deletions src-lib/dropout_layer_kernels.cu
@@ -128,11 +128,12 @@ void forward_dropout_layer_gpu(Darknet::Layer & l, Darknet::NetworkState state)
multiplier = (iteration_num / (float)(state.net.max_batches*0.85));

// dropblock
if (l.dropblock) {
if (l.dropblock)
{
//l.probability = 1 / keep_prob
//const int max_blocks_per_channel = 10;
const float cur_prob = l.probability * multiplier;
const float cur_scale = 1.f / (1.f - cur_prob);
// const float cur_scale = 1.f / (1.f - cur_prob);

int block_width = l.dropblock_size_abs *multiplier;
int block_height = l.dropblock_size_abs *multiplier;
@@ -238,8 +239,8 @@ void backward_dropout_layer_gpu(Darknet::Layer & l, Darknet::NetworkState state)
if (iteration_num < (state.net.max_batches*0.85))
multiplier = (iteration_num / (float)(state.net.max_batches*0.85));

const float cur_prob = l.probability * multiplier;
const float cur_scale = 1.f / (1.f - cur_prob);
// const float cur_prob = l.probability * multiplier;
// const float cur_scale = 1.f / (1.f - cur_prob);

int block_width = l.dropblock_size_abs * multiplier;
int block_height = l.dropblock_size_abs * multiplier;
@@ -255,8 +256,8 @@ void backward_dropout_layer_gpu(Darknet::Layer & l, Darknet::NetworkState state)
block_width = min_val_cmp(l.w, block_width);
block_height = min_val_cmp(l.h, block_height);

const int block_size = min_val_cmp(block_width, block_height);
const float block_prob = cur_prob / (block_size*block_size);
// const int block_size = min_val_cmp(block_width, block_height);
// const float block_prob = cur_prob / (block_size*block_size);

//fill_ongpu(l.outputs * l.batch, 1, state.delta, 1); // remove!!!

8 changes: 4 additions & 4 deletions src-lib/gaussian_yolo_layer.cpp
@@ -414,14 +414,14 @@ Darknet::Layer make_gaussian_yolo_layer(int batch, int w, int h, int n, int tota
free(l.output);
if (cudaSuccess == cudaHostAlloc((void**)&l.output, batch*l.outputs * sizeof(float), cudaHostRegisterMapped)) l.output_pinned = 1;
else {
cudaGetLastError(); // reset CUDA-error
std::ignore = cudaGetLastError(); // reset CUDA-error
l.output = (float*)calloc(batch * l.outputs, sizeof(float));
}

free(l.delta);
if (cudaSuccess == cudaHostAlloc((void**)&l.delta, batch*l.outputs * sizeof(float), cudaHostRegisterMapped)) l.delta_pinned = 1;
else {
cudaGetLastError(); // reset CUDA-error
std::ignore = cudaGetLastError(); // reset CUDA-error
l.delta = (float*)calloc(batch * l.outputs, sizeof(float));
}

@@ -455,7 +455,7 @@ void resize_gaussian_yolo_layer(Darknet::Layer *l, int w, int h)
if (l->output_pinned) {
CHECK_CUDA(cudaFreeHost(l->output));
if (cudaSuccess != cudaHostAlloc((void**)&l->output, l->batch*l->outputs * sizeof(float), cudaHostRegisterMapped)) {
cudaGetLastError(); // reset CUDA-error
std::ignore = cudaGetLastError(); // reset CUDA-error
l->output = (float*)calloc(l->batch * l->outputs, sizeof(float));
l->output_pinned = 0;
}
@@ -464,7 +464,7 @@ void resize_gaussian_yolo_layer(Darknet::Layer *l, int w, int h)
if (l->delta_pinned) {
CHECK_CUDA(cudaFreeHost(l->delta));
if (cudaSuccess != cudaHostAlloc((void**)&l->delta, l->batch*l->outputs * sizeof(float), cudaHostRegisterMapped)) {
cudaGetLastError(); // reset CUDA-error
std::ignore = cudaGetLastError(); // reset CUDA-error
l->delta = (float*)calloc(l->batch * l->outputs, sizeof(float));
l->delta_pinned = 0;
}
5 changes: 3 additions & 2 deletions src-lib/gemm.cpp
@@ -2670,9 +2670,10 @@ void time_ongpu(int TA, int TB, int m, int k, int n)

int i;
clock_t start = clock(), end;
for(i = 0; i<iter; ++i){
for (i = 0; i<iter; ++i)
{
gemm_ongpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n);
cudaDeviceSynchronize();
CHECK_CUDA(cudaDeviceSynchronize());
}
double flop = ((double)m)*n*(2.*k + 2.)*iter;
double gflop = flop/pow(10., 9);
2 changes: 2 additions & 0 deletions src-lib/im2col_kernels.cu
@@ -62,6 +62,7 @@ __global__ void im2col_gpu_kernel(const int n, const float* data_im,
//data_im[(channel_in * height + h_in) * width + w_in + i * width + j];
//(*data_col_ptr) = data_im_ptr[ii * width + jj];

/// @todo V3 should this be "*data_col_ptr" instead? Otherwise, what exactly are we incrementing here? (answer: the pointer...but why?)
data_col_ptr += height_col * width_col;
}
}
@@ -126,6 +127,7 @@ __global__ void im2col_align_gpu_kernel(const int n, const float* data_im,
int out_index = (channel_out + i*ksize + j) * bit_align + pre_out_index;// h_out * width_col + w_out;
data_col[out_index] = val;

/// @todo V3 should this be "*data_col_ptr" instead? Otherwise, what exactly are we incrementing here? (answer: the pointer...but why?)
data_col_ptr += bit_align;
}
}
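
For context on the two @todo notes above: in the first kernel, advancing data_col_ptr itself appears intentional. Under the assumed im2col layout, each kernel-window position (i, j) owns a whole height_col * width_col plane of the column buffer, so after writing one element the pointer jumps a full plane to reach the same output pixel in the next (i, j) slot. In the second kernel the store goes through a separately computed out_index, so the pointer increment there may simply be vestigial. A rough sketch of the stride pattern, under that layout assumption:

	// Rough sketch, assuming one plane per kernel-window element (i, j):
	// the pointer strides by a whole plane between writes.
	void fill_column_sketch(float * data_col, int ksize, int height_col, int width_col)
	{
		float * col_ptr = data_col;
		for (int i = 0; i < ksize; ++i)
		{
			for (int j = 0; j < ksize; ++j)
			{
				*col_ptr = 0.0f;					// the value for this (i, j)
				col_ptr += height_col * width_col;	// jump to the next (i, j) plane
			}
		}
	}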
4 changes: 2 additions & 2 deletions src-lib/layer.cpp
@@ -189,13 +189,13 @@ void free_layer_custom(Darknet::Layer & l, int keep_cudnn_desc)
#ifdef DARKNET_GPU
if (l.delta && l.delta_pinned)
{
cudaFreeHost(l.delta);
CHECK_CUDA(cudaFreeHost(l.delta));
l.delta = nullptr;
}

if (l.output && l.output_pinned)
{
cudaFreeHost(l.output);
CHECK_CUDA(cudaFreeHost(l.output));
l.output = nullptr;
}
#endif // DARKNET_GPU
4 changes: 2 additions & 2 deletions src-lib/network_kernels.cu
@@ -85,7 +85,7 @@ void forward_network_gpu(Darknet::Network & net, Darknet::NetworkState state)

if(net.wait_stream)
{
cudaStreamSynchronize(get_cuda_stream());
CHECK_CUDA(cudaStreamSynchronize(get_cuda_stream()));
}
state.input = l.output_gpu;
//cudaDeviceSynchronize();
@@ -149,7 +149,7 @@ void backward_network_gpu(Darknet::Network & net, Darknet::NetworkState state)
avg_time_per_layer = (time_benchmark_layers *)calloc(net.n, sizeof(time_benchmark_layers));
sorted_avg_time_per_layer = (time_benchmark_layers *)calloc(net.n, sizeof(time_benchmark_layers));
}
cudaDeviceSynchronize();
CHECK_CUDA(cudaDeviceSynchronize());
}

state.workspace = net.workspace;
8 changes: 4 additions & 4 deletions src-lib/yolo_layer.cpp
@@ -443,7 +443,7 @@ Darknet::Layer make_yolo_layer(int batch, int w, int h, int n, int total, int *m
}
else
{
cudaGetLastError(); // reset CUDA-error
std::ignore = cudaGetLastError(); // reset CUDA-error
l.output = (float*)xcalloc(batch * l.outputs, sizeof(float));
}

@@ -454,7 +454,7 @@ Darknet::Layer make_yolo_layer(int batch, int w, int h, int n, int total, int *m
}
else
{
cudaGetLastError(); // reset CUDA-error
std::ignore = cudaGetLastError(); // reset CUDA-error
l.delta = (float*)xcalloc(batch * l.outputs, sizeof(float));
}
#endif
@@ -483,7 +483,7 @@ void resize_yolo_layer(Darknet::Layer * l, int w, int h)
if (l->output_pinned) {
CHECK_CUDA(cudaFreeHost(l->output));
if (cudaSuccess != cudaHostAlloc((void**)&l->output, l->batch*l->outputs * sizeof(float), cudaHostRegisterMapped)) {
cudaGetLastError(); // reset CUDA-error
std::ignore = cudaGetLastError(); // reset CUDA-error
l->output = (float*)xcalloc(l->batch * l->outputs, sizeof(float));
l->output_pinned = 0;
}
@@ -492,7 +492,7 @@ void resize_yolo_layer(Darknet::Layer * l, int w, int h)
if (l->delta_pinned) {
CHECK_CUDA(cudaFreeHost(l->delta));
if (cudaSuccess != cudaHostAlloc((void**)&l->delta, l->batch*l->outputs * sizeof(float), cudaHostRegisterMapped)) {
cudaGetLastError(); // reset CUDA-error
std::ignore = cudaGetLastError(); // reset CUDA-error
l->delta = (float*)xcalloc(l->batch * l->outputs, sizeof(float));
l->delta_pinned = 0;
}
42 changes: 27 additions & 15 deletions src-lib/yolo_v2_class.cpp
@@ -71,7 +71,7 @@ int get_device_count()

#ifdef DARKNET_GPU
int count = 0;
cudaGetDeviceCount(&count);
CHECK_CUDA(cudaGetDeviceCount(&count));
return count;
#else
return -1;
@@ -114,7 +114,7 @@ int get_device_name(int gpu, char* deviceName)

#ifdef DARKNET_GPU
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, gpu);
CHECK_CUDA(cudaGetDeviceProperties(&prop, gpu));
std::string result = prop.name;
std::copy(result.begin(), result.end(), deviceName);
return 1;
@@ -213,14 +213,14 @@ Detector::~Detector()

#ifdef DARKNET_GPU
int old_gpu_index;
cudaGetDevice(&old_gpu_index);
CHECK_CUDA(cudaGetDevice(&old_gpu_index));
cuda_set_device(detector_gpu.net.gpu_index);
#endif

free_network(detector_gpu.net);

#ifdef DARKNET_GPU
cudaSetDevice(old_gpu_index);
CHECK_CUDA(cudaSetDevice(old_gpu_index));
#endif
}

@@ -291,9 +291,11 @@ std::vector<bbox_t> Detector::detect(image_t img, float thresh, bool use_mean)
Darknet::Network & net = detector_gpu.net;
#ifdef DARKNET_GPU
int old_gpu_index;
cudaGetDevice(&old_gpu_index);
if(cur_gpu_id != old_gpu_index)
cudaSetDevice(net.gpu_index);
CHECK_CUDA(cudaGetDevice(&old_gpu_index));
if (cur_gpu_id != old_gpu_index)
{
CHECK_CUDA(cudaSetDevice(net.gpu_index));
}

net.wait_stream = wait_stream; // 1 - wait CUDA-stream, 0 - not to wait
#endif
@@ -370,7 +372,9 @@ std::vector<bbox_t> Detector::detect(image_t img, float thresh, bool use_mean)

#ifdef DARKNET_GPU
if (cur_gpu_id != old_gpu_index)
cudaSetDevice(old_gpu_index);
{
CHECK_CUDA(cudaSetDevice(old_gpu_index));
}
#endif

return bbox_vec;
@@ -384,9 +388,11 @@ std::vector<std::vector<bbox_t>> Detector::detectBatch(image_t img, int batch_si
Darknet::Network net = detector_gpu.net;
#ifdef DARKNET_GPU
int old_gpu_index;
cudaGetDevice(&old_gpu_index);
if(cur_gpu_id != old_gpu_index)
cudaSetDevice(net.gpu_index);
CHECK_CUDA(cudaGetDevice(&old_gpu_index));
if (cur_gpu_id != old_gpu_index)
{
CHECK_CUDA(cudaSetDevice(net.gpu_index));
}

net.wait_stream = wait_stream; // 1 - wait CUDA-stream, 0 - not to wait
#endif
@@ -440,7 +446,9 @@ std::vector<std::vector<bbox_t>> Detector::detectBatch(image_t img, int batch_si

#ifdef DARKNET_GPU
if (cur_gpu_id != old_gpu_index)
cudaSetDevice(old_gpu_index);
{
CHECK_CUDA(cudaSetDevice(old_gpu_index));
}
#endif

return bbox_vec;
@@ -513,14 +521,18 @@ void *Detector::get_cuda_context()

#ifdef DARKNET_GPU
int old_gpu_index;
cudaGetDevice(&old_gpu_index);
CHECK_CUDA(cudaGetDevice(&old_gpu_index));
if (cur_gpu_id != old_gpu_index)
cudaSetDevice(cur_gpu_id);
{
CHECK_CUDA(cudaSetDevice(cur_gpu_id));
}

void *cuda_context = cuda_get_context();

if (cur_gpu_id != old_gpu_index)
cudaSetDevice(old_gpu_index);
{
CHECK_CUDA(cudaSetDevice(old_gpu_index));
}

return cuda_context;
#else // DARKNET_GPU
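
The checked save-device / restore-device sequence now repeats in several methods in this file. A possible follow-up (not part of this commit) is a small RAII guard; a hypothetical sketch, assuming the project's CHECK_CUDA macro is visible:

	// Hypothetical RAII guard, not in this commit: saves the current device,
	// switches if needed, and restores the original device on scope exit.
	#include <tuple>	// std::ignore
	#include <cuda_runtime.h>

	class CudaDeviceGuard final
	{
		public:

			explicit CudaDeviceGuard(int new_device)
			{
				CHECK_CUDA(cudaGetDevice(&old_device));
				if (new_device != old_device)
				{
					CHECK_CUDA(cudaSetDevice(new_device));
				}
			}

			~CudaDeviceGuard()
			{
				std::ignore = cudaSetDevice(old_device);	// best effort; never throw from a destructor
			}

		private:

			int old_device = -1;
	};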
