-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathcaldgemm_cuda.h
126 lines (113 loc) · 4.76 KB
/
caldgemm_cuda.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/**
* Interface of the CALDGEMM library.
*
* Copyright 2015:
* - David Rohr ([email protected])
* - Matthias Bach ([email protected])
* - Matthias Kretz ([email protected])
*
* This file is part of CALDGEMM.
*
* CALDGEMM is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* CALDGEMM is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with CALDGEMM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef caldgemm_cuda_H
#define caldgemm_cuda_H
#include <cuda.h>
#include <cuda_runtime_api.h>
#ifdef CALDGEMM_CUDA_CUBLAS
#include <cublas_v2.h>
#endif
#include "caldgemm.h"
/**
 * CUDA backend of the CALDGEMM library.
 *
 * Declaration-only header: every method here overrides a virtual declared in
 * the caldgemm base class (see caldgemm.h); implementations live in the
 * corresponding .cu/.cpp translation unit. Array extents (max_devices,
 * obuffercount, ibuffercount, max_bbuffers) are constants inherited from the
 * base class.
 */
class caldgemm_cuda : public caldgemm
{
public:
	caldgemm_cuda();
	virtual ~caldgemm_cuda();

private:
	// Backend capability queries (semantics defined by the caldgemm base class).
	virtual int UseOutputPthreads();
	virtual int UseInputPthreads();
	virtual int UseMutexPerDevice();

	// Prepare input buffers for one DGEMM tile on a device queue.
	// NOTE(review): parameter list (including the CALDGEMM_DIVBUFA macro tail
	// and the "Sufficiant" spelling) must stay byte-identical to the base-class
	// virtual — do not "fix" it here in isolation.
	virtual int DGEMM_prepare_backend(size_t k, int j, unsigned int num_device, bool prepareM, bool prepareN, bool buffersSufficiant, bool buffersSufficiant0 CALDGEMM_DIVBUFA);

	// Runtime / device lifecycle hooks (called by the base class framework).
	virtual int Initialize(bool nocalinit);
	virtual int ValidateRuntime();
	virtual int CheckDevices();
	virtual int InitDevices();
	virtual int ReinitDevices();
	virtual int InitConstantData(double alpha);

	// Kernel execution and result retrieval for one output tile (blockm, blockn).
	virtual int ExecuteKernels(caldgemm::DGEMMPrepareAndExecuteTask& Task, int blockm, int blockn);
	virtual int ExitRuntime();
	virtual int ExitDevices();
	virtual int WaitForEvent(int, int, int);
	virtual int FetchResult(int device, int j, int m, int n, int mustlock = 0);
	virtual int CheckDMAQueue(int device, int forcej = -1);
	virtual int RunMergeBuffers(double* dst, int device, int j, int width, int height, int gpu_width, int gpu_height, int pitch);
	virtual int RunCALDGEMM_Init();
	virtual int RunCALDGEMM_Exit();

	// Host memory management; gpuaccessible selects GPU-visible (e.g. pinned)
	// allocation — exact mechanism is in the implementation file.
	virtual double* AllocMemory(size_t nDoubles, bool page_locked, bool huge_pages, bool gpuaccessible = false, bool interleave = false);
	virtual int FreeMemory(double* ptr, bool gpuaccessible = false);
	virtual int Preallocate();
	virtual int PreallocateFree();
	virtual int SimpleQueuingAvailable();

	// Set up the simple-queue event bookkeeping for an mb x nb tile grid.
	void SetupSimpleQueue(size_t mb, size_t nb);

	// One simple-queue dependency: a CUDA event plus the queue it was recorded on.
	struct caldgemm_cuda_simple_queue_event
	{
		cudaEvent_t event;
		int num_queue;
	};
	caldgemm_cuda_simple_queue_event* simple_queue_events[max_devices][2]; //2 for m and n direction
	bool* simple_queue_event_requested[max_devices][obuffercount][2];
	cudaEvent_t simple_queue_event_kernels[max_devices][ibuffercount][obuffercount];
	bool simple_queue_event_kernels_used[max_devices][ibuffercount][obuffercount];

	// Event + in-use flag pair for the alternate simple-queue C-buffer path.
	struct alternateSimpleQueueCBuffferEventStruct
	{
		cudaEvent_t event;
		bool used;
	};
	cudaEvent_t alternateSimpleQueueCopyCEvent[max_devices][obuffercount];
	alternateSimpleQueueCBuffferEventStruct alternateSimpleQueueCBuffferEvent[max_devices][obuffercount];
	cudaEvent_t alternateSimpleQueueEvent_tmp_abuffers[max_devices][obuffercount];
	cudaEvent_t alternateSimpleQueueEvent_tmp_bbuffers[max_devices][obuffercount];
	bool alternateSimpleQueueEvent_tmp_abuffers_used[max_devices][obuffercount];
	bool alternateSimpleQueueEvent_tmp_bbuffers_used[max_devices][obuffercount];
	cudaEvent_t alternateSimpleQueueTmpEvents[2];

	// Events tracking completion of remaining lookahead tiles (simple queue).
	cudaEvent_t* AlternateLookaheadTilesRemainingSQ_events;
	virtual int CheckAlternateTilesRemainingSQ();
	qSem AlternateLookaheadDoneMutexSQ;

	// Per-device CUDA resources: device ids, streams (obuffercount compute
	// queues + 2 extra), and the A/B/C plus temporary staging buffers.
	int cuda_devices[max_devices];
	cudaStream_t cuda_command_queues[max_devices][obuffercount + 2];
	void* cuda_abuffers[max_devices][ibuffercount];
	void* cuda_bbuffers[max_devices][max_bbuffers];
	void* cuda_cbuffers[max_devices][obuffercount];
	void* cuda_tmp_abuffers[max_devices][obuffercount];
	void* cuda_tmp_bbuffers[max_devices][obuffercount];
	cudaEvent_t cuda_events[max_devices][obuffercount];

#ifdef CALDGEMM_CUDA_CUBLAS
	// One cuBLAS handle per device (handles are device/context-bound).
	cublasHandle_t cublas_handles[max_devices];
#endif

	cudaEvent_t cuda_conversion_events[max_devices][2];
	int cuda_conversion_events_use[max_devices][2];

	// Block until *pEvent has completed, then release/recycle it.
	int WaitForEventAndRelease(cudaEvent_t* pEvent);

	static const int GROUP_SIZE_X = 16, GROUP_SIZE_Y = 16, GROUP_COUNT_X = 16, GROUP_COUNT_Y = 16; //Group and block size for conversion kernels and for DGEMM kernel

	// Description of one buffer-conversion kernel task.
	// Fix: the converting constructor previously took the width/height as int
	// while the members are size_t (implicit narrowing mismatch); it now takes
	// size_t directly — backward compatible, since int arguments convert
	// implicitly at existing call sites.
	struct conversionKernelTaskStruct
	{
		conversionKernelTaskStruct() {}
		conversionKernelTaskStruct(void* c1, void* c2, size_t c3, size_t c4, char c5) : dest_buffer_tmp(c1), dest_image(c2), arg_width(c3), arg_height(c4), myMat(c5) {}
		void* dest_buffer_tmp; // temporary staging buffer (source of conversion)
		void* dest_image;      // destination GPU buffer
		size_t arg_width;      // tile width in elements
		size_t arg_height;     // tile height in elements
		char myMat;            // which matrix this task belongs to (A/B marker)
	};
};
#endif