From 662f4f81393280187ab8ca6abf16d169939c415e Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 26 Apr 2017 23:58:44 +0530 Subject: [PATCH] Merge branch 'amd-develop' into amd-master Change-Id: I51545bb66e9c9ff39bed86e2a8621e49a0e3f1c1 (cherry picked from commit cbb34eaf766e5e8374b1d514154be1d102b4115c) --- .vimrc | 1 - include/hip/hcc_detail/hip_runtime_api.h | 6 +- include/hip/nvcc_detail/hip_complex.h | 6 +- src/hip_hcc.cpp | 82 +++- src/hip_memory.cpp | 26 +- .../src/runtimeApi/memory/hipHostRegister.cpp | 143 ++++--- tests/src/runtimeApi/memory/hipMemcpy.cpp | 367 ++++++++++++++---- .../runtimeApi/memory/hipMemoryAllocate.cpp | 10 + 8 files changed, 500 insertions(+), 141 deletions(-) delete mode 100644 .vimrc diff --git a/.vimrc b/.vimrc deleted file mode 100644 index 019afa57e6..0000000000 --- a/.vimrc +++ /dev/null @@ -1 +0,0 @@ -:set makeprg=make\ -C\ build.hcc-LC.db diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index f9bfb5a310..7a99ff0810 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -853,7 +853,7 @@ hipError_t hipEventQuery(hipEvent_t event) ; * * @see hipGetDeviceCount, hipGetDevice, hipSetDevice, hipChooseDevice */ -hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr); +hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, const void* ptr); /** * @brief Allocate memory on the default accelerator @@ -863,7 +863,7 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) * * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned. * - * @return #hipSuccess + * @return #hipSuccess, #hipErrorMemoryAllocation, #hipErrorInvalidValue (bad context, null *ptr) * * @see hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D, hipMalloc3DArray, hipHostFree, hipHostMalloc */ @@ -1922,7 +1922,7 @@ hipError_t hipModuleLoadData(hipModule_t *module, const void *image); * @param [in] blockDimZ Z grid dimension specified in work-items * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The kernel can access this with HIP_DYNAMIC_SHARED. * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case th default stream is used with associated synchronization rules. - * @param [in] kernelParams + * @param [in] kernelParams * @param [in] extra Pointer to kernel arguments. These are passed directly to the kernel and must be in the memory layout and alignment expected by the kernel. * * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue diff --git a/include/hip/nvcc_detail/hip_complex.h b/include/hip/nvcc_detail/hip_complex.h index 84afb13e50..20cb24460c 100644 --- a/include/hip/nvcc_detail/hip_complex.h +++ b/include/hip/nvcc_detail/hip_complex.h @@ -64,7 +64,7 @@ __device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hi } __device__ __host__ static inline float hipCabsf(hipFloatComplex z){ - return cuCabsf(p, q); + return cuCabsf(z); } typedef cuDoubleComplex hipDoubleComplex; @@ -85,7 +85,7 @@ __device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){ return cuConj(z); } -__device__ __host__ static inline hipDoubleComplex hipCsqabs(hipDoubleComplex z){ +__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z){ return cuCabs(z) * cuCabs(z); } @@ -123,7 +123,7 @@ __device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q return cuCfmaf(p, q, r); } -__device__ __host__ static inline hipDoubleComplex hipCfma(hipComplex p, hipComplex q, hipComplex r){ +__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, hipDoubleComplex r){ return cuCfma(p, q, r); } diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 35a3e11e71..71d947488d 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -1765,20 +1765,24 @@ void ihipStream_t::resolveHcMemcpyDirection(unsigned hipMemKind, if (HIP_FORCE_P2P_HOST & 0x1) { *forceUnpinnedCopy = true; - tprintf (DB_COPY, "P2P. Copy engine (dev:%d agent=0x%lx) can see src and dst but HIP_FORCE_P2P_HOST=0, forcing copy through staging buffers.\n", - (*copyDevice)->getDeviceNum(), (*copyDevice)->getDevice()->_hsaAgent.handle); + tprintf (DB_COPY, "Copy engine (dev:%d agent=0x%lx) can see src and dst but HIP_FORCE_P2P_HOST=0, forcing copy through staging buffers.\n", + *copyDevice ? (*copyDevice)->getDeviceNum() : -1, + *copyDevice ? (*copyDevice)->getDevice()->_hsaAgent.handle : 0x0); } else { - tprintf (DB_COPY, "P2P. Copy engine (dev:%d agent=0x%lx) can see src and dst.\n", - (*copyDevice)->getDeviceNum(), (*copyDevice)->getDevice()->_hsaAgent.handle); + tprintf (DB_COPY, "Copy engine (dev:%d agent=0x%lx) can see src and dst.\n", + *copyDevice ? (*copyDevice)->getDeviceNum() : -1, + *copyDevice ? (*copyDevice)->getDevice()->_hsaAgent.handle : 0x0); } } else { *forceUnpinnedCopy = true; tprintf (DB_COPY, "P2P: Copy engine(dev:%d agent=0x%lx) cannot see both host and device pointers - forcing copy with unpinned engine.\n", - (*copyDevice)->getDeviceNum(), (*copyDevice)->getDevice()->_hsaAgent.handle); + *copyDevice ? (*copyDevice)->getDeviceNum() : -1, + *copyDevice ? (*copyDevice)->getDevice()->_hsaAgent.handle : 0x0); if (HIP_FAIL_SOC & 0x2) { fprintf (stderr, "HIP_FAIL_SOC: P2P: copy engine(dev:%d agent=0x%lx) cannot see both host and device pointers - forcing copy with unpinned engine.\n", - (*copyDevice)->getDeviceNum(), (*copyDevice)->getDevice()->_hsaAgent.handle); + *copyDevice ? (*copyDevice)->getDeviceNum() : -1, + *copyDevice ? (*copyDevice)->getDevice()->_hsaAgent.handle : 0x0); throw ihipException(hipErrorRuntimeOther); } } @@ -1794,6 +1798,62 @@ void printPointerInfo(unsigned dbFlag, const char *tag, const void *ptr, const h } +// the pointer-info as returned by HC refers to the allocation +// This routine modifies the pointer-info so it appears to refer to the specific ptr and sizeBytes. +// TODO -remove this when HCC uses HSA pointer info functions directly. +void tailorPtrInfo(hc::AmPointerInfo *ptrInfo, const void * ptr, size_t sizeBytes) +{ + const char *ptrc = static_cast (ptr); + if (ptrInfo->_sizeBytes == 0) { + // invalid ptrInfo, don't modify + return; + } else if (ptrInfo->_isInDeviceMem) { + assert (ptrInfo->_devicePointer != nullptr); + std::ptrdiff_t diff = ptrc - static_cast (ptrInfo->_devicePointer); + + //TODO : assert-> runtime assert that only appears in debug mode + assert (diff >= 0); + assert (diff <= ptrInfo->_sizeBytes); + + ptrInfo->_devicePointer = const_cast (ptr); + + if (ptrInfo->_hostPointer != nullptr) { + ptrInfo->_hostPointer = static_cast(ptrInfo->_hostPointer) + diff; + } + + } else { + + assert (ptrInfo->_hostPointer != nullptr); + std::ptrdiff_t diff = ptrc - static_cast (ptrInfo->_hostPointer); + + //TODO : assert-> runtime assert that only appears in debug mode + assert (diff >= 0); + assert (diff <= ptrInfo->_sizeBytes); + + ptrInfo->_hostPointer = const_cast(ptr); + + if (ptrInfo->_devicePointer != nullptr) { + ptrInfo->_devicePointer = static_cast(ptrInfo->_devicePointer) + diff; + } + } + + assert (sizeBytes <= ptrInfo->_sizeBytes); + ptrInfo->_sizeBytes = sizeBytes; +}; + + +bool getTailoredPtrInfo(hc::AmPointerInfo *ptrInfo, const void * ptr, size_t sizeBytes) +{ + bool tracked = (hc::am_memtracker_getinfo(ptrInfo, ptr) == AM_SUCCESS); + + if (tracked) { + tailorPtrInfo(ptrInfo, ptr, sizeBytes); + } + + return tracked; +}; + + // TODO : For registered and host memory, if the portable flag is set, we need to recognize that and perform appropriate copy operation. // What can happen now is that Portable memory is mapped into multiple devices but Peer access is not enabled. i // The peer detection logic doesn't see that the memory is already mapped and so tries to use an unpinned copy algorithm. If this is PinInPlace, then an error can occur. @@ -1812,8 +1872,8 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, hc::accelerator acc; hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0); hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); - bool dstTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) == AM_SUCCESS); - bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS); + bool dstTracked = getTailoredPtrInfo(&dstPtrInfo, dst, sizeBytes); + bool srcTracked = getTailoredPtrInfo(&srcPtrInfo, src, sizeBytes); // Some code in HCC and in printPointerInfo uses _sizeBytes==0 as an indication ptr is not valid, so check it here: @@ -1873,6 +1933,7 @@ void ihipStream_t::lockedSymbolCopySync(hc::accelerator &acc, void* dst, void* s void ihipStream_t::lockedSymbolCopyAsync(hc::accelerator &acc, void* dst, void* src, size_t sizeBytes, size_t offset, unsigned kind) { + // TODO - review - this looks broken , should not be adding pointers to tracker dynamically: if(kind == hipMemcpyHostToDevice) { hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS); @@ -1899,6 +1960,7 @@ void ihipStream_t::lockedSymbolCopyAsync(hc::accelerator &acc, void* dst, void* } } + void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind) { @@ -1926,8 +1988,8 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes hc::accelerator acc; hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0); hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); - bool dstTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) == AM_SUCCESS); - bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS); + bool dstTracked = getTailoredPtrInfo(&dstPtrInfo, dst, sizeBytes); + bool srcTracked = getTailoredPtrInfo(&srcPtrInfo, src, sizeBytes); hc::hcCommandKind hcCopyDir; diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index da5530349f..821f64bc76 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -133,7 +133,7 @@ void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsig //_appAllocationFlags : These are flags provided by the user when allocation is performed. They are returned to user in hipHostGetFlags and other APIs. // TODO - add more info here when available. // -hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) +hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, const void* ptr) { HIP_INIT_API(attributes, ptr); @@ -149,10 +149,10 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) attributes->devicePointer = amPointerInfo._devicePointer; attributes->isManaged = 0; if(attributes->memoryType == hipMemoryTypeHost){ - attributes->hostPointer = ptr; + attributes->hostPointer = (void*)ptr; } if(attributes->memoryType == hipMemoryTypeDevice){ - attributes->devicePointer = ptr; + attributes->devicePointer = (void*)ptr; } attributes->allocationFlags = amPointerInfo._appAllocationFlags; attributes->device = amPointerInfo._appId; @@ -207,22 +207,26 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) HIP_INIT_API(ptr, sizeBytes); HIP_SET_DEVICE(); hipError_t hip_status = hipSuccess; + + auto ctx = ihipGetTlsDefaultCtx(); // return NULL pointer when malloc size is 0 if (sizeBytes == 0) { *ptr = NULL; - return ihipLogStatus(hipSuccess); - } + hip_status = hipSuccess; - auto ctx = ihipGetTlsDefaultCtx(); + } else if ((ctx==nullptr) || (ptr == nullptr)) { + hip_status = hipErrorInvalidValue; - if (ctx) { + } else { auto device = ctx->getWriteableDevice(); *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx, 0/*amFlags*/, 0/*hipFlags*/); - } else { - hip_status = hipErrorMemoryAllocation; - } + if(sizeBytes && (*ptr == NULL)){ + hip_status = hipErrorMemoryAllocation; + } + + } return ihipLogStatus(hip_status); @@ -1268,7 +1272,7 @@ hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned hsa_amd_ipc_memory_attach((hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, crit->peerCnt(), crit->peerAgents(), devPtr); if(hsa_status != HSA_STATUS_SUCCESS) hipStatus = hipErrorMapBufferObjectFailed; - } + } #else hipStatus = hipErrorRuntimeOther; #endif diff --git a/tests/src/runtimeApi/memory/hipHostRegister.cpp b/tests/src/runtimeApi/memory/hipHostRegister.cpp index 1a1319c500..8cf0979261 100644 --- a/tests/src/runtimeApi/memory/hipHostRegister.cpp +++ b/tests/src/runtimeApi/memory/hipHostRegister.cpp @@ -19,87 +19,142 @@ THE SOFTWARE. /* HIT_START * BUILD: %t %s ../../test_common.cpp - * RUN: %t + * RUN: %t --tests 0x1 + * RUN: %t --tests 0x2 + * RUN: %t --tests 0x4 * HIT_END */ +// TODO - bug if run both back-to-back, once fixed should just need one command line + #include"test_common.h" #include __global__ void Inc(hipLaunchParm lp, float *Ad){ -int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; -Ad[tx] = Ad[tx] + float(1); + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + Ad[tx] = Ad[tx] + float(1); +} + + +template +void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd, bool internalRegister) +{ + A = A + offset; + numElements -= offset; + + size_t sizeBytes = numElements * sizeof(T); + + if (internalRegister) { + HIPCHECK(hipHostRegister(A, sizeBytes, 0)); + } + + + // Reset + for(size_t i=0;iOFFSETS_TO_TRY); - HIPCHECK(hipMemcpy(Bh, Bd, size, hipMemcpyDeviceToHost)); + if (p_tests & 0x2) { + for (size_t i=0; i +class DeviceMemory +{ +public: + DeviceMemory(size_t numElements); + ~DeviceMemory(); + + T *A_d() const { return _A_d + _offset; }; + T *B_d() const { return _B_d + _offset; }; + T *C_d() const { return _C_d + _offset; }; + T *C_dd() const { return _C_dd + _offset; }; + + size_t maxNumElements() const { return _maxNumElements; }; + + + void offset(int offset) { _offset = offset; }; + int offset() const { return _offset; }; + +private: + T * _A_d; + T* _B_d; + T* _C_d; + T* _C_dd; + + + size_t _maxNumElements; + int _offset; +}; + +template +DeviceMemory::DeviceMemory(size_t numElements) + : _maxNumElements(numElements), + _offset(0) +{ + T ** np = nullptr; + HipTest::initArrays (&_A_d, &_B_d, &_C_d, np, np, np, numElements, 0); + + + size_t sizeElements = numElements * sizeof(T); + + + HIPCHECK ( hipMalloc(&_C_dd, sizeElements) ); +} + + +template +DeviceMemory::~DeviceMemory () +{ + T * np = nullptr; + HipTest::freeArrays (_A_d, _B_d, _C_d, np, np, np, 0); + + HIPCHECK (hipFree(_C_dd)); + + _C_dd = NULL; +}; + + + +//------- +template +class HostMemory +{ +public: + HostMemory(size_t numElements, bool usePinnedHost); + void reset(size_t numElements, bool full=false) ; + ~HostMemory(); + + + T *A_h() const { return _A_h + _offset; }; + T *B_h() const { return _B_h + _offset; }; + T *C_h() const { return _C_h + _offset; }; + + + + size_t maxNumElements() const { return _maxNumElements; }; + + void offset(int offset) { _offset = offset; }; + int offset() const { return _offset; }; +public: + + // Host arrays, secondary copy + T * A_hh; + T* B_hh; + + bool _usePinnedHost; +private: + size_t _maxNumElements; + + int _offset; + + // Host arrays + T * _A_h; + T* _B_h; + T* _C_h; +}; + +template +HostMemory::HostMemory(size_t numElements, bool usePinnedHost) + : _maxNumElements(numElements), + _usePinnedHost(usePinnedHost), + _offset(0) +{ + T ** np = nullptr; + HipTest::initArrays (np, np, np, &_A_h, &_B_h, &_C_h, numElements, usePinnedHost); + + A_hh = NULL; + B_hh = NULL; + + + size_t sizeElements = numElements * sizeof(T); + + if (usePinnedHost) { + HIPCHECK ( hipHostMalloc((void**)&A_hh, sizeElements, hipHostMallocDefault) ); + HIPCHECK ( hipHostMalloc((void**)&B_hh, sizeElements, hipHostMallocDefault) ); + } else { + A_hh = (T*)malloc(sizeElements); + B_hh = (T*)malloc(sizeElements); + } + +} + + +template +void +HostMemory::reset(size_t numElements, bool full) +{ + // Initialize the host data: + for (size_t i=0; i +HostMemory::~HostMemory () +{ + HipTest::freeArraysForHost (_A_h, _B_h, _C_h, _usePinnedHost); + + if (_usePinnedHost) { + HIPCHECK (hipHostFree(A_hh)); + HIPCHECK (hipHostFree(B_hh)); + + } else { + free(A_hh); + free(B_hh); + } + T *A_hh = NULL; + T *B_hh = NULL; + +}; @@ -52,71 +210,57 @@ void printSep() // IN: useMemkindDefault : If true, use memkinddefault (runtime figures out direction). if false, use explicit memcpy direction. // template -void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) +void memcpytest2(DeviceMemory *dmem, HostMemory *hmem, size_t numElements, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) { size_t sizeElements = numElements * sizeof(T); - printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", + printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d, offsets:dev:%+d host:+%d\n", __func__, TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, - usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); - + hmem->_usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault, + dmem->offset(), hmem->offset() + ); - T *A_d, *B_d, *C_d; - T *A_h, *B_h, *C_h; - - HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost); + hmem->reset(numElements); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); - T *A_hh = NULL; - T *B_hh = NULL; - T *C_dd = NULL; + assert (numElements <= dmem->maxNumElements()); + assert (numElements <= hmem->maxNumElements()); if (useHostToHost) { - if (usePinnedHost) { - HIPCHECK ( hipHostMalloc((void**)&A_hh, sizeElements, hipHostMallocDefault) ); - HIPCHECK ( hipHostMalloc((void**)&B_hh, sizeElements, hipHostMallocDefault) ); - } else { - A_hh = (T*)malloc(sizeElements); - B_hh = (T*)malloc(sizeElements); - } - - // Do some extra host-to-host copies here to mix things up: - HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); - HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + HIPCHECK ( hipMemcpy(hmem->A_hh, hmem->A_h(), sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + HIPCHECK ( hipMemcpy(hmem->B_hh, hmem->B_h(), sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); - HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(dmem->A_d(), hmem->A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(dmem->B_d(), hmem->B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } else { - HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(dmem->A_d(), hmem->A_h(), sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(dmem->B_d(), hmem->B_h(), sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, dmem->A_d(), dmem->B_d(), dmem->C_d(), numElements); if (useDeviceToDevice) { - HIPCHECK ( hipMalloc(&C_dd, sizeElements) ); + // Do an extra device-to-device copy here to mix things up: + HIPCHECK ( hipMemcpy(dmem->C_dd(), dmem->C_d(), sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); - // Do an extra device-to-device copies here to mix things up: - HIPCHECK ( hipMemcpy(C_dd, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); + //Destroy the original dmem->C_d(): + HIPCHECK ( hipMemset(dmem->C_d(), 0x5A, sizeElements)); - //Destroy the original C_d: - HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements)); - - HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + HIPCHECK ( hipMemcpy(hmem->C_h(), dmem->C_dd(), sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); } else { - HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + HIPCHECK ( hipMemcpy(hmem->C_h(), dmem->C_d(), sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); } HIPCHECK ( hipDeviceSynchronize() ); - HipTest::checkVectorADD(A_h, B_h, C_h, numElements); + HipTest::checkVectorADD(hmem->A_h(), hmem->B_h(), hmem->C_h(), numElements); + - HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost); printf (" %s success\n", __func__); } @@ -129,11 +273,15 @@ void memcpytest2_for_type(size_t numElements) { printSep(); + DeviceMemory memD(numElements); + HostMemory memU(numElements, 0/*usePinnedHost*/); + HostMemory memP(numElements, 1/*usePinnedHost*/); + for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) { for (int useHostToHost =0; useHostToHost<=1; useHostToHost++) { // TODO for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) { for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) { - memcpytest2(numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); + memcpytest2(&memD, usePinnedHost ? &memP : &memU, numElements, useHostToHost, useDeviceToDevice, useMemkindDefault); } } } @@ -144,7 +292,7 @@ void memcpytest2_for_type(size_t numElements) //--- //Try many different sizes to memory copy. template -void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) +void memcpytest2_sizes(size_t maxElem=0) { printSep(); printf ("test: %s<%s>\n", __func__, TYPENAME(T)); @@ -159,14 +307,68 @@ void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) maxElem = free/sizeof(T)/5; } - printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB offset=%lu\n", - deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset); + printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", + deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0); + HIPCHECK ( hipDeviceReset() ); + DeviceMemory memD(maxElem); + HostMemory memU(maxElem, 0/*usePinnedHost*/); + HostMemory memP(maxElem, 1/*usePinnedHost*/); - for (size_t elem=64; elem+offset<=maxElem; elem*=2) { - HIPCHECK ( hipDeviceReset() ); - memcpytest2(elem+offset, 0, 1, 1, 0); // unpinned host - HIPCHECK ( hipDeviceReset() ); - memcpytest2(elem+offset, 1, 1, 1, 0); // pinned host + for (size_t elem=1; elem<=maxElem; elem*=2) { + memcpytest2(&memD, &memU, elem, 1, 1, 0); // unpinned host + memcpytest2(&memD, &memP, elem, 1, 1, 0); // pinned host + } +} + + +//--- +//Try many different sizes to memory copy. +template +void memcpytest2_offsets(size_t maxElem, bool devOffsets, bool hostOffsets) +{ + printSep(); + printf ("test: %s<%s>\n", __func__, TYPENAME(T)); + + int deviceId; + HIPCHECK(hipGetDevice(&deviceId)); + + size_t free, total; + HIPCHECK(hipMemGetInfo(&free, &total)); + + + printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", + deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0); + HIPCHECK ( hipDeviceReset() ); + DeviceMemory memD(maxElem); + HostMemory memU(maxElem, 0/*usePinnedHost*/); + HostMemory memP(maxElem, 1/*usePinnedHost*/); + + size_t elem = maxElem / 2; + + for (int offset=0; offset < 512; offset++) { + assert (elem + offset < maxElem); + if (devOffsets) { + memD.offset(offset); + } + if (hostOffsets) { + memU.offset(offset); + memP.offset(offset); + } + memcpytest2(&memD, &memU, elem, 1, 1, 0); // unpinned host + memcpytest2(&memD, &memP, elem, 1, 1, 0); // pinned host + } + + for (int offset=512; offset < elem; offset*=2) { + assert (elem + offset < maxElem); + if (devOffsets) { + memD.offset(offset); + } + if (hostOffsets) { + memU.offset(offset); + memP.offset(offset); + } + memcpytest2(&memD, &memU, elem, 1, 1, 0); // unpinned host + memcpytest2(&memD, &memP, elem, 1, 1, 0); // pinned host } } @@ -178,13 +380,17 @@ void multiThread_1(bool serialize, bool usePinnedHost) { printSep(); printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, TYPENAME(T), serialize, usePinnedHost); - std::thread t1 (memcpytest2,N, usePinnedHost,0,0,0); + DeviceMemory memD(N); + HostMemory mem1(N, usePinnedHost); + HostMemory mem2(N, usePinnedHost); + + std::thread t1 (memcpytest2, &memD, &mem1, N, 0,0,0); if (serialize) { t1.join(); } - std::thread t2 (memcpytest2,N, usePinnedHost,0,0,0); + std::thread t2 (memcpytest2,&memD, &mem2, N, 0,0,0); if (serialize) { t2.join(); } @@ -218,37 +424,39 @@ int main(int argc, char *argv[]) if (p_tests & 0x2) { - // Some tests around the 64MB boundary which have historically shown issues: - printf ("\n\n=== tests&0x2 (64MB boundary)\n"); -#if 0 + // Some tests around the 64KB boundary which have historically shown issues: + printf ("\n\n=== tests&0x2 (64KB boundary)\n"); + size_t maxElem = 32*1024*1024; + DeviceMemory memD(maxElem); + HostMemory memU(maxElem, 0/*usePinnedHost*/); + HostMemory memP(maxElem, 0/*usePinnedHost*/); // These all pass: - memcpytest2(15*1024*1024, 1, 0, 0, 0); - memcpytest2(16*1024*1024, 1, 0, 0, 0); - memcpytest2(16*1024*1024+16*1024, 1, 0, 0, 0); -#endif + memcpytest2(&memD, &memP, 15*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 16*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 16*1024*1024+16*1024, 0, 0, 0); + // Just over 64MB: - memcpytest2(16*1024*1024+512*1024, 1, 0, 0, 0); - memcpytest2(17*1024*1024+1024, 1, 0, 0, 0); - memcpytest2(32*1024*1024, 1, 0, 0, 0); - memcpytest2(32*1024*1024, 0, 0, 0, 0); - memcpytest2(32*1024*1024, 1, 1, 1, 0); - memcpytest2(32*1024*1024, 1, 1, 1, 0); + memcpytest2(&memD, &memP, 16*1024*1024+512*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 17*1024*1024+1024, 0, 0, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memU, 32*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); + + } + if (p_tests & 0x4) { - printf ("\n\n=== tests&4 (test sizes and offsets)\n"); + printf ("\n\n=== tests&4 (test sizes)\n"); HIPCHECK ( hipDeviceReset() ); + memcpytest2_sizes(0); printSep(); - memcpytest2_sizes(0,0); - printSep(); - memcpytest2_sizes(0,64); - printSep(); - memcpytest2_sizes(1024*1024, 13); - printSep(); - memcpytest2_sizes(1024*1024, 50); } + + if (p_tests & 0x8) { printf ("\n\n=== tests&8\n"); HIPCHECK ( hipDeviceReset() ); @@ -270,6 +478,27 @@ int main(int argc, char *argv[]) } + if (p_tests & 0x10) { + printf ("\n\n=== tests&0x10 (test device offsets)\n"); + HIPCHECK ( hipDeviceReset() ); + size_t maxSize = 256*1024; + memcpytest2_offsets (maxSize, true, false); + memcpytest2_offsets (maxSize, true, false); + memcpytest2_offsets(maxSize, true, false); + } + + + if (p_tests & 0x20) { + printf ("\n\n=== tests&0x10 (test device offsets)\n"); + HIPCHECK ( hipDeviceReset() ); + size_t maxSize = 256*1024; + memcpytest2_offsets (maxSize, false, true); + memcpytest2_offsets (maxSize, false, true); + memcpytest2_offsets(maxSize, false, true); + } + + + passed(); } diff --git a/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp b/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp index 1f7599491a..0a256d6362 100644 --- a/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp +++ b/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp @@ -56,5 +56,15 @@ int main(){ HIPCHECK_API(hipFree(NULL) , hipSuccess); HIPCHECK_API(hipHostFree(NULL) , hipSuccess); + + { + // Some negative testing - request a too-big allocation and verify it fails: + // Someday when we support virtual memory may need to refactor these: + size_t tooBig = 128LL*1024*1024*1024*1024; // 128 TB; + void *p; + HIPCHECK_API ( hipMalloc(&p, tooBig), hipErrorMemoryAllocation ); + HIPCHECK_API ( hipHostMalloc(&p, tooBig), hipErrorMemoryAllocation ); + } + passed(); }