From a3e04104ac5344a4ad7debb07eaa26d480a4b862 Mon Sep 17 00:00:00 2001 From: jiabaxie Date: Mon, 13 Nov 2023 20:52:07 -0500 Subject: [PATCH] SWDEV-427855, SWDEV-306642 - hip change for profiler and add texture/surface/device capabilities device struct entries Change-Id: Id992356b05931a1112fa7de7148efbb7b8b1bb15 --- include/hip/hip_deprecated.h | 94 +++++++++++++ include/hip/hip_runtime_api.h | 239 ++++++++++++++++++++++------------ 2 files changed, 251 insertions(+), 82 deletions(-) create mode 100644 include/hip/hip_deprecated.h diff --git a/include/hip/hip_deprecated.h b/include/hip/hip_deprecated.h new file mode 100644 index 0000000000..f6d5b0eb68 --- /dev/null +++ b/include/hip/hip_deprecated.h @@ -0,0 +1,94 @@ +#pragma once + +// This file will add older hip functions used in the versioning system +// Find the deprecated functions and structs in hip_device.cpp + +// This struct is also kept in hip_device.cpp +typedef struct hipDeviceProp_tR0000 { + char name[256]; ///< Device name. + size_t totalGlobalMem; ///< Size of global memory region (in bytes). + size_t sharedMemPerBlock; ///< Size of shared memory region (in bytes). + int regsPerBlock; ///< Registers per block. + int warpSize; ///< Warp size. + int maxThreadsPerBlock; ///< Max work items per work group or workgroup max size. + int maxThreadsDim[3]; ///< Max number of threads in each dimension (XYZ) of a block. + int maxGridSize[3]; ///< Max grid dimensions (XYZ). + int clockRate; ///< Max clock frequency of the multiProcessors in khz. + int memoryClockRate; ///< Max global memory clock frequency in khz. + int memoryBusWidth; ///< Global memory bus width in bits. + size_t totalConstMem; ///< Size of shared memory region (in bytes). + int major; ///< Major compute capability. On HCC, this is an approximation and features may + ///< differ from CUDA CC. See the arch feature flags for portable ways to query + ///< feature caps. + int minor; ///< Minor compute capability. On HCC, this is an approximation and features may + ///< differ from CUDA CC. See the arch feature flags for portable ways to query + ///< feature caps. + int multiProcessorCount; ///< Number of multi-processors (compute units). + int l2CacheSize; ///< L2 cache size. + int maxThreadsPerMultiProcessor; ///< Maximum resident threads per multi-processor. + int computeMode; ///< Compute mode. + int clockInstructionRate; ///< Frequency in khz of the timer used by the device-side "clock*" + ///< instructions. New for HIP. + hipDeviceArch_t arch; ///< Architectural feature flags. New for HIP. + int concurrentKernels; ///< Device can possibly execute multiple kernels concurrently. + int pciDomainID; ///< PCI Domain ID + int pciBusID; ///< PCI Bus ID. + int pciDeviceID; ///< PCI Device ID. + size_t maxSharedMemoryPerMultiProcessor; ///< Maximum Shared Memory Per Multiprocessor. + int isMultiGpuBoard; ///< 1 if device is on a multi-GPU board, 0 if not. + int canMapHostMemory; ///< Check whether HIP can map host memory + int gcnArch; ///< DEPRECATED: use gcnArchName instead + char gcnArchName[256]; ///< AMD GCN Arch Name. + int integrated; ///< APU vs dGPU + int cooperativeLaunch; ///< HIP device supports cooperative launch + int cooperativeMultiDeviceLaunch; ///< HIP device supports cooperative launch on multiple + ///< devices + int maxTexture1DLinear; ///< Maximum size for 1D textures bound to linear memory + int maxTexture1D; ///< Maximum number of elements in 1D images + int maxTexture2D[2]; ///< Maximum dimensions (width, height) of 2D images, in image elements + int maxTexture3D[3]; ///< Maximum dimensions (width, height, depth) of 3D images, in image + ///< elements + unsigned int* hdpMemFlushCntl; ///< Addres of HDP_MEM_COHERENCY_FLUSH_CNTL register + unsigned int* hdpRegFlushCntl; ///< Addres of HDP_REG_COHERENCY_FLUSH_CNTL register + size_t memPitch; ///< Maximum pitch in bytes allowed by memory copies + size_t textureAlignment; ///< Alignment requirement for textures + size_t texturePitchAlignment; ///< Pitch alignment requirement for texture references bound to + ///< pitched memory + int kernelExecTimeoutEnabled; ///< Run time limit for kernels executed on the device + int ECCEnabled; ///< Device has ECC support enabled + int tccDriver; ///< 1:If device is Tesla device using TCC driver, else 0 + int cooperativeMultiDeviceUnmatchedFunc; ///< HIP device supports cooperative launch on + ///< multiple + /// devices with unmatched functions + int cooperativeMultiDeviceUnmatchedGridDim; ///< HIP device supports cooperative launch on + ///< multiple + /// devices with unmatched grid dimensions + int cooperativeMultiDeviceUnmatchedBlockDim; ///< HIP device supports cooperative launch on + ///< multiple + /// devices with unmatched block dimensions + int cooperativeMultiDeviceUnmatchedSharedMem; ///< HIP device supports cooperative launch on + ///< multiple + /// devices with unmatched shared memories + int isLargeBar; ///< 1: if it is a large PCI bar device, else 0 + int asicRevision; ///< Revision of the GPU in this device + int managedMemory; ///< Device supports allocating managed memory on this system + int directManagedMemAccessFromHost; ///< Host can directly access managed memory on the device + ///< without migration + int concurrentManagedAccess; ///< Device can coherently access managed memory concurrently with + ///< the CPU + int pageableMemoryAccess; ///< Device supports coherently accessing pageable memory + ///< without calling hipHostRegister on it + int pageableMemoryAccessUsesHostPageTables; ///< Device accesses pageable memory via the host's + ///< page tables +} hipDeviceProp_tR0000; + +#ifdef __cplusplus +extern "C" { +#endif + +hipError_t hipGetDevicePropertiesR0000(hipDeviceProp_tR0000* prop, int device); +hipError_t hipChooseDeviceR0000(int* device, const hipDeviceProp_tR0000* prop); + +#ifdef __cplusplus +} +#endif diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h index 0eb81cb718..e072dc9a24 100644 --- a/include/hip/hip_runtime_api.h +++ b/include/hip/hip_runtime_api.h @@ -88,76 +88,151 @@ typedef struct hipUUID_t { //--- // Common headers for both NVCC and HCC paths: +#define hipGetDeviceProperties hipGetDevicePropertiesR0600 +#define hipDeviceProp_t hipDeviceProp_tR0600 +#define hipChooseDevice hipChooseDeviceR0600 + /** * hipDeviceProp * */ typedef struct hipDeviceProp_t { - char name[256]; ///< Device name. - size_t totalGlobalMem; ///< Size of global memory region (in bytes). - size_t sharedMemPerBlock; ///< Size of shared memory region (in bytes). - int regsPerBlock; ///< Registers per block. - int warpSize; ///< Warp size. - int maxThreadsPerBlock; ///< Max work items per work group or workgroup max size. - int maxThreadsDim[3]; ///< Max number of threads in each dimension (XYZ) of a block. - int maxGridSize[3]; ///< Max grid dimensions (XYZ). - int clockRate; ///< Max clock frequency of the multiProcessors in khz. - int memoryClockRate; ///< Max global memory clock frequency in khz. - int memoryBusWidth; ///< Global memory bus width in bits. - size_t totalConstMem; ///< Size of shared memory region (in bytes). + char name[256]; ///< Device name. + hipUUID uuid; ///< UUID of a device + char luid[8]; ///< 8-byte unique identifier. Only valid on windows + unsigned int luidDeviceNodeMask; ///< LUID node mask + size_t totalGlobalMem; ///< Size of global memory region (in bytes). + size_t sharedMemPerBlock; ///< Size of shared memory region (in bytes). + int regsPerBlock; ///< Registers per block. + int warpSize; ///< Warp size. + size_t memPitch; ///< Maximum pitch in bytes allowed by memory copies + ///< pitched memory + int maxThreadsPerBlock; ///< Max work items per work group or workgroup max size. + int maxThreadsDim[3]; ///< Max number of threads in each dimension (XYZ) of a block. + int maxGridSize[3]; ///< Max grid dimensions (XYZ). + int clockRate; ///< Max clock frequency of the multiProcessors in khz. + size_t totalConstMem; ///< Size of shared memory region (in bytes). int major; ///< Major compute capability. On HCC, this is an approximation and features may ///< differ from CUDA CC. See the arch feature flags for portable ways to query ///< feature caps. int minor; ///< Minor compute capability. On HCC, this is an approximation and features may ///< differ from CUDA CC. See the arch feature flags for portable ways to query ///< feature caps. - int multiProcessorCount; ///< Number of multi-processors (compute units). - int l2CacheSize; ///< L2 cache size. - int maxThreadsPerMultiProcessor; ///< Maximum resident threads per multi-processor. - int computeMode; ///< Compute mode. + size_t textureAlignment; ///< Alignment requirement for textures + size_t texturePitchAlignment; ///< Pitch alignment requirement for texture references bound to + int deviceOverlap; ///< Deprecated. Use asyncEngineCount instead + int multiProcessorCount; ///< Number of multi-processors (compute units). + int kernelExecTimeoutEnabled; ///< Run time limit for kernels executed on the device + int integrated; ///< APU vs dGPU + int canMapHostMemory; ///< Check whether HIP can map host memory + int computeMode; ///< Compute mode. + int maxTexture1D; ///< Maximum number of elements in 1D images + int maxTexture1DMipmap; ///< Maximum 1D mipmap texture size + int maxTexture1DLinear; ///< Maximum size for 1D textures bound to linear memory + int maxTexture2D[2]; ///< Maximum dimensions (width, height) of 2D images, in image elements + int maxTexture2DMipmap[2]; ///< Maximum number of elements in 2D array mipmap of images + int maxTexture2DLinear[3]; ///< Maximum 2D tex dimensions if tex are bound to pitched memory + int maxTexture2DGather[2]; ///< Maximum 2D tex dimensions if gather has to be performed + int maxTexture3D[3]; ///< Maximum dimensions (width, height, depth) of 3D images, in image + ///< elements + int maxTexture3DAlt[3]; ///< Maximum alternate 3D texture dims + int maxTextureCubemap; ///< Maximum cubemap texture dims + int maxTexture1DLayered[2]; ///< Maximum number of elements in 1D array images + int maxTexture2DLayered[3]; ///< Maximum number of elements in 2D array images + int maxTextureCubemapLayered[2]; ///< Maximum cubemaps layered texture dims + int maxSurface1D; ///< Maximum 1D surface size + int maxSurface2D[2]; ///< Maximum 2D surface size + int maxSurface3D[3]; ///< Maximum 3D surface size + int maxSurface1DLayered[2]; ///< Maximum 1D layered surface size + int maxSurface2DLayered[3]; ///< Maximum 2D layared surface size + int maxSurfaceCubemap; ///< Maximum cubemap surface size + int maxSurfaceCubemapLayered[2]; ///< Maximum cubemap layered surface size + size_t surfaceAlignment; ///< Alignment requirement for surface + int concurrentKernels; ///< Device can possibly execute multiple kernels concurrently. + int ECCEnabled; ///< Device has ECC support enabled + int pciBusID; ///< PCI Bus ID. + int pciDeviceID; ///< PCI Device ID. + int pciDomainID; ///< PCI Domain ID + int tccDriver; ///< 1:If device is Tesla device using TCC driver, else 0 + int asyncEngineCount; ///< Number of async engines + int unifiedAddressing; ///< Does device and host share unified address space + int memoryClockRate; ///< Max global memory clock frequency in khz. + int memoryBusWidth; ///< Global memory bus width in bits. + int l2CacheSize; ///< L2 cache size. + int persistingL2CacheMaxSize; ///< Device's max L2 persisting lines in bytes + int maxThreadsPerMultiProcessor; ///< Maximum resident threads per multi-processor. + int streamPrioritiesSupported; ///< Device supports stream priority + int globalL1CacheSupported; ///< Indicates globals are cached in L1 + int localL1CacheSupported; ///< Locals are cahced in L1 + size_t sharedMemPerMultiprocessor; ///< Amount of shared memory available per multiprocessor. + int regsPerMultiprocessor; ///< registers available per multiprocessor + int managedMemory; ///< Device supports allocating managed memory on this system + int isMultiGpuBoard; ///< 1 if device is on a multi-GPU board, 0 if not. + int multiGpuBoardGroupID; ///< Unique identifier for a group of devices on same multiboard GPU + int hostNativeAtomicSupported; ///< Link between host and device supports native atomics + int singleToDoublePrecisionPerfRatio; ///< Deprecated. CUDA only. + int pageableMemoryAccess; ///< Device supports coherently accessing pageable memory + ///< without calling hipHostRegister on it + int concurrentManagedAccess; ///< Device can coherently access managed memory concurrently with + ///< the CPU + int computePreemptionSupported; ///< Is compute preemption supported on the device + int canUseHostPointerForRegisteredMem; ///< Device can access host registered memory with same + ///< address as the host + int cooperativeLaunch; ///< HIP device supports cooperative launch + int cooperativeMultiDeviceLaunch; ///< HIP device supports cooperative launch on multiple + ///< devices + size_t + sharedMemPerBlockOptin; ///< Per device m ax shared mem per block usable by special opt in + int pageableMemoryAccessUsesHostPageTables; ///< Device accesses pageable memory via the host's + ///< page tables + int directManagedMemAccessFromHost; ///< Host can directly access managed memory on the device + ///< without migration + int maxBlocksPerMultiProcessor; ///< Max number of blocks on CU + int accessPolicyMaxWindowSize; ///< Max value of access policy window + size_t reservedSharedMemPerBlock; ///< Shared memory reserved by driver per block + int hostRegisterSupported; ///< Device supports hipHostRegister + int sparseHipArraySupported; ///< Indicates if device supports sparse hip arrays + int hostRegisterReadOnlySupported; ///< Device supports using the hipHostRegisterReadOnly flag + ///< with hipHostRegistger + int timelineSemaphoreInteropSupported; ///< Indicates external timeline semaphore support + int memoryPoolsSupported; ///< Indicates if device supports hipMallocAsync and hipMemPool APIs + int gpuDirectRDMASupported; ///< Indicates device support of RDMA APIs + unsigned int gpuDirectRDMAFlushWritesOptions; ///< Bitmask to be interpreted according to + ///< hipFlushGPUDirectRDMAWritesOptions + int gpuDirectRDMAWritesOrdering; ///< value of hipGPUDirectRDMAWritesOrdering + unsigned int + memoryPoolSupportedHandleTypes; ///< Bitmask of handle types support with mempool based IPC + int deferredMappingHipArraySupported; ///< Device supports deferred mapping HIP arrays and HIP + ///< mipmapped arrays + int ipcEventSupported; ///< Device supports IPC events + int clusterLaunch; ///< Device supports cluster launch + int unifiedFunctionPointers; ///< Indicates device supports unified function pointers + int reserved[63]; ///< CUDA Reserved. + + int hipReserved[32]; ///< Reserved for adding new entries for HIP/CUDA. + + /* HIP Only struct members */ + char gcnArchName[256]; ///< AMD GCN Arch Name. HIP Only. + size_t maxSharedMemoryPerMultiProcessor; ///< Maximum Shared Memory Per CU. HIP Only. int clockInstructionRate; ///< Frequency in khz of the timer used by the device-side "clock*" ///< instructions. New for HIP. hipDeviceArch_t arch; ///< Architectural feature flags. New for HIP. - int concurrentKernels; ///< Device can possibly execute multiple kernels concurrently. - int pciDomainID; ///< PCI Domain ID - int pciBusID; ///< PCI Bus ID. - int pciDeviceID; ///< PCI Device ID. - size_t maxSharedMemoryPerMultiProcessor; ///< Maximum Shared Memory Per Multiprocessor. - int isMultiGpuBoard; ///< 1 if device is on a multi-GPU board, 0 if not. - int canMapHostMemory; ///< Check whether HIP can map host memory - int gcnArch; ///< DEPRECATED: use gcnArchName instead - char gcnArchName[256]; ///< AMD GCN Arch Name. - int integrated; ///< APU vs dGPU - int cooperativeLaunch; ///< HIP device supports cooperative launch - int cooperativeMultiDeviceLaunch; ///< HIP device supports cooperative launch on multiple devices - int maxTexture1DLinear; ///< Maximum size for 1D textures bound to linear memory - int maxTexture1D; ///< Maximum number of elements in 1D images - int maxTexture2D[2]; ///< Maximum dimensions (width, height) of 2D images, in image elements - int maxTexture3D[3]; ///< Maximum dimensions (width, height, depth) of 3D images, in image elements - unsigned int* hdpMemFlushCntl; ///< Addres of HDP_MEM_COHERENCY_FLUSH_CNTL register - unsigned int* hdpRegFlushCntl; ///< Addres of HDP_REG_COHERENCY_FLUSH_CNTL register - size_t memPitch; ///