Skip to content

Commit

Permalink
Fix memory leak
Browse files Browse the repository at this point in the history
  • Loading branch information
jhalakpatel committed Nov 9, 2024
1 parent 6e647ca commit 7b25b15
Show file tree
Hide file tree
Showing 6 changed files with 13 additions and 47 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -795,12 +795,6 @@ class AllocTracker {
/// Returns true if the ptr is released internally.
bool isReleasedInternally(uintptr_t ptr) const;

/// Mark the pointer as having been allocated by TensorRT.
void setTensorRTAllocated(uintptr_t ptr);

/// Return whether the pointer was allocated by TensorRT.
bool getTensorRTAllocated(uintptr_t ptr);

private:
struct Metadata {
std::atomic<int32_t> externalReferenceCount = {0};
Expand Down
7 changes: 4 additions & 3 deletions mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "mlir-executor/Runtime/API/API.h"
#include "mlir-executor/Runtime/API/ExecutableFlatbuffer.h"
#include "mlir-executor/Runtime/Backend/Lua/LuaRuntime.h"
#include "mlir-executor/Runtime/Support/Support.h"
#include "mlir-executor/Support/Status.h"
#include "mlir/Support/FileUtilities.h"
#include "llvm/Support/Debug.h"
Expand Down Expand Up @@ -324,9 +325,9 @@ MTRT_Status mtrtMemRefCreateExternal(

MTRT_Status mtrtMemRefValueDestroyAsync(MTRT_MemRefValue buffer,
MTRT_Stream stream) {

MemRefValue *memref = unwrap(buffer);
llvm::dbgs() << "[MLIR-TRT] Deallocating memref pointer " << memref->getMemory() << "\n";
MTRT_DBGF("destroying memref pointer 0x%lx asynchronously",
memref->getMemory());
Status s = memref->getClient()->deallocate(
std::unique_ptr<MemRefValue>(memref),
mtrtStreamIsNull(stream) ? std::nullopt
Expand All @@ -338,7 +339,7 @@ MTRT_Status mtrtMemRefValueDestroyAsync(MTRT_MemRefValue buffer,

MTRT_Status mtrtMemRefValueDestroy(MTRT_MemRefValue buffer) {
MemRefValue *memref = unwrap(buffer);
llvm::dbgs() << "[MLIR-TRT] Deallocating memref pointer " << memref->getMemory() << "\n";
MTRT_DBGF("destroying memref pointer 0x%lx", memref->getMemory());
Status s =
memref->getClient()->deallocate(std::unique_ptr<MemRefValue>(memref));
if (!s.isOk())
Expand Down
14 changes: 0 additions & 14 deletions mlir-tensorrt/executor/lib/Runtime/API/API.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -396,20 +396,6 @@ AllocTracker::~AllocTracker() {
MTRT_DBGF("freed %zu bytes of unfreed memory", totalSize);
}

/// Flag the tracked allocation at `ptr` as owned by TensorRT.
/// Precondition: `ptr` must already be tracked in `map`.
void AllocTracker::setTensorRTAllocated(uintptr_t ptr) {
  // Debug-only sanity check: the pointer must be known to the tracker.
  assert(llvm::is_contained(map, ptr) &&
         llvm::formatv("Untracked pointer {0}", ptr).str().c_str());
  map.at(ptr)->tensorrtAllocated = true;
}

/// Query whether the tracked allocation at `ptr` was allocated by TensorRT.
/// Precondition: `ptr` must already be tracked in `map`.
bool AllocTracker::getTensorRTAllocated(uintptr_t ptr) {
  // Debug-only sanity check: the pointer must be known to the tracker.
  assert(llvm::is_contained(map, ptr) &&
         llvm::formatv("Untracked pointer {0}", ptr).str().c_str());
  return map.at(ptr)->tensorrtAllocated;
}

void AllocTracker::markReleasedInternally(uintptr_t ptr) {
assert(llvm::is_contained(map, ptr) &&
llvm::formatv("Untracked pointer {0}", ptr).str().c_str());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -432,13 +432,9 @@ registerCudaMemoryManagementOps(sol::state_view &lua,
cudaMemcpyDeviceToHost,
stream),
state);
if (allocTracker->getTensorRTAllocated(
reinterpret_cast<uintptr_t>(srcPtr))) {
// Free the TensorRT-allocated source pointer, since it won't be
// released by an external memref.
SET_LUA_ERROR_IF_CUDART_ERROR(cudaFreeAsync(srcPtr, stream), state);
allocTracker->untrack(reinterpret_cast<uintptr_t>(srcPtr));
}
if (allocTracker->get(src).isInternallyManaged() &&
allocTracker->getExternalReferenceCount(src))
allocTracker->markReleasedInternally(src);
};

lua["__cuda_memcpy_host_pinned2device"] =
Expand Down Expand Up @@ -487,13 +483,9 @@ registerCudaMemoryManagementOps(sol::state_view &lua,
cudaMemcpyDeviceToHost,
stream),
state);
if (allocTracker->getTensorRTAllocated(
reinterpret_cast<uintptr_t>(srcPtr))) {
// Free the TensorRT-allocated source pointer, since it won't be
// released by an external memref.
SET_LUA_ERROR_IF_CUDART_ERROR(cudaFreeAsync(srcPtr, stream), state);
allocTracker->untrack(reinterpret_cast<uintptr_t>(srcPtr));
}
if (allocTracker->get(src).isInternallyManaged() &&
allocTracker->getExternalReferenceCount(src))
allocTracker->markReleasedInternally(src);
};
lua["__cuda_memcpy_device2device"] = [allocTracker](
sol::this_state state,
Expand All @@ -518,13 +510,9 @@ registerCudaMemoryManagementOps(sol::state_view &lua,
cudaMemcpyDeviceToDevice,
stream),
state);
if (allocTracker->getTensorRTAllocated(
reinterpret_cast<uintptr_t>(srcPtr))) {
// Free the TensorRT-allocated source pointer, since it won't be
// released by an external memref.
SET_LUA_ERROR_IF_CUDART_ERROR(cudaFreeAsync(srcPtr, stream), state);
allocTracker->untrack(reinterpret_cast<uintptr_t>(srcPtr));
}
if (allocTracker->get(src).isInternallyManaged() &&
allocTracker->getExternalReferenceCount(src))
allocTracker->markReleasedInternally(src);
return;
};
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,6 @@ class OutputAllocatorImpl : public nvinfer1::IOutputAllocator {
if (memory.isOk()) {
mOutputPtr = (*memory).ptr;
mOutputSize = memory->size;
mTracker->setTensorRTAllocated(memory->ptr);
MTRT_DBGF(
"tensorrt module output allocator allocating %lu bytes at 0x%lx",
mOutputSize, mOutputPtr);
Expand Down
2 changes: 0 additions & 2 deletions mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -346,8 +346,6 @@ static std::unique_ptr<PyMemRefValue>
createMemRefViewFromDLPack(PyRuntimeClient &client, py::capsule capsule,
std::optional<bool> assertCanonicalStrides) {

llvm::dbgs() << "Creating a memref view from DL pack tensors\n";

DLManagedTensor *managedTensor = static_cast<DLManagedTensor *>(
PyCapsule_GetPointer(capsule.ptr(), "dltensor"));

Expand Down

0 comments on commit 7b25b15

Please sign in to comment.