Improve memory leak
jhalakpatel committed Nov 9, 2024
1 parent 6e647ca commit 9334481
Showing 9 changed files with 75 additions and 59 deletions.
14 changes: 7 additions & 7 deletions mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h
@@ -795,11 +795,11 @@ class AllocTracker {
   /// Returns true if the ptr is released internally.
   bool isReleasedInternally(uintptr_t ptr) const;
 
-  /// Set the pointer is allocated by TensorRT.
-  void setTensorRTAllocated(uintptr_t ptr);
+  /// Mark a pointer for release after consumption.
+  void markForReleaseAfterConsumption(uintptr_t ptr);
 
-  /// Get that pointer is allocated by TensorRT.
-  bool getTensorRTAllocated(uintptr_t ptr);
+  /// Check whether a pointer is marked for release after consumption.
+  bool isMarkedForReleaseAfterConsumption(uintptr_t ptr);
 
 private:
   struct Metadata {
@@ -808,7 +808,7 @@ class AllocTracker {
     // If this is true, then it should be truly released and untracked
     // when decrementExternalCount causes the count to go to zero.
     bool releasedInternally{false};
-    bool tensorrtAllocated{false};
+    bool releaseAfterConsumption{false};
     PointerInfo info;
   };
@@ -877,7 +877,7 @@ class RuntimeSession {
 
   ExecutableView getExecutable() const { return executable; }
 
-  PinnedMemoryAllocator &getPinnedMemorAllocator() {
+  PinnedMemoryAllocator &getPinnedMemoryAllocator() {
     return *pinnedMemoryAllocator;
   }
 
@@ -975,7 +975,7 @@ class RuntimeClient {
   ResourceTracker &getResourceTracker() { return resourceTracker; }
 
   /// Return the PinnedMemoryAllocator.
-  PinnedMemoryAllocator &getPinnedMemorAllocator() {
+  PinnedMemoryAllocator &getPinnedMemoryAllocator() {
     return pinnedMemoryAllocator;
   }
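
The core of the fix is visible in this header: the provenance flag tensorrtAllocated becomes an intent flag, releaseAfterConsumption, describing when a buffer should be freed rather than who allocated it. A minimal sketch of the pattern; SimpleAllocTracker, its map layout, and the assert messages are illustrative stand-ins for the real AllocTracker, not copies of it:

#include <cassert>
#include <cstdint>
#include <memory>
#include <unordered_map>

// Simplified stand-in for the runtime's AllocTracker. Each tracked pointer
// carries a flag saying "free this as soon as the consuming op is done".
class SimpleAllocTracker {
  struct Metadata {
    bool releaseAfterConsumption{false};
  };
  std::unordered_map<uintptr_t, std::unique_ptr<Metadata>> map;

public:
  void track(uintptr_t ptr) { map.emplace(ptr, std::make_unique<Metadata>()); }

  void markForReleaseAfterConsumption(uintptr_t ptr) {
    assert(map.count(ptr) && "untracked pointer");
    map.at(ptr)->releaseAfterConsumption = true;
  }

  bool isMarkedForReleaseAfterConsumption(uintptr_t ptr) const {
    assert(map.count(ptr) && "untracked pointer");
    return map.at(ptr)->releaseAfterConsumption;
  }
};

Naming the flag after the release policy rather than the allocation source is what lets the copy operations further down decide, purely locally, whether they are a buffer's last user.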
@@ -104,7 +104,11 @@ class PinnedMemoryAllocator {
   PinnedMemoryAllocator();
   ~PinnedMemoryAllocator();
 
-  /// Untracks
+  /// Marks a pointer as client-managed, deferring its deallocation.
+  /// This method is used when a pinned memory pointer is returned to the
+  /// client and its lifecycle is no longer managed by the
+  /// PinnedMemoryAllocator. Pointers marked this way will not be
+  /// automatically freed in the allocator's destructor.
   void untrack(uintptr_t ptr);
 
   StatusOr<PinnedMemoryBlock> allocate(size_t size);
@@ -117,8 +121,8 @@ class PinnedMemoryAllocator {
 private:
   EventPool eventPool;
 
-  /// Tracks all the pointers which need not to freed up.
-  static std::vector<uintptr_t> untrackedPtrs;
+  /// Stores pointers to memory blocks that are now managed by the client.
+  static std::vector<uintptr_t> clientManagedPtrs;
 
   /// Tracks all blocks allocated by the allocator.
   struct BlockTracker;
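
A minimal sketch of what untrack() and the clientManagedPtrs list imply for teardown; the bare global vector and the isClientManaged helper here are simplified stand-ins for the class members above:

#include <algorithm>
#include <cstdint>
#include <vector>

// Pointers handed off to the client are recorded so the allocator's
// teardown skips them instead of freeing memory the client still uses.
std::vector<uintptr_t> clientManagedPtrs;

void untrack(uintptr_t ptr) {
  if (std::find(clientManagedPtrs.begin(), clientManagedPtrs.end(), ptr) ==
      clientManagedPtrs.end())
    clientManagedPtrs.push_back(ptr);
}

bool isClientManaged(uintptr_t ptr) {
  return std::find(clientManagedPtrs.begin(), clientManagedPtrs.end(), ptr) !=
         clientManagedPtrs.end();
}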
7 changes: 4 additions & 3 deletions mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp
@@ -27,6 +27,7 @@
 #include "mlir-executor/Runtime/API/API.h"
 #include "mlir-executor/Runtime/API/ExecutableFlatbuffer.h"
 #include "mlir-executor/Runtime/Backend/Lua/LuaRuntime.h"
+#include "mlir-executor/Runtime/Support/Support.h"
 #include "mlir-executor/Support/Status.h"
 #include "mlir/Support/FileUtilities.h"
 #include "llvm/Support/Debug.h"
@@ -324,9 +325,9 @@ MTRT_Status mtrtMemRefCreateExternal(
 
 MTRT_Status mtrtMemRefValueDestroyAsync(MTRT_MemRefValue buffer,
                                         MTRT_Stream stream) {
-
   MemRefValue *memref = unwrap(buffer);
-  llvm::dbgs() << "[MLIR-TRT] Deallocating memref pointer " << memref->getMemory() << "\n";
+  MTRT_DBGF("destroying memref pointer 0x%lx asynchronously",
+            memref->getMemory());
   Status s = memref->getClient()->deallocate(
       std::unique_ptr<MemRefValue>(memref),
       mtrtStreamIsNull(stream) ? std::nullopt
@@ -338,7 +339,7 @@ MTRT_Status mtrtMemRefValueDestroyAsync(MTRT_MemRefValue buffer,
 
 MTRT_Status mtrtMemRefValueDestroy(MTRT_MemRefValue buffer) {
   MemRefValue *memref = unwrap(buffer);
-  llvm::dbgs() << "[MLIR-TRT] Deallocating memref pointer " << memref->getMemory() << "\n";
+  MTRT_DBGF("destroying memref pointer 0x%lx", memref->getMemory());
   Status s =
       memref->getClient()->deallocate(std::unique_ptr<MemRefValue>(memref));
   if (!s.isOk())
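
The removed llvm::dbgs() calls printed on every deallocation; an MTRT_DBGF-style macro only fires when the relevant debug type is enabled. A rough sketch of such a printf-style gated macro, assuming LLVM's DebugFlag/isCurrentDebugType machinery; the real MTRT_DBGF in Support/Support.h may be defined differently:

#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdio>

#define DEBUG_TYPE "runtime"

// printf-style debug logging, emitted only when the "runtime" debug type is
// enabled (e.g. via -debug-only=runtime). Purely illustrative.
#define MY_DBGF(fmt, ...)                                                     \
  do {                                                                        \
    if (llvm::DebugFlag && llvm::isCurrentDebugType(DEBUG_TYPE)) {            \
      char buf[256];                                                          \
      std::snprintf(buf, sizeof(buf), fmt, __VA_ARGS__);                      \
      llvm::dbgs() << "[" DEBUG_TYPE "] " << buf << "\n";                     \
    }                                                                         \
  } while (false)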
14 changes: 7 additions & 7 deletions mlir-tensorrt/executor/lib/Runtime/API/API.cpp
@@ -396,18 +396,18 @@ AllocTracker::~AllocTracker() {
   MTRT_DBGF("freed %zu bytes of unfreed memory", totalSize);
 }
 
-void AllocTracker::setTensorRTAllocated(uintptr_t ptr) {
+void AllocTracker::markForReleaseAfterConsumption(uintptr_t ptr) {
   assert(llvm::is_contained(map, ptr) &&
          llvm::formatv("Untracked pointer {0}", ptr).str().c_str());
   std::unique_ptr<Metadata> const &metadata = map.at(ptr);
-  metadata->tensorrtAllocated = true;
+  metadata->releaseAfterConsumption = true;
 }
 
-bool AllocTracker::getTensorRTAllocated(uintptr_t ptr) {
+bool AllocTracker::isMarkedForReleaseAfterConsumption(uintptr_t ptr) {
   assert(llvm::is_contained(map, ptr) &&
          llvm::formatv("Untracked pointer {0}", ptr).str().c_str());
   std::unique_ptr<Metadata> const &metadata = map.at(ptr);
-  return metadata->tensorrtAllocated;
+  return metadata->releaseAfterConsumption;
 }
 
 void AllocTracker::markReleasedInternally(uintptr_t ptr) {
@@ -486,8 +486,8 @@ void AllocTracker::track(PointerInfo info) {
   auto value = std::make_unique<Metadata>();
   value->externalReferenceCount.store(0);
   value->releasedInternally = false;
+  value->releaseAfterConsumption = false;
   value->info = info;
-  value->tensorrtAllocated = false;
   if (!contains(info.ptr)) {
     map.insert(std::make_pair(info.ptr, std::move(value)));
     return;
@@ -1001,7 +1001,7 @@ RuntimeClient::copyToDevice(const MemRefValue &hostBufferImpl,
   // TODO: Currently, this implementation supports only row major packed
   // canonical layout (no padding).
   StatusOr<mlirtrt::PinnedMemoryBlock> pinnedMemory =
-      this->getPinnedMemorAllocator().allocate(totalBufferSize);
+      this->getPinnedMemoryAllocator().allocate(totalBufferSize);
   if (!pinnedMemory.isOk())
     return pinnedMemory.getStatus();
 
@@ -1020,7 +1020,7 @@ RuntimeClient::copyToDevice(const MemRefValue &hostBufferImpl,
         reinterpret_cast<cudaStream_t>(*cudaStream)));
 
     // Free pinned host memory asynchronously.
-    getPinnedMemorAllocator().freeAsync(pinnedMemory->ptr, *cudaStream);
+    getPinnedMemoryAllocator().freeAsync(pinnedMemory->ptr, *cudaStream);
   } else {
     MTRT_DBG("synchronously copying {0} (host) to {1} (device), size={2} bytes",
              hostBufferImpl.getVoidPtr(), (*deviceMemRef)->getVoidPtr(),
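
For context on the copyToDevice path touched here: host data is staged through a pinned buffer so the device copy can run asynchronously, and the staging block may only be reclaimed after the copy completes. A self-contained sketch using raw CUDA runtime calls; copyToDeviceViaPinned is hypothetical, and the real code recycles blocks through PinnedMemoryAllocator::allocate/freeAsync rather than cudaMallocHost/cudaFreeHost:

#include <cstring>
#include <cuda_runtime_api.h>

// Stage a pageable host buffer through pinned memory for an async H2D copy.
cudaError_t copyToDeviceViaPinned(const void *hostSrc, void *deviceDst,
                                  size_t size, cudaStream_t stream) {
  void *pinned = nullptr;
  cudaError_t err = cudaMallocHost(&pinned, size); // page-locked allocation
  if (err != cudaSuccess)
    return err;

  std::memcpy(pinned, hostSrc, size); // CPU copy into the staging buffer
  err = cudaMemcpyAsync(deviceDst, pinned, size, cudaMemcpyHostToDevice,
                        stream);

  // The staging buffer must outlive the enqueued copy. An event marks the
  // point after which it is safe to reuse or free the block; an allocator's
  // freeAsync() would poll such an event instead of blocking like this.
  cudaEvent_t done;
  cudaEventCreateWithFlags(&done, cudaEventDisableTiming);
  cudaEventRecord(done, stream);
  cudaEventSynchronize(done);
  cudaEventDestroy(done);
  cudaFreeHost(pinned);
  return err;
}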
20 changes: 10 additions & 10 deletions mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp
@@ -152,7 +152,7 @@ LuaRuntimeSession::create(RuntimeSessionOptions options,
 
   // Register builtin methods.
   registerLuaRuntimeMethods(lua.lua_state(), session->getOptions(),
-                            &session->getPinnedMemorAllocator(),
+                            &session->getPinnedMemoryAllocator(),
                             &session->getAllocTracker(),
                             &session->getResourceTracker());
 
@@ -624,22 +624,22 @@ parseResults(const sol::protected_function_result &pfr,
           "This ptr is registered with the session and will now be tracked "
           "by the client as well.",
           allocPtr, static_cast<void *>(&session.getAllocTracker()),
-          static_cast<void *>(&session.getPinnedMemorAllocator()),
+          static_cast<void *>(&session.getPinnedMemoryAllocator()),
           static_cast<void *>(*client),
           static_cast<void *>(&(*client)->getAllocTracker()));
 
       // What we actually need here is to "release" the pointer from
       // session ownership and have the client assume it.
       PointerInfo info = session.getAllocTracker().get(allocPtr);
       session.getAllocTracker().untrack(info.ptr);
-
-      // It is possible that pinned memory also tracks the memory for
-      // deallocation.
-      session.getPinnedMemorAllocator().untrack(info.ptr);
-
-      AllocTracker &allocator = (*client)->getAllocTracker();
-      // if (!allocator.contains(info.ptr))
-      allocator.track(info);
+      (*client)->getAllocTracker().track(info);
 
+      // Defer deallocation of this pinned memory pointer. It is likely
+      // still in use by the client and should not be immediately freed.
+      // Untracking it here ensures it won't be deallocated in the
+      // PinnedMemoryAllocator's destructor, allowing the client to manage
+      // its lifecycle.
+      session.getPinnedMemoryAllocator().untrack(info.ptr);
 
       // Create a memref so that client now tracks it.
       auto memref = MemRefValue::create(
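
The reordered logic in parseResults is an ownership handoff between two trackers. A condensed sketch of the sequence; transferOwnership and the template parameters are stand-ins, not runtime API:

#include <cstdint>

struct PointerInfo { uintptr_t ptr; /* size, address space, ... */ };

// Hand a session-owned result buffer to the client without double-freeing
// it and without leaking it. Tracker/allocator types are simplified.
template <typename Tracker, typename PinnedAllocator>
void transferOwnership(Tracker &sessionTracker, PinnedAllocator &sessionPinned,
                       Tracker &clientTracker, uintptr_t allocPtr) {
  // 1. Snapshot the metadata, then drop it from the session tracker so
  //    session teardown no longer frees this pointer.
  PointerInfo info = sessionTracker.get(allocPtr);
  sessionTracker.untrack(info.ptr);

  // 2. The client tracker assumes ownership; the client-side deallocate()
  //    is now responsible for the final free.
  clientTracker.track(info);

  // 3. If the pinned-memory allocator also knows this block, mark it
  //    client-managed so the allocator's destructor skips it.
  sessionPinned.untrack(info.ptr);
}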
@@ -432,12 +432,14 @@ registerCudaMemoryManagementOps(sol::state_view &lua,
                                                    cudaMemcpyDeviceToHost,
                                                    stream),
                                    state);
-    if (allocTracker->getTensorRTAllocated(
-            reinterpret_cast<uintptr_t>(srcPtr))) {
-      // Free tensorrt allocate source pointer, since there it won't be
-      // released by external memref.
-      SET_LUA_ERROR_IF_CUDART_ERROR(cudaFreeAsync(srcPtr, stream), state);
-      allocTracker->untrack(reinterpret_cast<uintptr_t>(srcPtr));
+    // Check if the source pointer is marked for release after consumption.
+    if (allocTracker->isMarkedForReleaseAfterConsumption(src)) {
+      // This pointer was allocated by TensorRT and used in a device-device
+      // or device-host copy operation. It is not wrapped in a memref, so
+      // it won't be released by external memref destruction; we need to
+      // free it explicitly.
+      SET_LUA_ERROR_IF_ERROR(runtime::safeDeallocate(*allocTracker, src),
+                             state);
     }
   };
 
@@ -487,12 +489,14 @@ registerCudaMemoryManagementOps(sol::state_view &lua,
                                                    cudaMemcpyDeviceToHost,
                                                    stream),
                                    state);
-    if (allocTracker->getTensorRTAllocated(
-            reinterpret_cast<uintptr_t>(srcPtr))) {
-      // Free tensorrt allocate source pointer, since there it won't be
-      // released by external memref.
-      SET_LUA_ERROR_IF_CUDART_ERROR(cudaFreeAsync(srcPtr, stream), state);
-      allocTracker->untrack(reinterpret_cast<uintptr_t>(srcPtr));
+    // Check if the source pointer is marked for release after consumption.
+    if (allocTracker->isMarkedForReleaseAfterConsumption(src)) {
+      // This pointer was allocated by TensorRT and used in a device-device
+      // or device-host copy operation. It is not wrapped in a memref, so
+      // it won't be released by external memref destruction; we need to
+      // free it explicitly.
+      SET_LUA_ERROR_IF_ERROR(runtime::safeDeallocate(*allocTracker, src),
+                             state);
     }
   };
   lua["__cuda_memcpy_device2device"] = [allocTracker](
@@ -518,12 +522,14 @@ registerCudaMemoryManagementOps(sol::state_view &lua,
                                                    cudaMemcpyDeviceToDevice,
                                                    stream),
                                    state);
-    if (allocTracker->getTensorRTAllocated(
-            reinterpret_cast<uintptr_t>(srcPtr))) {
-      // Free tensorrt allocate source pointer, since there it won't be
-      // released by external memref.
-      SET_LUA_ERROR_IF_CUDART_ERROR(cudaFreeAsync(srcPtr, stream), state);
-      allocTracker->untrack(reinterpret_cast<uintptr_t>(srcPtr));
+    // Check if the source pointer is marked for release after consumption.
+    if (allocTracker->isMarkedForReleaseAfterConsumption(src)) {
+      // This pointer was allocated by TensorRT and used in a device-device
+      // or device-host copy operation. It is not wrapped in a memref, so
+      // it won't be released by external memref destruction; we need to
+      // free it explicitly.
+      SET_LUA_ERROR_IF_ERROR(runtime::safeDeallocate(*allocTracker, src),
+                             state);
    }
     return;
   };
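
All three copy wrappers now share one consume-then-free shape: perform the async copy, then release the source if and only if it was flagged at allocation time. A generic sketch; memcpyAndMaybeRelease and the tracker's member safeDeallocate are illustrative stand-ins for the runtime's free function runtime::safeDeallocate:

#include <cstdint>
#include <cuda_runtime_api.h>

// Illustrative consume-then-free: copy from src, then free src immediately
// if it was marked release-after-consumption. Tracker type is a stand-in.
template <typename Tracker>
cudaError_t memcpyAndMaybeRelease(Tracker &tracker, uintptr_t src, void *dst,
                                  size_t size, cudaMemcpyKind kind,
                                  cudaStream_t stream) {
  cudaError_t err = cudaMemcpyAsync(dst, reinterpret_cast<void *>(src), size,
                                    kind, stream);
  if (err != cudaSuccess)
    return err;

  // The copy enqueued above is this pointer's last use: release it through
  // the tracker so the bookkeeping and the actual free stay in sync.
  if (tracker.isMarkedForReleaseAfterConsumption(src))
    tracker.safeDeallocate(src); // stream-ordered free in the real runtime
  return cudaSuccess;
}

Routing the free through safeDeallocate, instead of the old bare cudaFreeAsync plus untrack, keeps the deallocation and the tracker's state in one place.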
@@ -155,7 +155,12 @@ class OutputAllocatorImpl : public nvinfer1::IOutputAllocator {
     if (memory.isOk()) {
       mOutputPtr = (*memory).ptr;
       mOutputSize = memory->size;
-      mTracker->setTensorRTAllocated(memory->ptr);
+      // Mark the output pointer for release after consumption. This is
+      // necessary because TensorRT-allocated pointers used in device-device
+      // or device-host copies may not be wrapped in a memref and tracked
+      // by the client. Marking it here ensures it is explicitly freed
+      // after it is consumed in copy operations, preventing memory leaks.
+      mTracker->markForReleaseAfterConsumption(mOutputPtr);
       MTRT_DBGF(
           "tensorrt module output allocator allocating %lu bytes at 0x%lx",
           mOutputSize, mOutputPtr);
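
This allocator is the producer half of the handshake: buffers are flagged when created, and the copy wrappers above act as the consumers that free them. A schematic of the producer side; MarkingOutputAllocator and its allocate()/tracker types are assumptions, not the real OutputAllocatorImpl or the nvinfer1::IOutputAllocator interface:

#include <cstdint>

// Producer side of the release-after-consumption handshake: a TensorRT
// output allocator flags each buffer it allocates, and the copy that later
// consumes the buffer performs the free. All types are simplified stand-ins.
template <typename Tracker, typename Allocator>
class MarkingOutputAllocator {
  Tracker *mTracker;
  Allocator *mAllocator;
  uintptr_t mOutputPtr{0};
  uint64_t mOutputSize{0};

public:
  MarkingOutputAllocator(Tracker *t, Allocator *a)
      : mTracker(t), mAllocator(a) {}

  // Called when the engine needs storage for a dynamically shaped output.
  void *reallocateOutput(uint64_t size) {
    auto memory = mAllocator->allocate(size);
    mOutputPtr = memory.ptr;
    mOutputSize = memory.size;
    // Without a memref wrapper, nothing else would ever free this buffer;
    // flag it so the consuming copy releases it.
    mTracker->markForReleaseAfterConsumption(mOutputPtr);
    return reinterpret_cast<void *>(mOutputPtr);
  }
};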
18 changes: 10 additions & 8 deletions mlir-tensorrt/executor/lib/Support/Allocators.cpp
@@ -206,7 +206,7 @@ static void cudaFreeHostWrapper(uintptr_t ptr) {
 #endif
 }
 
-std::vector<uintptr_t> PinnedMemoryAllocator::untrackedPtrs;
+std::vector<uintptr_t> PinnedMemoryAllocator::clientManagedPtrs;
 
 struct PinnedMemoryAllocator::BlockTracker {
   std::set<Block *, BlockComparison> blocks;
@@ -218,13 +218,14 @@ struct PinnedMemoryAllocator::BlockTracker {
         "[PinnedMemoryAllocator] Releasing block tracker that has %lu blocks",
         blocks.size());
     for (Block *block : blocks) {
-      if (std::find(PinnedMemoryAllocator::untrackedPtrs.begin(),
-                    PinnedMemoryAllocator::untrackedPtrs.end(),
-                    block->ptr) == PinnedMemoryAllocator::untrackedPtrs.end()) {
+      if (std::find(clientManagedPtrs.begin(), clientManagedPtrs.end(),
+                    block->ptr) == clientManagedPtrs.end()) {
         ALLOC_DBGF("[PinnedMemoryAllocator] releasing block %lu of size %lu",
                    block->ptr, block->size);
         cudaFreeHostWrapper(block->ptr);
       }
+      // Blocks found in clientManagedPtrs are not freed here, as they are
+      // now managed by the client.
     }
   }
 };
@@ -251,7 +252,7 @@ StatusOr<PinnedMemoryBlock> PinnedMemoryAllocator::allocate(size_t size) {
   if (lowerBound != freeBlocks->set.end()) {
     Block *result = *lowerBound;
     freeBlocks->set.erase(result);
-    ALLOC_DBGF("re-using block %lu of size %lu", result->ptr, result->size);
+    ALLOC_DBGF("re-using block %lx of size %lu", result->ptr, result->size);
     return PinnedMemoryBlock{result->ptr, result->size};
   }
 
@@ -275,10 +276,11 @@ StatusOr<PinnedMemoryBlock> PinnedMemoryAllocator::allocate(size_t size) {
 #endif
 }
 
-// Free the given block.
+std::vector<uintptr_t> clientManagedPtrs;
 
 void PinnedMemoryAllocator::untrack(uintptr_t ptr) {
-  if (!llvm::is_contained(untrackedPtrs, ptr)) {
-    untrackedPtrs.emplace_back(ptr);
+  if (!llvm::is_contained(clientManagedPtrs, ptr)) {
+    clientManagedPtrs.emplace_back(ptr);
   }
 }
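
For reference, the allocate() fast path shown above is a best-fit lookup over a size-ordered set of free blocks. A self-contained sketch of that structure; Block and BlockComparison are modeled on the names in the diff, not copied from the source:

#include <cstdint>
#include <set>

struct Block {
  uintptr_t ptr;
  size_t size;
};

// Order free blocks by size so lower_bound finds the smallest block that
// still fits the request (classic best-fit).
struct BlockComparison {
  bool operator()(const Block *a, const Block *b) const {
    return a->size < b->size || (a->size == b->size && a->ptr < b->ptr);
  }
};

std::set<Block *, BlockComparison> freeBlocks;

Block *takeBestFit(size_t size) {
  Block probe{0, size};
  auto it = freeBlocks.lower_bound(&probe); // first block with size >= size
  if (it == freeBlocks.end())
    return nullptr; // caller falls back to a fresh pinned allocation
  Block *result = *it;
  freeBlocks.erase(it);
  return result;
}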
2 changes: 0 additions & 2 deletions mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp
@@ -346,8 +346,6 @@ static std::unique_ptr<PyMemRefValue>
 createMemRefViewFromDLPack(PyRuntimeClient &client, py::capsule capsule,
                            std::optional<bool> assertCanonicalStrides) {
 
-  llvm::dbgs() << "Creating a memref view from DL pack tensors\n";
-
   DLManagedTensor *managedTensor = static_cast<DLManagedTensor *>(
       PyCapsule_GetPointer(capsule.ptr(), "dltensor"));
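
The function touched here consumes a DLPack capsule. The standard protocol is to fetch the DLManagedTensor, take ownership, and rename the capsule so the producer's deleter does not run twice. A minimal pybind11 sketch; consumeDLPack is hypothetical, and whether the real code renames the capsule is not shown in this diff:

#include <dlpack/dlpack.h>
#include <pybind11/pybind11.h>

namespace py = pybind11;

// Standard DLPack consumption: the capsule is named "dltensor" until a
// consumer takes ownership, after which it must be renamed to
// "used_dltensor" so the producer's destructor does not free it again.
DLManagedTensor *consumeDLPack(py::capsule capsule) {
  auto *managed = static_cast<DLManagedTensor *>(
      PyCapsule_GetPointer(capsule.ptr(), "dltensor"));
  if (!managed)
    throw py::error_already_set();
  PyCapsule_SetName(capsule.ptr(), "used_dltensor");
  return managed; // caller later invokes managed->deleter(managed)
}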
