From 5931eb632b29570990241b9b31db273b6ddd89f4 Mon Sep 17 00:00:00 2001
From: Adel Johar <adel.johar@amd.com>
Date: Tue, 28 Jan 2025 17:13:25 +0100
Subject: [PATCH] Docs: Add xnack to unified memory page

---
 .../memory_management/unified_memory.rst      | 135 ++++++++++++++++++
 1 file changed, 135 insertions(+)
diff --git a/docs/how-to/hip_runtime_api/memory_management/unified_memory.rst b/docs/how-to/hip_runtime_api/memory_management/unified_memory.rst
index c253416928..111f5272ec 100644
--- a/docs/how-to/hip_runtime_api/memory_management/unified_memory.rst
+++ b/docs/how-to/hip_runtime_api/memory_management/unified_memory.rst
@@ -113,6 +113,141 @@ allocator can be used.
 :sup:`1` Works only with ``XNACK=1`` and kernels with HMM support. First GPU
 access causes recoverable page-fault.
 
+.. _xnack:
+
+XNACK
+-----
+
+On specific GPU architectures (which is referenced in the previous table), there is an
+option to automatically migrate pages of memory between host and device. This is important
+for managed memory, where the locality of the data is important for performance.
+Depending on the system, page migration may be disabled by default in which case managed
+memory will act like pinned host memory and suffer degraded performance.
+
+**XNACK** describes the GPU's ability to retry memory accesses that failed due to a page fault
+(which normally would lead to a memory access error), and instead retrieve the missing page.
+
+This also affects memory allocated by the system as indicated by the first table in
+:ref:`unified memory allocators`.
+
+Below is a small example that demonstrates an explicit page fault and how **XNACK** affects
+the page fault behavior.
+
+.. code-block:: cpp
+
+    #include <hip/hip_runtime.h>
+    #include <iostream>
+
+    #define HIP_CHECK(expression)              \
+    {                                          \
+        const hipError_t err = expression;     \
+        if(err != hipSuccess){                 \
+            std::cerr << "HIP error: "         \
+                << hipGetErrorString(err)      \
+                << " at " << __LINE__ << "\n"; \
+        }                                      \
+        exit(EXIT_FAILURE);                    \
+    }
+
+    __global__ void write_to_memory(int* data, int size)
+    {
+        int idx = blockIdx.x * blockDim.x + threadIdx.x;
+        if (idx < size)
+        {
+            // Writing to memory that may not have been allocated in GPU memory
+            data[idx] = idx * 2; // Triggers a page fault if not resident
+        }
+    }
+
+    int main()
+    {
+        const int N = 1024; // 1K elements
+        const int blocksize = 256;
+        int* data;
+
+        // Allocate unified memory
+        HIP_CHECK(hipMallocManaged(&data, N * sizeof(int)));
+
+        // Intentionally don't initialize or prefetch any part of the data
+        // No initialization: data is uninitialized but accessible
+
+        // Launch kernel that writes to all elements
+        dim3 threads(blocksize);
+        dim3 grids(N / blocksize)
+        hipLaunchKernelGGL(write_to_memory, grids, threads, 0, 0, data, N);
+
+        // Synchronize to ensure kernel execution and fault resolution
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Check results
+        bool pass = true;
+        for (int i = 0; i < 10; ++i)
+        {
+            if (data[i] != (i * 2))
+            {
+                pass = false;
+                std::cout << "Failed at position" << i << " with value " << data[i] <<std::endl;
+                break;
+            }
+        }
+
+        if (pass)
+        {
+            std::cout << "Passed" << std::endl;
+        }
+
+        // Free memory
+        HIP_CHECK(hipFree(data));
+        return 0;
+    }
+
+
+The key behaviors in the example above are as follows:-
+
+#. | No Prefetch or Initialization: The memory is allocated using hipMallocManaged, but
+   | it's not initialized or explicitly prefetched to the GPU.
+
+#. | Kernel Write: The kernel writes to the entire array, including memory locations
+   | that haven't been allocated in GPU memory yet. This triggers page faults for pages
+   | not currently mapped to the GPU.
+
+#. | If **XNACK** is enabled, page faults are handled gracefully: the runtime allocates
+   | or fetches the missing pages as needed, ensuring correct execution. If **XNACK** is
+   | disabled, the GPU would not handle the page faults, leading to undefined behavior.
+
+To check if page migration is available on a platform, use ``rocminfo``:
+
+.. code-block:: bash
+
+    $ rocminfo | grep xnack
+    Name:                    amdgcn-amd-amdhsa--gfx90a:sramecc+:xnack-
+
+Here, ``xnack-`` means that XNACK is available but is disabled by default.
+Turning on XNACK by setting the environment variable ``HSA_XNACK=1`` gives
+the expected result, ``xnack+``:
+
+.. code-block:: bash
+
+    $ HSA_XNACK=1 rocminfo | grep xnack
+    Name:                    amdgcn-amd-amdhsa--gfx90a:sramecc+:xnack+
+
+``hipcc`` by default generates code that runs correctly with both XNACK enabled or disabled.
+Setting the ``--offload-arch=``-option with ``xnack+`` or ``xnack-`` forces code to
+be only run with XNACK enabled or disabled respectively.
+
+.. code-block:: bash
+
+    # Compiled kernels will run regardless if XNACK is enabled or is disabled.
+    hipcc --offload-arch=gfx90a
+
+    # Compiled kernels will only run with XNACK enabled (XNACK=1)
+    # If XNACK is disabled, execution will fail because no compatible kernel is available.
+    hipcc --offload-arch=gfx90a:xnack+
+
+    # Compiled kernels will only run with XNACK disabled (XNACK=0)
+    # If XNACK is enabled, execution will fail because no compatible kernel is available.
+    hipcc --offload-arch=gfx90a:xnack-
+
 .. _unified memory allocators:
 
 Unified memory allocators