From 82a2a2d04a3ba7b8f5afa8c8646ef3df65d27546 Mon Sep 17 00:00:00 2001
From: Matthias Knorr <MKKnorr@web.de>
Date: Thu, 28 Nov 2024 11:09:33 +0100
Subject: [PATCH] Docs: Update device memory pages

---
 .../memory_management/device_memory.rst       | 280 ++++++++++++++++--
 .../device_memory/texture_fetching.rst        | 113 ++++---
 include/hip/hip_runtime_api.h                 |   2 +-
 3 files changed, 325 insertions(+), 70 deletions(-)

diff --git a/docs/how-to/hip_runtime_api/memory_management/device_memory.rst b/docs/how-to/hip_runtime_api/memory_management/device_memory.rst
index 8b040d40ec..aacb5268ed 100644
--- a/docs/how-to/hip_runtime_api/memory_management/device_memory.rst
+++ b/docs/how-to/hip_runtime_api/memory_management/device_memory.rst
@@ -1,52 +1,280 @@
 .. meta::
   :description: This chapter describes the device memory of the HIP ecosystem
                 ROCm software.
-  :keywords: AMD, ROCm, HIP, device memory
+  :keywords: AMD, ROCm, HIP, GPU, device memory, global, constant, texture, surface, shared
 
 .. _device_memory:
 
-*******************************************************************************
+********************************************************************************
 Device memory
-*******************************************************************************
+********************************************************************************
 
-Device memory exists on the device, e.g. on GPUs in the video random access
-memory (VRAM), and is accessible by the kernels operating on the device. Recent
-architectures use graphics double data rate (GDDR) synchronous dynamic
-random-access memory (SDRAM) such as GDDR6, or high-bandwidth memory (HBM) such
-as HBM2e. Device memory can be allocated as global memory, constant, texture or
-surface memory.
+Device memory is random access memory that is physically located on a GPU. In
+general it is memory with a bandwidth that is an order of magnitude higher
+compared to RAM available to the host. That high bandwidth is only available to
+on-device accesses, accesses from the host or other devices have to go over a
+special interface, usually the PCIe bus or AMD Infinity Fabric, which is
+considerably slower.
+
+On certain architectures like APUs, the GPU and CPU share the same physical
+memory.
+
+There is also a special local data share on-chip directly accessible to the
+:ref:`compute units <hardware_implementation>`, that can be used for shared
+memory.
+
+The physical device memory can be used to back up several different memory
+spaces in HIP:
 
 Global memory
 ================================================================================
 
-Read-write storage visible to all threads on a given device. There are
-specialized versions of global memory with different usage semantics which are
-typically backed by the same hardware, but can use different caching paths.
+Global memory is the general read-write accessible memory visible to all threads
+on a given device. Since variables located in global memory have to be marked
+with the ``__device__`` qualifier, this memory space is also referred to as
+device memory.
+
+Without explicitly copying it, it can only be accessed by the threads within a
+kernel operating on the device, however :ref:`unified memory` can be used to
+let the runtime manage this, if desired.
+
+Allocating global memory
+--------------------------------------------------------------------------------
+
+This memory needs to be explicitly allocated.
+
+It can be allocated from the host via the :ref:`HIP runtime memory management
+functions <memory_management_reference>` like :cpp:func:`hipMalloc`, or can be
+defined using the ``__device__`` qualifier on variables.
+
+It can also be allocated within a kernel using ``malloc`` or ``new``.
+The specified amount of memory is allocated by each thread that executes the
+instructions. The recommended way to allocate the memory depends on the use
+case. If the memory is intended to be shared between the threads of a block, it
+is generally beneficial to allocate on large block of memory, due the way the
+memory is accessed.
+
+.. note::
+   Memory allocated within a kernel can only be freed in kernels, not by the HIP
+   runtime on the host, like :cpp:func:`hipFree`. It is also not possible to
+   free device memory allocated on the host, with e.g. :cpp:func:`hipMalloc`, in
+   a kernel.
+
+
+An example for how to share memory allocated within a kernel by only one thread
+is given in the following example. In case the device memory is only needed for
+communication between the threads in a single block, :ref:`shared_memory` is the
+better option, but is also limited in size.
+
+.. code-block:: cpp
+
+  __global__ void kernel_memory_allocation(TYPE* pointer){
+  // The pointer is stored in shared memory, so that all
+  // threads of the block can access the pointer
+  __shared__ int *memory;
+
+  size_t blockSize = blockDim.x;
+  constexpr size_t elementsPerThread = 1024;
+  if(threadIdx.x == 0){
+    // allocate memory in one contiguous block
+    memory = new int[blockDim.x * elementsPerThread];
+  }
+  __syncthreads();
+
+  // load pointer into thread-local variable to avoid
+  // unnecessary accesses to shared memory
+  int *localPtr = memory;
+
+  // work with allocated memory, e.g. initialization
+  for(int i = 0; i < elementsPerThread; ++i){
+    // access in a contiguous way
+    localPtr[i * blockSize + threadIdx.x] = i;
+  }
+
+  // synchronize to make sure no thread is accessing the memory before freeing
+  __syncthreads();
+  if(threadIdx.x == 0){
+    delete[] memory;
+  }
+}
+
+Copying between device and host
+--------------------------------------------------------------------------------
+
+When not using :ref:`unified memory`, memory has to be explicitly copied between
+the device and the host, using the HIP runtime API.
+
+.. code-block:: cpp
+
+  size_t elements = 1 << 20;
+  size_t size_bytes = elements * sizeof(int);
+
+  // allocate host and device memory
+  int *host_pointer = new int[elements];
+  int *device_input, *device_result;
+  HIP_CHECK(hipMalloc(&device_input, size_bytes));
+  HIP_CHECK(hipMalloc(&device_result, size_bytes));
+
+  // copy from host to the device
+  HIP_CHECK(hipMemcpy(device_input, host_pointer, size_bytes, hipMemcpyHostToDevice));
+
+  // Use memory on the device, i.e. execute kernels
+
+  // copy from device to host, to e.g. get results from the kernel
+  HIP_CHECK(hipMemcpy(host_pointer, device_result, size_bytes, hipMemcpyDeviceToHost));
+
+  // free memory when not needed any more
+  HIP_CHECK(hipFree(device_result));
+  HIP_CHECK(hipFree(device_input));
+  delete[] host_pointer;
 
 Constant memory
 ================================================================================
 
-Read-only storage visible to all threads on a given device. It is a limited
-segment backed by device memory with queryable size. It needs to be set by the
-host before kernel execution. Constant memory provides the best performance
-benefit when all threads within a warp access the same address.
+Constant memory is read-only storage visible to all threads on a given device.
+It is a limited segment backed by device memory, that takes a different caching
+route than normal device memory accesses. It needs to be set by the host before
+kernel execution.
+
+In order to get the highest bandwidth from the constant memory, all threads of
+a warp have to access the same memory address. If they access different
+addresses, the accesses get serialized and the bandwidth is therefore reduced.
+
+Using constant memory
+--------------------------------------------------------------------------------
+
+Constant memory can not be dynamically allocated, and the size has to be
+specified during compile time. If the values can not be specified during compile
+time, they have to be set by the host before the kernel, that accesses the
+constant memory, is called.
+
+.. code-block:: cpp
+
+  constexpr size_t const_array_size = 32;
+  __constant__ double const_array[const_array_size];
+
+  void set_constant_memory(double* values){
+    hipMemcpyToSymbol(const_array, values, const_array_size * sizeof(double));
+  }
+
+  __global__ void kernel_using_const_memory(double* array){
+
+    int warpIdx = threadIdx.x / warpSize;
+    // uniform access of warps to const_array for best performance
+    array[blockDim.x] *= const_array[warpIdx];
+  }
 
 Texture memory
 ================================================================================
 
-Read-only storage visible to all threads on a given device and accessible
-through additional APIs. Its origins come from graphics APIs, and provides
-performance benefits when accessing memory in a pattern where the
-addresses are close to each other in a 2D representation of the memory. 
+Texture memory is special read-only memory visible to all threads on a given
+device and accessible through additional APIs. Its origins come from graphics
+APIs, and provides performance benefits when accessing memory in a pattern where
+the addresses are close to each other in a 2D or 3D representation of the
+memory. It also provides additional features like filtering and addressing for
+out-of-bounds accesses, which are further explained in :ref:`texture_fetching`.
+
+The original use of the texture cache was also to take pressure off the global
+memory and other caches, however on modern GPUs, that support textures, the L1
+cache and texture cache are combined, so the main purpose is to make use of the
+texture specific features.
+
+To find out whether textures are supported on a device, query
+:cpp:enumerator:`hipDeviceAttributeImageSupport`.
+
+Using texture memory
+--------------------------------------------------------------------------------
 
-The :ref:`texture management module <texture_management_reference>` of the HIP
-runtime API reference contains the functions of texture memory.
+Textures are more complex than just a region of memory, so their layout has to
+be specified. They are represented by ``hipTextureObject_t`` and created using
+:cpp:func:`hipCreateTextureObject`.
+
+The underlying memory is a 1D, 2D or 3D ``hipArray_t``, that needs to be
+allocated using :cpp:func:`hipMallocArray`.
+
+On the device side, texture objects are accessed using the ``tex1D/2D/3D``
+functions.
+
+The texture management functions can be found in the :ref:`texture management
+API reference <texture_management_reference>`
+
+A full example for how to use textures can be found in the `ROCm texture
+management example <https://github.com/ROCm/rocm-examples/blob/develop/HIP-Basic/texture_management/main.hip>`_
 
 Surface memory
 ================================================================================
 
-A read-write version of texture memory, which can be useful for applications
-that require direct manipulation of 1D, 2D, or 3D hipArray_t. 
+A read-write version of texture memory. It is created in the same way as a
+texture, but with :cpp:func:`hipCreateSurfaceObject`.
+
+Since surfaces also cached in the read-only texture cache, the changes written
+back to the surface can't be observed in the same kernel. A new kernel has to be
+launched in order to see the updated surface.
+
+The corresponding functions are listed in the :ref:`surface object API reference
+<surface_object_reference>`.
+
+Shared memory
+================================================================================
+
+Shared memory is read-write memory, that is only visible to the threads within a
+block. It is allocated per thread block, and needs to be either statically
+allocated at compile time, or can be dynamically allocated when launching the
+kernel, but not during kernel execution. Its general use-case is to share
+variables between the threads within a block, but can also be used as scratch
+pad memory.
+
+Shared memory is not backed by the same physical memory as the other address
+spaces. It is on-chip memory local to the :ref:`compute units
+<hardware_implementation>`, providing low-latency, high-bandwidth access,
+comparable to the L1 cache. It is however limited in size, and as it is
+allocated per block, can restrict how many blocks can be scheduled to a compute
+unit concurrently, thereby potentially reducing occupancy.
+
+An overview of the size of the local data share (LDS), that backs up shared
+memory, is given in the
+:doc:`GPU hardware specifications <rocm:reference/gpu-arch-specs>`.
+
+Allocate shared memory
+--------------------------------------------------------------------------------
+
+Memory can be dynamically allocated by declaring an ``extern __shared__`` array,
+whose size can be set during kernel launch, which can then be accessed in the
+kernel.
+
+.. code-block:: cpp
+
+  extern __shared__ int dynamic_shared[];
+  __global__ void kernel(int sizeX, int sizeY, int totalSize){
+    int* array1 = dynamic_shared;
+    // array1 is interpreted as 2D of size:
+    int array1_size = sizeX * sizeY;
+    int* array2 = &(array[sizeX * sizeY]);
+
+    if(threadIdx.x < array1_size){
+      // access array1
+    }
+    if(threadIdx.x < (totalSize - array1_size)){
+      // access array2
+    }
+  }
+
+A more in-depth example on dynamically allocated shared memory can be found in
+the  `ROCm dynamic shared example
+<https://github.com/ROCm/rocm-examples/tree/develop/HIP-Basic/dynamic_shared>`_.
+
+To statically allocate shared memory, just declare it in the kernel. The memory
+is allocated per block, not per thread. If the kernel requires more shared
+memory than is available to the architecture, the compilation fails.
+
+.. code-block:: cpp
+
+  __global__ void kernel(){
+    __shared__ int array[128];
+    __shared__ double result;
+  }
+
+A more in-depth example on statically allocated shared memory can be found in
+the  `ROCm shared memory example
+<https://github.com/ROCm/rocm-examples/tree/develop/HIP-Basic/shared_memory>`_.
 
-The :ref:`surface objects module <surface_object_reference>` of HIP runtime API
-contains the functions for creating, destroying and reading surface memory.
\ No newline at end of file
diff --git a/docs/how-to/hip_runtime_api/memory_management/device_memory/texture_fetching.rst b/docs/how-to/hip_runtime_api/memory_management/device_memory/texture_fetching.rst
index a7f2873dd5..646d8afca6 100644
--- a/docs/how-to/hip_runtime_api/memory_management/device_memory/texture_fetching.rst
+++ b/docs/how-to/hip_runtime_api/memory_management/device_memory/texture_fetching.rst
@@ -5,56 +5,67 @@
 
 .. _texture_fetching:
 
-*******************************************************************************
+********************************************************************************
 Texture fetching
-*******************************************************************************
-
-`Textures <../../../../doxygen/html/group___texture.html>`_ are more than just a buffer
-interpreted as a 1D, 2D, or 3D array.
-
-As textures are associated with graphics, they are indexed using floating-point
-values. The index can be in the range of [0 to size-1] or [0 to 1].
-
-Depending on the index, texture sampling or texture addressing is performed,
-which decides the return value.
-
-**Texture sampling**: When a texture is indexed with a fraction, the queried
-value is often between two or more texels (texture elements). The sampling
-method defines what value to return in such cases.
-
-**Texture addressing**: Sometimes, the index is outside the bounds of the
-texture. This condition might look like a problem but helps to put a texture on
-a surface multiple times or to create a visible sign of out-of-bounds indexing,
-in computer graphics. The addressing mode defines what value to return when
-indexing a texture out of bounds.
-
-The different sampling and addressing modes are described in the following
-sections.
-
-Here is the sample texture used in this document for demonstration purposes. It
+********************************************************************************
+
+Textures give access to specialized hardware on GPUs that is usually used in
+graphics processing. In particular, textures use a different way of accessing
+their underlying device memory. Memory accesses to textures are routed through
+a special read-only texture cache, that is optimized for logical spatial
+locality, e.g. locality in 2D grids. This can also benefit certain algorithms
+used in GPGPU computing, when the access pattern is the same as used when
+accessing normal textures.
+
+Additionally, textures can be indexed using floating-point values. This is used
+in graphics applications to interpolate between neighboring values of a texture.
+Depending on the interpolation mode the index can be in the range of ``0`` to
+``size - 1`` or ``0`` to ``1``. Textures also have a way of handling
+out-of-bounds accesses.
+
+Depending on the value of the index, :ref:`texture filtering <texture_filtering>`
+or :ref:`texture addressing <texture_addressing>` is performed.
+
+Here is the example texture used in this document for demonstration purposes. It
 is 2x2 texels and indexed in the [0 to 1] range.
 
 .. figure:: ../../../../data/how-to/hip_runtime_api/memory_management/textures/original.png
   :width: 150
-  :alt: Sample texture
+  :alt: Example texture
   :align: center
 
   Texture used as example
 
-Texture sampling
-===============================================================================
+In HIP textures objects are of type :cpp:struct:`hipTextureObject_t` and created
+using :cpp:func:`hipCreateTextureObject`.
+
+For a full list of available texture functions see the :ref:`HIP texture API
+reference <texture_management_reference>`.
+
+A code example for how to use textures can be found in the `ROCm texture
+management example <https://github.com/ROCm/rocm-examples/blob/develop/HIP-Basic/texture_management/main.hip>`_
+
+.. _texture_filtering:
 
-Texture sampling handles the usage of fractional indices. It is the method that
-describes, which nearby values will be used, and how they are combined into the
-resulting value.
+Texture filtering
+================================================================================
 
-The various texture sampling methods are discussed in the following sections.
+Texture filtering handles the usage of fractional indices. When the index is a
+fraction, the queried value lies between two or more texels (texture elements),
+depending on the dimensionality of the texture. The filtering method defines how
+to interpolate between these values.
+
+The filter modes are specified in :cpp:enumerator:`hipTextureFilterMode`.
+
+The various texture filtering methods are discussed in the following sections.
 
 .. _texture_fetching_nearest:
 
-Nearest point sampling
+Nearest point filtering
 -------------------------------------------------------------------------------
 
+This filter mode corresponds to ``hipFilterModePoint``.
+
 In this method, the modulo of index is calculated as:
 
 ``tex(x) = T[floor(x)]``
@@ -70,22 +81,24 @@ of the nearest texel.
 
 .. figure:: ../../../../data/how-to/hip_runtime_api/memory_management/textures/nearest.png
   :width: 300
-  :alt: Texture upscaled with nearest point sampling
+  :alt: Texture upscaled with nearest point filtering
   :align: center
 
-  Texture upscaled with nearest point sampling
+  Texture upscaled with nearest point filtering
 
 .. _texture_fetching_linear:
 
 Linear filtering
 -------------------------------------------------------------------------------
 
+This filter mode corresponds to ``hipFilterModeLinear``.
+
 The linear filtering method does a linear interpolation between values. Linear
 interpolation is used to create a linear transition between two values. The
 formula used is ``(1-t)P1 + tP2`` where ``P1`` and ``P2`` are the values and
 ``t`` is within the [0 to 1] range.
 
-In the case of texture sampling the following formulas are used:
+In the case of linear texture filtering the following formulas are used:
 
 * For one dimensional textures: ``tex(x) = (1-α)T[i] + αT[i+1]``
 * For two dimensional textures: ``tex(x,y) = (1-α)(1-β)T[i,j] + α(1-β)T[i+1,j] + (1-α)βT[i,j+1] + αβT[i+1,j+1]``
@@ -95,7 +108,7 @@ Where x, y, and, z are the floating-point indices. i, j, and, k are the integer
 indices and, α, β, and, γ values represent how far along the sampled point is on
 the three axes. These values are calculated by these formulas: ``i = floor(x')``, ``α = frac(x')``, ``x' = x - 0.5``, ``j = floor(y')``, ``β = frac(y')``, ``y' = y - 0.5``, ``k = floor(z')``, ``γ = frac(z')`` and ``z' = z - 0.5``
 
-This following image shows a texture stretched out to a 4x4 pixel quad, but
+The following image shows a texture stretched out to a 4x4 pixel quad, but
 still indexed in the [0 to 1] range. The in-between values are interpolated
 between the neighboring texels.
 
@@ -106,12 +119,18 @@ between the neighboring texels.
 
   Texture upscaled with linear filtering
 
+.. _texture_addressing:
+
 Texture addressing
 ===============================================================================
 
-Texture addressing mode handles the index that is out of bounds of the texture.
-This mode describes which values of the texture or a preset value to use when
-the index is out of bounds.
+The texture addressing modes are specified in
+:cpp:enumerator:`hipTextureAddressMode`.
+
+The texture addressing mode handles out-of-bounds accesses to the texture. This
+can be used in graphics applications to e.g. repeat a texture on a surface
+multiple times in various ways or create visible signs of out-of-bounds
+indexing.
 
 The following sections describe the various texture addressing methods.
 
@@ -120,8 +139,10 @@ The following sections describe the various texture addressing methods.
 Address mode border
 -------------------------------------------------------------------------------
 
-In this method, the texture fetching returns a border value when indexing out of
-bounds. The border value must be set before texture fetching.
+This addressing mode is set using ``hipAddressModeBorder``.
+
+This addressing mode returns a border value when indexing out of bounds. The
+border value must be set before texture fetching.
 
 The following image shows the texture on a 4x4 pixel quad, indexed in the
 [0 to 3] range. The out-of-bounds values are the border color, which is yellow.
@@ -141,6 +162,8 @@ the addressing begins.
 Address mode clamp
 -------------------------------------------------------------------------------
 
+This addressing mode is set using ``hipAddressModeClamp``.
+
 This mode clamps the index between [0 to size-1]. Due to this, when indexing
 out-of-bounds, the values on the edge of the texture repeat. The clamp mode is
 the default addressing mode.
@@ -164,6 +187,8 @@ the addressing begins.
 Address mode wrap
 -------------------------------------------------------------------------------
 
+This addressing mode is set using ``hipAddressModeWrap``.
+
 Wrap mode addressing is only available for normalized texture coordinates. In
 this addressing mode, the fractional part of the index is used:
 
@@ -189,6 +214,8 @@ the addressing begins.
 Address mode mirror
 -------------------------------------------------------------------------------
 
+This addressing mode is set using ``hipAddressModeMirror``.
+
 Similar to the wrap mode the mirror mode is only available for normalized
 texture coordinates and also creates a repeating image, but mirroring the
 neighboring instances.
diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h
index 480a6ca7c9..b4c3e9a306 100644
--- a/include/hip/hip_runtime_api.h
+++ b/include/hip/hip_runtime_api.h
@@ -6343,7 +6343,7 @@ hipError_t hipExtLaunchKernel(const void* function_address, dim3 numBlocks, dim3
  *
  * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported, #hipErrorOutOfMemory
  *
- * @note 3D liner filter isn't supported on GFX90A boards, on which the API @p hipCreateTextureObject will
+ * @note 3D linear filter isn't supported on GFX90A boards, on which the API @p hipCreateTextureObject will
  * return hipErrorNotSupported.
  *
  */