AMD: parse the architecture as supplied by gcnArchName

The value provided by minor is truncated for AMD devices, so parse the value returned by gcnArchName instead to retrieve an accurate ID. We can also use a common value for all of GCN4 (gfx800) to avoid missing compatible devices.
ggerganov · Jan 18, 2025 · f77ea24 · f77ea24
1 parent 3edfa7d
commit f77ea24
Show file tree

Hide file tree

Showing 5 changed files with 796 additions and 12 deletions.
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
@@ -46,20 +46,20 @@
 #define GGML_CUDA_CC_VOLTA      700
 #define GGML_CUDA_CC_TURING     750
 #define GGML_CUDA_CC_AMPERE     800
-#define GGML_CUDA_CC_OFFSET_AMD 1000000
+#define GGML_CUDA_CC_OFFSET_AMD 0x1000000
 
 // GCN/CNDA, wave size is 64
-#define GGML_CUDA_CC_GCN4       (GGML_CUDA_CC_OFFSET_AMD + 803)  // Tonga, Fiji, Polaris, minimum for fast fp16
-#define GGML_CUDA_CC_VEGA       (GGML_CUDA_CC_OFFSET_AMD + 900)  // Vega56/64, minimum for fp16 dual issue
-#define GGML_CUDA_CC_VEGA20     (GGML_CUDA_CC_OFFSET_AMD + 906)  // MI50/Radeon VII, minimum for dp4a
-#define GGML_CUDA_CC_CDNA       (GGML_CUDA_CC_OFFSET_AMD + 908)  // MI100, minimum for MFMA, acc registers
-#define GGML_CUDA_CC_CDNA2      (GGML_CUDA_CC_OFFSET_AMD + 910)  // MI210, minimum acc register renameing
-#define GGML_CUDA_CC_CDNA3      (GGML_CUDA_CC_OFFSET_AMD + 942)  // MI300
+#define GGML_CUDA_CC_GCN4       (GGML_CUDA_CC_OFFSET_AMD + 0x800)  // Tonga, Fiji, Polaris, minimum for fast fp16
+#define GGML_CUDA_CC_VEGA       (GGML_CUDA_CC_OFFSET_AMD + 0x900)  // Vega56/64, minimum for fp16 dual issue
+#define GGML_CUDA_CC_VEGA20     (GGML_CUDA_CC_OFFSET_AMD + 0x906)  // MI50/Radeon VII, minimum for dp4a
+#define GGML_CUDA_CC_CDNA       (GGML_CUDA_CC_OFFSET_AMD + 0x908)  // MI100, minimum for MFMA, acc registers
+#define GGML_CUDA_CC_CDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x910)  // MI210, minimum acc register renaming
+#define GGML_CUDA_CC_CDNA3      (GGML_CUDA_CC_OFFSET_AMD + 0x942)  // MI300
 
 // RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32
-#define GGML_CUDA_CC_RDNA1      (GGML_CUDA_CC_OFFSET_AMD + 1010) // RX 5000
-#define GGML_CUDA_CC_RDNA2      (GGML_CUDA_CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
-#define GGML_CUDA_CC_RDNA3      (GGML_CUDA_CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
+#define GGML_CUDA_CC_RDNA1      (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
+#define GGML_CUDA_CC_RDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
+#define GGML_CUDA_CC_RDNA3      (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
 
 #define GGML_CUDA_CC_QY1        210
 #define GGML_CUDA_CC_QY2        220

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -119,6 +119,59 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
 #endif
 }
 
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+// Convert an AMD target name such as the one reported in gcnArchName into a numeric ID.
+//
+// The first three characters of devName are skipped (the "gfx" prefix), anything from the
+// first ':' onwards (":xnack-", ":sramecc+", ...) is dropped, and the rest is read as hex:
+//   "gfx1030"                -> GGML_CUDA_CC_OFFSET_AMD + 0x1030
+//   "gfx90a:sramecc+:xnack-" -> GGML_CUDA_CC_OFFSET_AMD + 0x90a
+//   "gfx10-3-generic"        -> GGML_CUDA_CC_OFFSET_AMD + 0x1030
+//   "gfx803"                 -> GGML_CUDA_CC_OFFSET_AMD + 0x800 (minor dropped for gfx8)
+//
+// Returns GGML_CUDA_CC_OFFSET_AMD with a zero major version when nothing could be parsed
+// (NULL, empty or too short name, unknown layout), so callers can detect the failure by
+// checking the major byte of the result.
+int ggml_cuda_parse_id(char devName[]) {
+    // A list of possible Target IDs can be found under the rocclr/clr repo in device.cpp
+    // these values are not stable so this is susceptible to breakage
+    // https://github.com/ROCm/clr/blob/amd-staging/rocclr/device/device.cpp
+    int archMajor = 0x0;
+    int archMinor = 0x0;
+
+    // fixed-size scratch copy instead of a variable-length array (not standard C++);
+    // 256 bytes matches the gcnArchName field of hipDeviceProp_t, longer input is truncated
+    char archName[256];
+
+    // nothing follows the 'gfx' prefix: bail out before touching the scratch buffer
+    const size_t devLen = devName ? strlen(devName) : 0;
+    if (devLen <= 3) {
+        return GGML_CUDA_CC_OFFSET_AMD;
+    }
+
+    // strip leading 'gfx' while copying into our buffer
+    size_t archLen = devLen - 3;
+    if (archLen >= sizeof(archName)) {
+        archLen = sizeof(archName) - 1;
+    }
+    memcpy(archName, devName + 3, archLen);
+    archName[archLen] = '\0';
+
+    // trim trailing :xnack- or :sramecc- statuses
+    archLen = strcspn(archName, ":");
+    archName[archLen] = '\0';
+
+    // tease out the version information
+    static const char genericSuffix[] = "-generic";
+    const size_t      suffixLen       = sizeof(genericSuffix) - 1;
+    if (archLen > suffixLen) {
+        // versions labeled generic use '-' as delimiter, e.g. "10-3-generic"
+        // strip the trailing "-generic" then read "<major>[-<minor>]" from what remains
+        if (strcmp(archName + archLen - suffixLen, genericSuffix) == 0) {
+            archName[archLen - suffixLen] = '\0';
+
+            // skip delimiters by hand: strtoul would otherwise treat '-' as a sign
+            const char * majorStr = archName;
+            while (*majorStr == '-') {
+                majorStr++;
+            }
+            char * end = NULL;
+            archMajor = (int)strtoul(majorStr, &end, 16);
+
+            const char * minorStr = strchr(end, '-');
+            if (minorStr) {
+                while (*minorStr == '-') {
+                    minorStr++;
+                }
+                if (*minorStr) {
+                    // generic targets carry no stepping, so the minor fills the high nibble
+                    archMinor = 0x10 * (int)strtoul(minorStr, NULL, 16);
+                }
+            }
+        }
+    } else if (archLen >= 3) {
+        // last two digits should be the minor * 0x10 + stepping
+        archMinor = (int)strtoul(&archName[archLen - 2], NULL, 16);
+        archName[archLen - 2] = '\0';
+
+        // only the major version remains
+        archMajor = (int)strtoul(archName, NULL, 16);
+    }
+
+    int archNum = GGML_CUDA_CC_OFFSET_AMD + archMajor * 0x100;
+
+    // be inclusive of the full gfx8 line for backward compatibility (Carrizo APUs, etc.)
+    if (archMajor != 8) {
+        archNum += archMinor;
+    }
+    return archNum;
+}
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+
 static ggml_cuda_device_info ggml_cuda_init() {
 #ifdef __HIP_PLATFORM_AMD__
     // Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -169,7 +222,6 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        GGML_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
 
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
@@ -178,10 +230,29 @@ static ggml_cuda_device_info ggml_cuda_init() {
         info.devices[id].smpb  = prop.sharedMemPerBlock;
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
         info.devices[id].smpbo = prop.sharedMemPerBlock;
-        info.devices[id].cc = 100*prop.major + 10*prop.minor + GGML_CUDA_CC_OFFSET_AMD;
+
+        info.devices[id].cc = ggml_cuda_parse_id(prop.gcnArchName);
+        if ((info.devices[id].cc & 0xff00) == 0x0) {
+            GGML_LOG_WARN("invalid architecture ID received for device %d %s: %s  cc %d.%d\n",
+                            id, prop.name, prop.gcnArchName, prop.major, prop.minor);
+
+            // Fallback to prop.major and prop.minor
+            if (prop.major > 0) {
+                info.devices[id].cc = GGML_CUDA_CC_OFFSET_AMD + prop.major * 0x100;
+
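+                // be inclusive of the full gfx8 line for backward compatibility (Carrizo APUs, etc.)
+                // NOTE(review): the check below tests prop.minor, while ggml_cuda_parse_id applies the
+                // same gfx8 rule to the major version - confirm which field is intended here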
+                if (prop.minor != 8) {
+                    info.devices[id].cc += prop.minor * 0x10;
+                }
+            }
+        }
+        GGML_LOG_INFO("  Device %d: %s, %s (0x%x), VMM: %s\n",
+                        id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff, device_vmm ? "yes" : "no");
 #else
         info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
         info.devices[id].cc = 100*prop.major + 10*prop.minor;
+        GGML_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n",
+                        id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
     }
 

diff --git a/scripts/fetch-amd-ids.py b/scripts/fetch-amd-ids.py
@@ -0,0 +1,107 @@
+#!/bin/env python3
+import _io
+import re
+import os
+import sys
+from datetime import date
+from pathlib import Path
+from urllib import request
+from urllib.request import urlopen
+
+reUrl = re.compile('^(http(s|)://)(www.|)[a-zA-Z0-9.]*/.*$')
+reSupportedIsas = re.compile('.*static constexpr Isa supportedIsas_.*')
+reTarget = re.compile('.*{([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*)},.*')
+
+src = "https://raw.githubusercontent.com/ROCm/clr/refs/heads/amd-staging/rocclr/device/device.cpp"
+srcType = 'url'
+
+targets = []
+
+def parse(items):
+    assert(type(items) == list )
+
+    depth = 0
+    i = 0
+    for line in items:
+        i += 1
+        line = str(line.encode("utf-8"))
+
+        if re.match(reSupportedIsas, line):
+            depth += 1
+            continue
+
+        if depth:
+            for char in line:
+                if char == '}':
+                    depth -= 1
+                    if depth < 1:
+                        break
+                elif char == '{':
+                    depth += 1
+
+            if depth < 1:
+                break
+
+            if re.match(reTarget, line):
+                itms = reTarget.split(line)
+                targets.append((itms[1].strip(' "'),itms[5].strip(' '),itms[6].strip(' '),itms[7].strip(' ')))
+
+
+if __name__ == '__main__':
+    buffer=""
+
+    if len(sys.argv) > 1:
+        src = sys.argv[1]
+        if re.fullmatch(reUrl, src):
+            srcType = 'url'
+
+        else:
+            srcType = 'file'
+            if not os.path.exists(src):
+                raise FileNotFoundError
+
+            _src = Path(src)
+            if not _src.exists():
+                raise FileNotFoundError
+
+    if srcType == "url":
+        urlreq = request.Request(src)
+        data = urlopen(urlreq)
+        buffer = str(data.read().decode("utf-8"))
+
+        parse(buffer.splitlines())
+    else:
+        try:
+            num_lines = -1
+            with open(_src, 'r') as fileIn:
+                buffer = fileIn.readlines()
+
+            parse(buffer)
+
+        except Exception as exception:
+            print(exception)
+        finally:
+            if isinstance(fileIn, _io.TextIOWrapper) and not fileIn.close:
+                fileIn.close()
+
+    if len(targets) == 0:
+        print(f'No items found in {src}!', file=sys.stderr)
+        exit(1)
+
+    i = 0
+    print(f'struct target '"{")
+    print(f'    char id[256];')
+    print(f'    char major;')
+    print(f'    char minor;')
+    print(f'    char step;')
+    print("};")
+    print('')
+    print(f'// Automatically generated on {date.today()} from "{src}"')
+    print(f'struct target targets[{len(targets)}];')
+    for itm in targets:
+        assert(type(itm) == tuple)
+        print(f'strcpy(targets[{i}].id, "{itm[0]}");')
+        print(f'targets[{i}].major = {itm[1]};')
+        print(f'targets[{i}].minor = {itm[2]};')
+        print(f'targets[{i}].step  = {itm[3]};')
+        i += 1
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -143,6 +143,7 @@ if (NOT GGML_BACKEND_DL)
     llama_target_and_test(test-rope.cpp)
 endif()
 
+# llama_target_and_test(test-parse-amd-ids.c)
 
 # dummy executable - not installed
 get_filename_component(TEST_TARGET test-c.c NAME_WE)