AMD: parse the architecture as supplied by gcnArchName #11244

Open · wants to merge 1 commit into master
20 changes: 10 additions & 10 deletions ggml/src/ggml-cuda/common.cuh
@@ -46,20 +46,20 @@
#define GGML_CUDA_CC_VOLTA 700
#define GGML_CUDA_CC_TURING 750
#define GGML_CUDA_CC_AMPERE 800
-#define GGML_CUDA_CC_OFFSET_AMD 1000000
+#define GGML_CUDA_CC_OFFSET_AMD 0x1000000

// GCN/CDNA, wave size is 64
-#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
-#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
-#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
-#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
-#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 910) // MI210, minimum acc register renaming
-#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 942) // MI300
+#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x800) // Tonga, Fiji, Polaris, minimum for fast fp16
+#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
+#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
+#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
+#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum acc register renaming
+#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300

// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
-#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 1010) // RX 5000
-#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
-#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
+#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
+#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
+#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA

#define GGML_CUDA_CC_QY1 210
#define GGML_CUDA_CC_QY2 220
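With the offsets now written in hexadecimal, an AMD compute-capability value reads like the gfx number itself: the major version sits in bits 8 and up, the minor version in the high nibble of the low byte, and the stepping in the low nibble, all added to GGML_CUDA_CC_OFFSET_AMD. The following is a minimal, self-contained sketch of that layout, not part of the PR; it copies two of the macros above, and amd_cc is a hypothetical helper used only for illustration.

#include <assert.h>

#define GGML_CUDA_CC_OFFSET_AMD 0x1000000
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906)   // gfx906
#define GGML_CUDA_CC_RDNA3  (GGML_CUDA_CC_OFFSET_AMD + 0x1100)  // gfx1100

// Pack major/minor/stepping the same way the macros above do.
static int amd_cc(int major, int minor, int step) {
    return GGML_CUDA_CC_OFFSET_AMD + major * 0x100 + minor * 0x10 + step;
}

int main(void) {
    assert(amd_cc(0x9, 0x0, 0x6)  == GGML_CUDA_CC_VEGA20);  // MI50 / Radeon VII
    assert(amd_cc(0x11, 0x0, 0x0) == GGML_CUDA_CC_RDNA3);   // RX 7000
    return 0;
}

Masking such a value with 0xffff, as the new log line in ggml-cuda.cu does, recovers the gfx-style number for display.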
75 changes: 73 additions & 2 deletions ggml/src/ggml-cuda/ggml-cuda.cu
@@ -119,6 +119,59 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
#endif
}

#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
int ggml_cuda_parse_id(char devName[]) {
    // A list of possible Target IDs can be found under the rocclr/clr repo in device.cpp
    // these values are not stable so this is susceptible to breakage
    // https://github.com/ROCm/clr/blob/amd-staging/rocclr/device/device.cpp
    int archMajor = 0x0;
    int archMinor = 0x0;
    int archNum = GGML_CUDA_CC_OFFSET_AMD;
    int archLen = strlen(devName);
    char archName[archLen + 1];

    // strip leading 'gfx' while copying into our buffer
    if (archLen > 3) {
        strcpy(archName, &devName[3]);
        archLen -= 3;
    }

    // trim trailing :xnack- or :sramecc- statuses
    archLen = strcspn(archName, ":");
    archName[archLen] = '\0';

    // tease out the version information
    if (archLen > 8) {
        // versions labeled generic use '-' as delimiter
        // strip the trailing "-generic" then iterate through what remains
        if ((strstr(archName, "-generic"))) {
            archName[archLen - 8] = '\0';
            char * pch;
            if ((pch = strtok(archName, "-"))) {
                archMajor = (int)strtoul(pch, 0, 16);
                if ((pch = strtok(NULL, "-"))) {
                    archMinor = 0x10 * (int)strtoul(pch, 0, 16);
                }
            }
        }
    } else if (archLen >= 3) {
        // last two digits should be the minor * 0x10 + stepping
        archMinor = (int)strtoul(&archName[archLen - 2], 0, 16);
        archName[archLen - 2] = '\0';

        // only the major version remains
        archMajor = (int)strtoul(archName, 0, 16);
    }
    archNum += archMajor * 0x100;

    // be inclusive of the full gfx8 line for backward compatibility (Carrizo APUs, etc.)
    if (archMajor != 8) {
        archNum += archMinor;
    }
    return archNum;
}
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)

static ggml_cuda_device_info ggml_cuda_init() {
#ifdef __HIP_PLATFORM_AMD__
// Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -169,7 +222,6 @@ static ggml_cuda_device_info ggml_cuda_init() {

cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");

info.default_tensor_split[id] = total_vram;
total_vram += prop.totalGlobalMem;
@@ -178,10 +230,29 @@
info.devices[id].smpb = prop.sharedMemPerBlock;
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
info.devices[id].smpbo = prop.sharedMemPerBlock;
-info.devices[id].cc = 100*prop.major + 10*prop.minor + GGML_CUDA_CC_OFFSET_AMD;

info.devices[id].cc = ggml_cuda_parse_id(prop.gcnArchName);
if ((info.devices[id].cc & 0xff00) == 0x0) {
GGML_LOG_WARN("invalid architecture ID received for device %d %s: %s cc %d.%d\n",
id, prop.name, prop.gcnArchName, prop.major, prop.minor);

// Fallback to prop.major and prop.minor
if (prop.major > 0) {
info.devices[id].cc = GGML_CUDA_CC_OFFSET_AMD + prop.major * 0x100;

// be inclusive of the full gfx8 line for backward compatibility (Carrizo APUs, etc.)
if (prop.major != 8) {
info.devices[id].cc += prop.minor * 0x10;
}
}
}
GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s\n",
id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff, device_vmm ? "yes" : "no");
#else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor;
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
}

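ggml_cuda_parse_id() strips the leading "gfx", drops any trailing ":xnack" / ":sramecc" status flags, handles the '-'-delimited "-generic" target names, and collapses the whole gfx8 family onto 0x800; when no valid major version can be recovered, ggml_cuda_init() warns and falls back to prop.major/prop.minor. Below is a small hand-traced harness, not part of the PR: check_parse_id is a hypothetical helper, and the expected values were worked out from the code above rather than captured on real hardware.

#include <assert.h>

// Assumes ggml_cuda_parse_id() and GGML_CUDA_CC_OFFSET_AMD are in scope,
// e.g. when compiled together with a HIP build of ggml-cuda.cu.
static void check_parse_id(void) {
    char vega20[]  = "gfx906:sramecc+:xnack-";  // status flags are trimmed off
    char rdna3[]   = "gfx1100";
    char generic[] = "gfx10-3-generic";         // '-'-delimited generic target
    char gcn4[]    = "gfx803";                  // gfx8: minor/stepping dropped

    assert(ggml_cuda_parse_id(vega20)  == GGML_CUDA_CC_OFFSET_AMD + 0x906);
    assert(ggml_cuda_parse_id(rdna3)   == GGML_CUDA_CC_OFFSET_AMD + 0x1100);
    assert(ggml_cuda_parse_id(generic) == GGML_CUDA_CC_OFFSET_AMD + 0x1030);
    assert(ggml_cuda_parse_id(gcn4)    == GGML_CUDA_CC_OFFSET_AMD + 0x800);
}

The commented-out test-parse-amd-ids.c entry added to tests/CMakeLists.txt below hints that a similar check could eventually run under ctest.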
107 changes: 107 additions & 0 deletions scripts/fetch-amd-ids.py
@@ -0,0 +1,107 @@
#!/bin/env python3
import _io
import re
import os
import sys
from datetime import date
from pathlib import Path
from urllib import request
from urllib.request import urlopen

reUrl = re.compile('^(http(s|)://)(www.|)[a-zA-Z0-9.]*/.*$')
reSupportedIsas = re.compile('.*static constexpr Isa supportedIsas_.*')
reTarget = re.compile('.*{([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*)},.*')

src = "https://raw.githubusercontent.com/ROCm/clr/refs/heads/amd-staging/rocclr/device/device.cpp"
srcType = 'url'

targets = []

def parse(items):
    assert(type(items) == list )

    depth = 0
    i = 0
    for line in items:
        i += 1
        line = str(line.encode("utf-8"))

        if re.match(reSupportedIsas, line):
            depth += 1
            continue

        if depth:
            for char in line:
                if char == '}':
                    depth -= 1
                    if depth < 1:
                        break
                elif char == '{':
                    depth += 1

            if depth < 1:
                break

            if re.match(reTarget, line):
                itms = reTarget.split(line)
                targets.append((itms[1].strip(' "'),itms[5].strip(' '),itms[6].strip(' '),itms[7].strip(' ')))


if __name__ == '__main__':
    buffer=""

    if len(sys.argv) > 1:
        src = sys.argv[1]
        if re.fullmatch(reUrl, src):
            srcType = 'url'

        else:
            srcType = 'file'
            if not os.path.exists(src):
                raise FileNotFoundError

            _src = Path(src)
            if not _src.exists():
                raise FileNotFoundError

    if srcType == "url":
        urlreq = request.Request(src)
        data = urlopen(urlreq)
        buffer = str(data.read().decode("utf-8"))

        parse(buffer.splitlines())
    else:
        try:
            num_lines = -1
            with open(_src, 'r') as fileIn:
                buffer = fileIn.readlines()

            parse(buffer)

        except Exception as exception:
            print(exception)
        finally:
            if isinstance(fileIn, _io.TextIOWrapper) and not fileIn.close:
                fileIn.close()

    if len(targets) == 0:
        print(f'No items found in {src}!', file=sys.stderr)
        exit(1)

    i = 0
    print(f'struct target '"{")
    print(f' char id[256];')
    print(f' char major;')
    print(f' char minor;')
    print(f' char step;')
    print("};")
    print('')
    print(f'// Automatically generated on {date.today()} from "{src}"')
    print(f'struct target targets[{len(targets)}];')
    for itm in targets:
        assert(type(itm) == tuple)
        print(f'strcpy(targets[{i}].id, "{itm[0]}");')
        print(f'targets[{i}].major = {itm[1]};')
        print(f'targets[{i}].minor = {itm[2]};')
        print(f'targets[{i}].step = {itm[3]};')
        i += 1
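The script scrapes the supportedIsas_ table out of ROCm's device.cpp and prints a C snippet to stdout. Assuming the first initializer field of each entry is the target name and the 5th through 7th fields are its gfxip major/minor/stepping, which is what the regex groups pick out, the output looks roughly like the sketch below; the actual entries, count, and date depend on whichever device.cpp revision is fetched, and the gfx906 line is only an illustrative example.

struct target {
    char id[256];
    char major;
    char minor;
    char step;
};

// Automatically generated on <date> from "<source>"
struct target targets[<count>];
strcpy(targets[0].id, "gfx906");
targets[0].major = 9;
targets[0].minor = 0;
targets[0].step = 6;
/* ...one id/major/minor/step group per supported target... */

Note that the emitted strcpy/assignment lines are statements, so the generated snippet would need to sit inside a function (or be reworked into an initializer list) before it compiles.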
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
@@ -143,6 +143,7 @@ if (NOT GGML_BACKEND_DL)
llama_target_and_test(test-rope.cpp)
endif()

# llama_target_and_test(test-parse-amd-ids.c)

# dummy executable - not installed
get_filename_component(TEST_TARGET test-c.c NAME_WE)