Skip to content

Commit

Permalink
AMD: parse the architecture as supplied by gcnArchName
Browse files Browse the repository at this point in the history
The value provided by `minor` is truncated for AMD devices; parse the value returned by `gcnArchName` instead to retrieve an accurate ID.

We can also use a single common value for GCN4 (gfx800) so that compatible gfx8 devices are not missed.
  • Loading branch information
Haus1 committed Jan 18, 2025
1 parent 3edfa7d commit f77ea24
Show file tree
Hide file tree
Showing 5 changed files with 796 additions and 12 deletions.
20 changes: 10 additions & 10 deletions ggml/src/ggml-cuda/common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -46,20 +46,20 @@
#define GGML_CUDA_CC_VOLTA 700
#define GGML_CUDA_CC_TURING 750
#define GGML_CUDA_CC_AMPERE 800
#define GGML_CUDA_CC_OFFSET_AMD 1000000
#define GGML_CUDA_CC_OFFSET_AMD 0x1000000

// GCN/CDNA, wave size is 64
#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 910) // MI210, minimum acc register renaming
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 942) // MI300
#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x800) // Tonga, Fiji, Polaris, minimum for fast fp16
#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum acc register renaming
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300

// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 1010) // RX 5000
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA

#define GGML_CUDA_CC_QY1 210
#define GGML_CUDA_CC_QY2 220
Expand Down
75 changes: 73 additions & 2 deletions ggml/src/ggml-cuda/ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,59 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
#endif
}

#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
// Convert an AMD target name as reported in gcnArchName (e.g. "gfx906:sramecc+:xnack-",
// "gfx1030", "gfx90a", "gfx10-3-generic") into a GGML_CUDA_CC_* style architecture ID.
//
// The digits of the target name are read as hexadecimal, so the result is
//   GGML_CUDA_CC_OFFSET_AMD + major * 0x100 + minor
// e.g. "gfx1030" -> offset + 0x1030, "gfx90a" -> offset + 0x90a, "gfx9-4-generic" -> offset + 0x940.
// Every gfx8 target is folded onto offset + 0x800.
//
// devName is only read, never modified.
// Returns GGML_CUDA_CC_OFFSET_AMD unchanged (major byte == 0) when the name cannot be decoded,
// so callers can detect the failure and fall back to another source of version information.
int ggml_cuda_parse_id(char devName[]) {
    // A list of possible Target IDs can be found under the rocclr/clr repo in device.cpp
    // these values are not stable so this is susceptible to breakage
    // https://github.com/ROCm/clr/blob/amd-staging/rocclr/device/device.cpp
    int archMajor = 0x0;
    int archMinor = 0x0;
    int archNum   = GGML_CUDA_CC_OFFSET_AMD;

    // Reject anything that is not "gfx<version>"; this also covers empty/short names, which
    // previously left the working buffer uninitialized before it was scanned.
    if (devName == NULL || strncmp(devName, "gfx", 3) != 0) {
        return archNum;
    }

    // skip the leading 'gfx' and stop at the first ':' (trailing :xnack- or :sramecc- statuses)
    const char * version = devName + 3;
    const size_t archLen = strcspn(version, ":");

    // Fixed-size scratch copy instead of a variable-length array.
    // NOTE(review): 256 presumably matches the size of hipDeviceProp_t::gcnArchName - confirm.
    char archName[256];
    if (archLen >= sizeof(archName)) {
        return archNum;
    }
    memcpy(archName, version, archLen);
    archName[archLen] = '\0';

    static const char generic[] = "-generic";
    const size_t genericLen = sizeof(generic) - 1;

    // tease out the version information
    if (archLen > genericLen && strcmp(archName + archLen - genericLen, generic) == 0) {
        // versions labeled generic use '-' as delimiter: "<major>-generic" or "<major>-<minor>-generic"
        // strip the trailing "-generic" then read what remains
        archName[archLen - genericLen] = '\0';

        char * rest = NULL;
        archMajor = (int) strtoul(archName, &rest, 16);
        if (*rest == '-') {
            // generic targets carry no stepping, so the minor digit occupies the high nibble
            archMinor = 0x10 * (int) strtoul(rest + 1, NULL, 16);
        }
    } else if (archLen >= 3 && archLen <= genericLen) {
        // last two digits should be the minor * 0x10 + stepping
        archMinor = (int) strtoul(&archName[archLen - 2], NULL, 16);
        archName[archLen - 2] = '\0';

        // only the major version remains
        archMajor = (int) strtoul(archName, NULL, 16);
    }
    archNum += archMajor * 0x100;

    // be inclusive of the full gfx8 line for backward compatibility (Carrizo APUs, etc.)
    if (archMajor != 8) {
        archNum += archMinor;
    }
    return archNum;
}
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)

static ggml_cuda_device_info ggml_cuda_init() {
#ifdef __HIP_PLATFORM_AMD__
// Workaround for a rocBLAS bug when using multiple graphics cards:
Expand Down Expand Up @@ -169,7 +222,6 @@ static ggml_cuda_device_info ggml_cuda_init() {

cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");

info.default_tensor_split[id] = total_vram;
total_vram += prop.totalGlobalMem;
Expand All @@ -178,10 +230,29 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.devices[id].smpb = prop.sharedMemPerBlock;
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
info.devices[id].smpbo = prop.sharedMemPerBlock;
info.devices[id].cc = 100*prop.major + 10*prop.minor + GGML_CUDA_CC_OFFSET_AMD;

info.devices[id].cc = ggml_cuda_parse_id(prop.gcnArchName);
if ((info.devices[id].cc & 0xff00) == 0x0) {
GGML_LOG_WARN("invalid architecture ID received for device %d %s: %s cc %d.%d\n",
id, prop.name, prop.gcnArchName, prop.major, prop.minor);

// Fallback to prop.major and prop.minor
if (prop.major > 0) {
info.devices[id].cc = GGML_CUDA_CC_OFFSET_AMD + prop.major * 0x100;

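// be inclusive of the full gfx8 line for backward compatibility (Carrizo APUs, etc.)
// NOTE(review): the check below tests prop.minor, but the gfx8 line is identified by prop.major
// (compare `archMajor != 8` in ggml_cuda_parse_id) - likely a typo, confirm and fix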
if (prop.minor != 8) {
info.devices[id].cc += prop.minor * 0x10;
}
}
}
GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s\n",
id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff, device_vmm ? "yes" : "no");
#else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor;
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
}

Expand Down
107 changes: 107 additions & 0 deletions scripts/fetch-amd-ids.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#!/usr/bin/env python3
import _io
import re
import os
import sys
from datetime import date
from pathlib import Path
from urllib import request
from urllib.request import urlopen

# Matches an absolute http(s) URL with a non-empty host and a path. The host part accepts
# anything up to the first '/', so hyphenated hosts and explicit ports are recognised as URLs
# instead of being mistaken for local file paths.
reUrl = re.compile(r'^https?://[^/\s]+/.*$')
# Matches the line that declares the supportedIsas_ table in device.cpp.
reSupportedIsas = re.compile('.*static constexpr Isa supportedIsas_.*')
# Matches one brace-enclosed table row and captures each of its comma-separated fields.
reTarget = re.compile('.*{([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*)},.*')

# Default input: the amd-staging copy of device.cpp; may be overridden on the command line.
src = "https://raw.githubusercontent.com/ROCm/clr/refs/heads/amd-staging/rocclr/device/device.cpp"
srcType = 'url'

# Filled by parse() with (id, major, minor, step) tuples.
targets = []

def parse(items):
    """Collect AMD ISA targets from the lines of a device.cpp source.

    Lines are skipped until one matches ``reSupportedIsas`` (the table
    declaration). From there braces are counted so that parsing stops at the
    brace closing the table. Every line in between that matches ``reTarget``
    contributes one ``(id, major, minor, step)`` tuple of strings, taken from
    fields 1, 5, 6 and 7 of the row, to the module-level ``targets`` list.

    Args:
        items: iterable of text lines (``str``); trailing newlines are fine.
    """
    depth = 0
    for line in items:
        if reSupportedIsas.match(line):
            # The declaration line opens the table; count its brace here.
            depth += 1
            continue

        if not depth:
            # Table not reached yet.
            continue

        for char in line:
            if char == '}':
                depth -= 1
                if depth < 1:
                    break
            elif char == '{':
                depth += 1

        if depth < 1:
            # Closing brace of the table itself: nothing more to read.
            break

        row = reTarget.match(line)
        if row:
            # Work on the text itself (not a bytes repr) and strip any
            # whitespace, so tab-aligned tables yield clean values.
            targets.append((
                row.group(1).strip(' \t"'),
                row.group(5).strip(),
                row.group(6).strip(),
                row.group(7).strip(),
            ))


if __name__ == '__main__':
    # An optional first argument replaces the default source. It is treated
    # as a URL when it matches reUrl, otherwise as a local file path.
    if len(sys.argv) > 1:
        src = sys.argv[1]
        if re.fullmatch(reUrl, src):
            srcType = 'url'
        else:
            srcType = 'file'
            _src = Path(src)
            if not _src.exists():
                raise FileNotFoundError(src)

    if srcType == "url":
        # Close the HTTP response as soon as the body has been read.
        with urlopen(request.Request(src)) as data:
            buffer = data.read().decode("utf-8")

        parse(buffer.splitlines())
    else:
        try:
            # The with-statement closes the file; no manual cleanup needed.
            with open(_src, 'r') as fileIn:
                buffer = fileIn.readlines()

            parse(buffer)
        except Exception as exception:
            # stdout carries the generated C, so report errors on stderr.
            print(exception, file=sys.stderr)
            sys.exit(1)

    if len(targets) == 0:
        print(f'No items found in {src}!', file=sys.stderr)
        sys.exit(1)

    # Emit the C struct definition followed by one initialisation group per target.
    print('struct target {')
    print(' char id[256];')
    print(' char major;')
    print(' char minor;')
    print(' char step;')
    print("};")
    print('')
    print(f'// Automatically generated on {date.today()} from "{src}"')
    print(f'struct target targets[{len(targets)}];')
    for i, itm in enumerate(targets):
        assert isinstance(itm, tuple)
        print(f'strcpy(targets[{i}].id, "{itm[0]}");')
        print(f'targets[{i}].major = {itm[1]};')
        print(f'targets[{i}].minor = {itm[2]};')
        print(f'targets[{i}].step = {itm[3]};')
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ if (NOT GGML_BACKEND_DL)
llama_target_and_test(test-rope.cpp)
endif()

# llama_target_and_test(test-parse-amd-ids.c)

# dummy executable - not installed
get_filename_component(TEST_TARGET test-c.c NAME_WE)
Expand Down
Loading

0 comments on commit f77ea24

Please sign in to comment.