AMD: parse the architecture as supplied by gcnArchName #11244

Open · wants to merge 1 commit into master
20 changes: 10 additions & 10 deletions ggml/src/ggml-cuda/common.cuh
@@ -46,20 +46,20 @@
#define GGML_CUDA_CC_VOLTA 700
#define GGML_CUDA_CC_TURING 750
#define GGML_CUDA_CC_AMPERE 800
-#define GGML_CUDA_CC_OFFSET_AMD 1000000
+#define GGML_CUDA_CC_OFFSET_AMD 0x1000000

// GCN/CDNA, wave size is 64
-#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
-#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
-#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
-#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
-#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 910) // MI210, minimum acc register renaming
-#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 942) // MI300
+#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x800) // Tonga, Fiji, Polaris, minimum for fast fp16
+#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
+#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
+#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
+#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum acc register renaming
+#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300

// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
-#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 1010) // RX 5000
-#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
-#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
+#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
+#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
+#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA

#define GGML_CUDA_CC_QY1 210
#define GGML_CUDA_CC_QY2 220
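With the offsets now written in hexadecimal, an AMD compute-capability value reads like the gfx number itself: the major version sits in bits 8 and up, the minor version in the high nibble of the low byte, and the stepping in the low nibble, all added to GGML_CUDA_CC_OFFSET_AMD. The following is a minimal, self-contained sketch of that layout, not part of the PR; it copies two of the macros above, and amd_cc is a hypothetical helper used only for illustration.

#include <assert.h>

#define GGML_CUDA_CC_OFFSET_AMD 0x1000000
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906)   // gfx906
#define GGML_CUDA_CC_RDNA3  (GGML_CUDA_CC_OFFSET_AMD + 0x1100)  // gfx1100

// Pack major/minor/stepping the same way the macros above do.
static int amd_cc(int major, int minor, int step) {
    return GGML_CUDA_CC_OFFSET_AMD + major * 0x100 + minor * 0x10 + step;
}

int main(void) {
    assert(amd_cc(0x9, 0x0, 0x6)  == GGML_CUDA_CC_VEGA20);  // MI50 / Radeon VII
    assert(amd_cc(0x11, 0x0, 0x0) == GGML_CUDA_CC_RDNA3);   // RX 7000
    return 0;
}

Masking such a value with 0xffff, as the new log line in ggml-cuda.cu does, recovers the gfx-style number for display.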
75 changes: 73 additions & 2 deletions ggml/src/ggml-cuda/ggml-cuda.cu
@@ -119,6 +119,59 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
#endif
}

#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
int ggml_cuda_parse_id(char devName[]) {
    // A list of possible Target IDs can be found under the rocclr/clr repo in device.cpp
    // these values are not stable so this is susceptible to breakage
    // https://github.com/ROCm/clr/blob/amd-staging/rocclr/device/device.cpp
    int archMajor = 0x0;
    int archMinor = 0x0;
    int archNum = GGML_CUDA_CC_OFFSET_AMD;
    int archLen = strlen(devName);
    char archName[archLen + 1];

    // strip leading 'gfx' while copying into our buffer
    if (archLen > 3) {
        strcpy(archName, &devName[3]);
        archLen -= 3;
    }

    // trim trailing :xnack- or :sramecc- statuses
    archLen = strcspn(archName, ":");
    archName[archLen] = '\0';

    // tease out the version information
    if (archLen > 8) {
        // versions labeled generic use '-' as delimiter
        // strip the trailing "-generic" then iterate through what remains
        if ((strstr(archName, "-generic"))) {
            archName[archLen - 8] = '\0';
            char * pch;
            if ((pch = strtok(archName, "-"))) {
                archMajor = (int)strtoul(pch, 0, 16);
                if ((pch = strtok(NULL, "-"))) {
                    archMinor = 0x10 * (int)strtoul(pch, 0, 16);
                }
            }
        }
    } else if (archLen >= 3) {
        // last two digits should be the minor * 0x10 + stepping
        archMinor = (int)strtoul(&archName[archLen - 2], 0, 16);
        archName[archLen - 2] = '\0';

        // only the major version remains
        archMajor = (int)strtoul(archName, 0, 16);
    }
    archNum += archMajor * 0x100;

    // be inclusive of the full gfx8 line for backward compatibility (Carrizo APUs, etc.)
    if (archMajor != 8) {
        archNum += archMinor;
    }
    return archNum;
}
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)

static ggml_cuda_device_info ggml_cuda_init() {
#ifdef __HIP_PLATFORM_AMD__
// Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -169,7 +222,6 @@ static ggml_cuda_device_info ggml_cuda_init() {

cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");

info.default_tensor_split[id] = total_vram;
total_vram += prop.totalGlobalMem;
@@ -178,10 +230,29 @@
info.devices[id].smpb = prop.sharedMemPerBlock;
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
info.devices[id].smpbo = prop.sharedMemPerBlock;
-info.devices[id].cc = 100*prop.major + 10*prop.minor + GGML_CUDA_CC_OFFSET_AMD;

info.devices[id].cc = ggml_cuda_parse_id(prop.gcnArchName);
if ((info.devices[id].cc & 0xff00) == 0x0) {
GGML_LOG_WARN("invalid architecture ID received for device %d %s: %s cc %d.%d\n",
id, prop.name, prop.gcnArchName, prop.major, prop.minor);

// Fallback to prop.major and prop.minor
if (prop.major > 0) {
info.devices[id].cc = GGML_CUDA_CC_OFFSET_AMD + prop.major * 0x100;

// be inclusive of the full gfx8 line for backward compatibility (Carrizo APUs, etc.)
if (prop.major != 8) {
info.devices[id].cc += prop.minor * 0x10;
}
}
}
GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s\n",
id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff, device_vmm ? "yes" : "no");
#else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor;
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
}

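ggml_cuda_parse_id() strips the leading "gfx", drops any trailing ":xnack" / ":sramecc" status flags, handles the '-'-delimited "-generic" target names, and collapses the whole gfx8 family onto 0x800; when no valid major version can be recovered, ggml_cuda_init() warns and falls back to prop.major/prop.minor. Below is a small hand-traced harness, not part of the PR: check_parse_id is a hypothetical helper, and the expected values were worked out from the code above rather than captured on real hardware.

#include <assert.h>

// Assumes ggml_cuda_parse_id() and GGML_CUDA_CC_OFFSET_AMD are in scope,
// e.g. when compiled together with a HIP build of ggml-cuda.cu.
static void check_parse_id(void) {
    char vega20[]  = "gfx906:sramecc+:xnack-";  // status flags are trimmed off
    char rdna3[]   = "gfx1100";
    char generic[] = "gfx10-3-generic";         // '-'-delimited generic target
    char gcn4[]    = "gfx803";                  // gfx8: minor/stepping dropped

    assert(ggml_cuda_parse_id(vega20)  == GGML_CUDA_CC_OFFSET_AMD + 0x906);
    assert(ggml_cuda_parse_id(rdna3)   == GGML_CUDA_CC_OFFSET_AMD + 0x1100);
    assert(ggml_cuda_parse_id(generic) == GGML_CUDA_CC_OFFSET_AMD + 0x1030);
    assert(ggml_cuda_parse_id(gcn4)    == GGML_CUDA_CC_OFFSET_AMD + 0x800);
}

The commented-out test-parse-amd-ids.c entry added to tests/CMakeLists.txt below hints that a similar check could eventually run under ctest.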
107 changes: 107 additions & 0 deletions scripts/fetch-amd-ids.py
@@ -0,0 +1,107 @@
#!/bin/env python3
import _io
import re
import os
import sys
from datetime import date
from pathlib import Path
from urllib import request
from urllib.request import urlopen

reUrl = re.compile('^(http(s|)://)(www.|)[a-zA-Z0-9.]*/.*$')
reSupportedIsas = re.compile('.*static constexpr Isa supportedIsas_.*')
reTarget = re.compile('.*{([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*)},.*')

src = "https://raw.githubusercontent.com/ROCm/clr/refs/heads/amd-staging/rocclr/device/device.cpp"
srcType = 'url'

targets = []

def parse(items):
    assert(type(items) == list )

    depth = 0
    i = 0
    for line in items:
        i += 1
        line = str(line.encode("utf-8"))

        if re.match(reSupportedIsas, line):
            depth += 1
            continue

        if depth:
            for char in line:
                if char == '}':
                    depth -= 1
                    if depth < 1:
                        break
                elif char == '{':
                    depth += 1

            if depth < 1:
                break

            if re.match(reTarget, line):
                itms = reTarget.split(line)
                targets.append((itms[1].strip(' "'),itms[5].strip(' '),itms[6].strip(' '),itms[7].strip(' ')))


if __name__ == '__main__':
    buffer=""

    if len(sys.argv) > 1:
        src = sys.argv[1]
        if re.fullmatch(reUrl, src):
            srcType = 'url'

        else:
            srcType = 'file'
            if not os.path.exists(src):
                raise FileNotFoundError

            _src = Path(src)
            if not _src.exists():
                raise FileNotFoundError

    if srcType == "url":
        urlreq = request.Request(src)
        data = urlopen(urlreq)
        buffer = str(data.read().decode("utf-8"))

        parse(buffer.splitlines())
    else:
        try:
            num_lines = -1
            with open(_src, 'r') as fileIn:
                buffer = fileIn.readlines()

            parse(buffer)

        except Exception as exception:
            print(exception)
        finally:
            if isinstance(fileIn, _io.TextIOWrapper) and not fileIn.close:
                fileIn.close()

    if len(targets) == 0:
        print(f'No items found in {src}!', file=sys.stderr)
        exit(1)

    i = 0
    print(f'struct target '"{")
    print(f' char id[256];')
    print(f' char major;')
    print(f' char minor;')
    print(f' char step;')
    print("};")
    print('')
    print(f'// Automatically generated on {date.today()} from "{src}"')
    print(f'struct target targets[{len(targets)}];')
    for itm in targets:
        assert(type(itm) == tuple)
        print(f'strcpy(targets[{i}].id, "{itm[0]}");')
        print(f'targets[{i}].major = {itm[1]};')
        print(f'targets[{i}].minor = {itm[2]};')
        print(f'targets[{i}].step = {itm[3]};')
        i += 1
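The script scrapes the supportedIsas_ table out of ROCm's device.cpp and prints a C snippet to stdout. Assuming the first initializer field of each entry is the target name and the 5th through 7th fields are its gfxip major/minor/stepping, which is what the regex groups pick out, the output looks roughly like the sketch below; the actual entries, count, and date depend on whichever device.cpp revision is fetched, and the gfx906 line is only an illustrative example.

struct target {
    char id[256];
    char major;
    char minor;
    char step;
};

// Automatically generated on <date> from "<source>"
struct target targets[<count>];
strcpy(targets[0].id, "gfx906");
targets[0].major = 9;
targets[0].minor = 0;
targets[0].step = 6;
/* ...one id/major/minor/step group per supported target... */

Note that the emitted strcpy/assignment lines are statements, so the generated snippet would need to sit inside a function (or be reworked into an initializer list) before it compiles.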
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
@@ -143,6 +143,7 @@ if (NOT GGML_BACKEND_DL)
llama_target_and_test(test-rope.cpp)
endif()

# llama_target_and_test(test-parse-amd-ids.c)

# dummy executable - not installed
get_filename_component(TEST_TARGET test-c.c NAME_WE)