Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

compiler: Support compiler flags for Cortex #2531

Merged
merged 3 commits into from
Feb 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 28 additions & 8 deletions devito/arch/archinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,12 @@
# Intel CPUs
'INTEL64', 'SNB', 'IVB', 'HSW', 'BDW', 'KNL', 'KNL7210',
'SKX', 'KLX', 'CLX', 'CLK', 'SPR',
# AMD CPUs
'AMD',
# ARM CPUs
'AMD', 'ARM', 'AppleArm', 'M1', 'M2', 'M3',
'ARM', 'AppleArm', 'M1', 'M2', 'M3',
'Graviton', 'GRAVITON2', 'GRAVITON3', 'GRAVITON4',
'Cortex',
# Other legacy CPUs
'POWER8', 'POWER9',
# Generic GPUs
Expand Down Expand Up @@ -226,7 +229,7 @@ def homogenise_gpus(gpu_infos):
for i in ['total', 'free', 'used']:
def make_cbk(i):
def cbk(deviceid=0):
info_cmd = ['nvidia-smi', '--query-gpu=memory.%s' % i, '--format=csv']
info_cmd = ['nvidia-smi', f'--query-gpu=memory.{i}', '--format=csv']
proc = Popen(info_cmd, stdout=PIPE, stderr=DEVNULL)
raw_info = str(proc.stdout.read())

Expand All @@ -248,7 +251,7 @@ def cbk(deviceid=0):

return cbk

gpu_info['mem.%s' % i] = make_cbk(i)
gpu_info[f'mem.{i}'] = make_cbk(i)

return gpu_info

Expand Down Expand Up @@ -303,10 +306,10 @@ def make_cbk(i):
def cbk(deviceid=0):
try:
# Should only contain Used and total
assert len(info['card%s' % deviceid]) == 2
used = [int(v) for k, v in info['card%s' % deviceid].items()
assert len(info[f'card{deviceid}']) == 2
used = [int(v) for k, v in info[f'card{deviceid}'].items()
if 'Used' in k][0]
total = [int(v) for k, v in info['card%s' % deviceid].items()
total = [int(v) for k, v in info[f'card{deviceid}'].items()
if 'Used' not in k][0]
free = total - used
return {'total': total, 'free': free, 'used': used}[i]
Expand All @@ -318,7 +321,7 @@ def cbk(deviceid=0):

return cbk

gpu_info['mem.%s' % i] = make_cbk(i)
gpu_info[f'mem.{i}'] = make_cbk(i)

gpu_infos['architecture'] = 'AMD'
return gpu_info
Expand Down Expand Up @@ -737,7 +740,7 @@ def numa_domains(self):
try:
return int(lscpu()['NUMA node(s)'])
except (ValueError, TypeError, KeyError):
warning("NUMA domain count autodetection failed")
warning("NUMA domain count autodetection failed, assuming 1")
return 1

@cached_property
Expand Down Expand Up @@ -793,6 +796,21 @@ def march(self):
return 'neoverse-n1'


class Cortex(Arm):

@property
def version(self):
return int(self.name.split('cortexa')[-1])

@cached_property
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this cached, but version is not? Surely the reverse should be true?

I presume it's meant to return 'cortex' or '76'?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is meant to return 76.

Why is this cached, but version is not? Surely the reverse should be true?

I did not think much of it. I just followed the recipe that already exists for Apple, Graviton, etc.

def march(self):
return 'armv8-a+crc+simd'

@cached_property
def mtune(self):
return f'cortex-a{self.version}'


class Amd(Cpu64):

known_isas = ('cpp', 'sse', 'avx', 'avx2')
Expand Down Expand Up @@ -1007,6 +1025,8 @@ def march(cls):
M1 = AppleArm('m1')
M2 = AppleArm('m2')
M3 = AppleArm('m3')
CORTEX = Cortex('cortex')
CORTEXA76 = Cortex('cortexa76')

AMD = Amd('amd')

Expand Down
84 changes: 39 additions & 45 deletions devito/arch/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
call_capture_output as _call_capture_output)

from devito.arch import (AMDGPUX, Cpu64, AppleArm, NvidiaDevice, POWER8, POWER9,
Graviton, IntelDevice, get_nvidia_cc, check_cuda_runtime,
get_m1_llvm_path)
Graviton, Cortex, IntelDevice, get_nvidia_cc,
check_cuda_runtime, get_m1_llvm_path)
from devito.exceptions import CompilationError
from devito.logger import debug, warning
from devito.parameters import configuration
Expand Down Expand Up @@ -45,7 +45,7 @@ def sniff_compiler_version(cc, allow_fail=False):
if allow_fail:
return Version("0")
else:
raise RuntimeError("The `%s` compiler isn't available on this system" % cc)
raise RuntimeError(f"The `{cc}` compiler isn't available on this system")

ver = ver.strip()
if ver.startswith("gcc"):
Expand Down Expand Up @@ -190,8 +190,7 @@ def __init__(self, **kwargs):
self.suffix = kwargs.get('suffix')
if not kwargs.get('mpi'):
self.cc = self.CC if self._cpp is False else self.CXX
self.cc = self.cc if self.suffix is None else ('%s-%s' %
(self.cc, self.suffix))
self.cc = self.cc if self.suffix is None else f'{self.cc}-{self.suffix}'
else:
self.cc = self.MPICC if self._cpp is False else self.MPICXX
self.ld = self.cc # Wanted by the superclass
Expand All @@ -214,7 +213,7 @@ def __init__(self, **kwargs):
elif platform.system() == "Windows":
self.so_ext = '.dll'
else:
raise NotImplementedError("Unsupported platform %s" % platform)
raise NotImplementedError(f"Unsupported platform {platform}")

self.__init_finalize__(**kwargs)

Expand Down Expand Up @@ -291,20 +290,19 @@ def save(self, soname, binary):
"""
sofile = self.get_jit_dir().joinpath(soname).with_suffix(self.so_ext)
if sofile.is_file():
debug("%s: `%s` was not saved in `%s` as it already exists"
% (self, sofile.name, self.get_jit_dir()))
debug(f"{self}: `{sofile.name}` was not saved in `{self.get_jit_dir()}`"
" as it already exists")
else:
makedirs(self.get_jit_dir(), exist_ok=True)
with open(str(sofile), 'wb') as f:
f.write(binary)
debug("%s: `%s` successfully saved in `%s`"
% (self, sofile.name, self.get_jit_dir()))
debug(f"{self}: `{sofile.name}` successfully saved in `{self.get_jit_dir()}`")

def make(self, loc, args):
"""Invoke the ``make`` command from within ``loc`` with arguments ``args``."""
hash_key = sha1((loc + str(args)).encode()).hexdigest()
logfile = path.join(self.get_jit_dir(), "%s.log" % hash_key)
errfile = path.join(self.get_jit_dir(), "%s.err" % hash_key)
logfile = path.join(self.get_jit_dir(), f"{hash_key}.log")
errfile = path.join(self.get_jit_dir(), f"{hash_key}.err")

with change_directory(loc):
with open(logfile, "w") as lf:
Expand All @@ -317,12 +315,12 @@ def make(self, loc, args):
try:
check_call(command, stderr=ef, stdout=lf)
except CalledProcessError as e:
raise CompilationError('Command "%s" return error status %d. '
'Unable to compile code.\n'
'Compile log in %s\n'
'Compile errors in %s\n' %
(e.cmd, e.returncode, logfile, errfile))
debug("Make <%s>" % " ".join(args))
raise CompilationError(f'Command "{e.cmd}" return error status'
f'{e.returncode}. '
f'Unable to compile code.\n'
f'Compile log in {logfile}\n'
f'Compile errors in {errfile}\n')
debug(f"Make <{' '.join(args)}>")

def jit_compile(self, soname, code):
"""
Expand All @@ -340,7 +338,7 @@ def jit_compile(self, soname, code):
The source code to be JIT compiled.
"""
target = str(self.get_jit_dir().joinpath(soname))
src_file = "%s.%s" % (target, self.src_ext)
src_file = f"{target}.{self.src_ext}"

cache_dir = self.get_codepy_dir().joinpath(soname[:7])
if configuration['jit-backdoor'] is False:
Expand All @@ -353,15 +351,15 @@ def jit_compile(self, soname, code):
try:
with open(src_file, 'r') as f:
code = f.read()
code = ''.join([code, '/* Backdoor edit at %s*/ \n' % time.ctime()])
code = f'{code}/* Backdoor edit at {time.ctime()}*/ \n'
# Bypass the devito JIT cache
# Note: can't simply use Python's `mkdtemp()` as, with MPI, different
# ranks would end up creating different cache dirs
cache_dir = cache_dir.joinpath('jit-backdoor')
cache_dir.mkdir(parents=True, exist_ok=True)
except FileNotFoundError:
raise ValueError("Trying to use the JIT backdoor for `%s`, but "
"the file isn't present" % src_file)
raise ValueError(f"Trying to use the JIT backdoor for `{src_file}`, but "
"the file isn't present")

# Should the compilation command be emitted?
debug = configuration['log-level'] == 'DEBUG'
Expand Down Expand Up @@ -392,7 +390,7 @@ def __str__(self):
return self.__class__.__name__

def __repr__(self):
return "JITCompiler[%s]" % self.__class__.__name__
return f"JITCompiler[{self.__class__.__name__}]"

def __getstate__(self):
# The superclass would otherwise only return a subset of attributes
Expand All @@ -406,7 +404,7 @@ def add_library_dirs(self, dirs, rpath=False):
if rpath:
# Add rpath flag to embed library dir
for d in as_list(dirs):
self.ldflags.append('-Wl,-rpath,%s' % d)
self.ldflags.append(f'-Wl,-rpath,{d}')

def add_libraries(self, libs):
self.libraries = filter_ordered(self.libraries + as_list(libs))
Expand Down Expand Up @@ -442,7 +440,10 @@ def __init_finalize__(self, **kwargs):
# -march isn't supported on power architectures, is -mtune needed?
self.cflags = ['-mcpu=native'] + self.cflags
elif isinstance(platform, Graviton):
self.cflags = ['-mcpu=%s' % platform.march] + self.cflags
self.cflags = [f'-mcpu={platform.march}'] + self.cflags
elif isinstance(platform, Cortex):
self.cflags += [f'-march={platform.march}']
self.cflags += [f'-mtune={platform.mtune}']
else:
self.cflags = ['-march=native'] + self.cflags

Expand All @@ -465,14 +466,7 @@ def __lookup_cmds__(self):


class ArmCompiler(GNUCompiler):

def __init_finalize__(self, **kwargs):
GNUCompiler.__init_finalize__(self, **kwargs)
platform = kwargs.pop('platform', configuration['platform'])

# Graviton flag
mloubout marked this conversation as resolved.
Show resolved Hide resolved
if isinstance(platform, Graviton):
self.cflags += ['-mcpu=%s' % platform.march]
pass


class ClangCompiler(Compiler):
Expand All @@ -493,7 +487,7 @@ def __init_finalize__(self, **kwargs):
if language in ['C', 'openmp']:
cc = get_nvidia_cc()
if cc:
self.cflags += ['-Xopenmp-target', '-march=sm_%s' % cc]
self.cflags += ['-Xopenmp-target', f'-march=sm_{cc}']
self.ldflags += ['-fopenmp', '-fopenmp-targets=nvptx64-nvidia-cuda']
elif platform is AMDGPUX:
self.cflags.remove('-std=c99')
Expand All @@ -503,7 +497,7 @@ def __init_finalize__(self, **kwargs):
self.ldflags += ['-fopenmp',
'-fopenmp-targets=amdgcn-amd-amdhsa',
'-Xopenmp-target=amdgcn-amd-amdhsa']
self.ldflags += ['-march=%s' % platform.march]
self.ldflags += [f'-march={platform.march}']
elif isinstance(platform, AppleArm):
# NOTE:
# Apple Mx supports OpenMP through Apple's LLVM compiler.
Expand All @@ -512,9 +506,9 @@ def __init_finalize__(self, **kwargs):
llvmm1 = get_m1_llvm_path(language)
if llvmm1 and language == 'openmp':
mx = platform.march
self.ldflags += ['-mcpu=apple-%s' % mx,
'-fopenmp', '-L%s' % llvmm1['libs']]
self.cflags += ['-Xclang', '-I%s' % llvmm1['include']]
self.ldflags += [f'-mcpu=apple-{mx}',
'-fopenmp', f'-L{llvmm1["libs"]}']
self.cflags += ['-Xclang', f'-I{llvmm1["include"]}']
else:
if platform in [POWER8, POWER9]:
# -march isn't supported on power architectures
Expand Down Expand Up @@ -563,7 +557,7 @@ def __init_finalize__(self, **kwargs):
if language in ['C', 'openmp']:
self.ldflags += ['-target', 'x86_64-pc-linux-gnu']
self.ldflags += ['-fopenmp']
self.ldflags += ['--offload-arch=%s' % platform.march]
self.ldflags += [f'--offload-arch={platform.march}']
elif platform in [POWER8, POWER9]:
# It doesn't make much sense to use AOMP on Power, but it should work
self.cflags.append('-mcpu=native')
Expand Down Expand Up @@ -776,15 +770,15 @@ def __init_intel_mpi__(self, **kwargs):
# whatever the MPI distro is
mpi_distro = sniff_mpi_distro('mpiexec')
if mpi_distro != 'IntelMPI':
warning("Expected Intel MPI distribution with `%s`, but found `%s`"
% (self.__class__.__name__, mpi_distro))
warning(f"Expected Intel MPI distribution with `{self.__class__.__name__}`,"
f"but found `{mpi_distro}`")

def __init_intel_mpi_flags__(self, **kwargs):
self.cflags.insert(0, '-cc=%s' % self.CC)
self.cflags.insert(0, f'-cc={self.CC}')

def get_version(self):
if configuration['mpi']:
cmd = (self.cc, "-cc=%s" % self.CC, "--version")
cmd = (self.cc, f"-cc={self.CC}", "--version")
else:
cmd = (self.cc, "--version")
result, stdout, stderr = call_capture_output(cmd)
Expand All @@ -803,7 +797,7 @@ def __lookup_cmds__(self):
# we try to use `mpiicc` first, while `mpicc` is our fallback, which may
# or may not be an Intel distribution
try:
check_output(["mpiicc", "-cc=%s" % self.CC, "--version"]).decode("utf-8")
check_output(["mpiicc", f"-cc={self.CC}", "--version"]).decode("utf-8")
self.MPICC = 'mpiicc'
self.MPICXX = 'mpicxx'
except FileNotFoundError:
Expand Down Expand Up @@ -905,7 +899,7 @@ def __init_finalize__(self, **kwargs):
elif isinstance(platform, IntelDevice):
self.cflags.append('-fsycl-targets=spir64')
else:
raise NotImplementedError("Unsupported platform %s" % platform)
raise NotImplementedError(f"Unsupported platform {platform}")


class CustomCompiler(Compiler):
Expand Down
Loading