diff --git a/.clang-format b/.clang-format index 2b240dc46c..b13d9ee87e 100644 --- a/.clang-format +++ b/.clang-format @@ -1,6 +1,7 @@ --- BasedOnStyle: Webkit IndentWidth: 2 +AccessModifierOffset: -2 AlignAfterOpenBracket: Align AlignTrailingComments: true AllowShortBlocksOnASingleLine: true diff --git a/CMakeLists.txt b/CMakeLists.txt index 2469fc472a..ab3f6a215b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ # basic setup for cmake -cmake_minimum_required(VERSION 3.11 FATAL_ERROR) +cmake_minimum_required(VERSION 3.15 FATAL_ERROR) if(POLICY CMP0074) cmake_policy(SET CMP0074 NEW) @@ -9,6 +9,9 @@ set(CMAKE_INCLUDE_CURRENT_DIR ON) set(CMAKE_INCLUDE_DIRECTORIES_PROJECT_BEFORE ON) set(CMAKE_COLOR_MAKEFILE ON) set(CMAKE_CXX_STANDARD_REQUIRED True) +# disable gnu exentions +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CUDA_EXTENSIONS OFF) # disable in source builds this is only a temporary fix, but for now we need it as cmake will otherwise overwrite the # existing makefiles @@ -18,48 +21,27 @@ set(CMAKE_DISABLE_IN_SOURCE_BUILD ON) list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") find_package(Git) +find_package(PythonInterp) -# by default we will build RELASE +# by default we will build DEVEL if(DEFINED ENV{QUDA_BUILD_TYPE}) set(DEFBUILD $ENV{QUDA_BUILD_TYPE}) else() - set(DEFBUILD "DEVEL") + set(DEFBUILD "RELEASE") endif() -if(GIT_FOUND) - execute_process(COMMAND ${GIT_EXECUTABLE} show - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} - RESULT_VARIABLE IS_GIT_REPOSIITORY - OUTPUT_QUIET ERROR_QUIET) - if(${IS_GIT_REPOSIITORY} EQUAL 0) - execute_process(COMMAND ${GIT_EXECUTABLE} describe --abbrev=0 - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} - OUTPUT_VARIABLE GITTAG - OUTPUT_STRIP_TRAILING_WHITESPACE) - # we use git rev-list and pipe that through wc here. 
Newer git versions support --count as option to rev-list but - # that might not always be available - execute_process(COMMAND ${GIT_EXECUTABLE} rev-list ${GITTAG}..HEAD - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} - COMMAND wc -l - OUTPUT_VARIABLE GITCOUNT - OUTPUT_STRIP_TRAILING_WHITESPACE) - execute_process(COMMAND ${GIT_EXECUTABLE} describe --long --dirty - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} - OUTPUT_VARIABLE GITVERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) - # ~~~ - # IF(GITCOUNT EQUAL 0) - # SET(DEFBUILD "RELEASE") - # ELSE() - # SET(DEFBUILD "DEVEL") - # ENDIF() - # ~~~ - endif() -endif(GIT_FOUND) - -set(VALID_BUILD_TYPES DEVEL RELEASE STRICT DEBUG HOSTDEBUG DEVICEDEBUG SANITIZE) -set(CMAKE_BUILD_TYPE "${DEFBUILD}" CACHE STRING "Choose the type of build, options are: ${VALID_BUILD_TYPES}") -set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS DEVEL RELEASE STRICT DEBUG HOSTDEBUG DEVICEDEBUG SANITIZE) +set(VALID_BUILD_TYPES + DEVEL + RELEASE + STRICT + DEBUG + HOSTDEBUG + DEVICEDEBUG + SANITIZE) +set(CMAKE_BUILD_TYPE + "${DEFBUILD}" + CACHE STRING "Choose the type of build, options are: ${VALID_BUILD_TYPES}") +set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS ${VALID_BUILD_TYPES}) string(TOUPPER ${CMAKE_BUILD_TYPE} CHECK_BUILD_TYPE) list(FIND VALID_BUILD_TYPES ${CHECK_BUILD_TYPE} BUILD_TYPE_VALID) @@ -68,10 +50,65 @@ if(BUILD_TYPE_VALID LESS 0) message(SEND_ERROR "Please specify a valid CMAKE_BUILD_TYPE type! 
Valid build types are:" "${VALID_BUILD_TYPES}") endif() +# Target type +if(DEFINED ENV{QUDA_TARGET}) + set(DEFTARGET $ENV{QUDA_TARGET}) +else() + set(DEFTARGET "CUDA") +endif() + +set(VALID_TARGET_TYPES CUDA HIP) +set(QUDA_TARGET_TYPE + "${DEFTARGET}" + CACHE STRING "Choose the type of target, options are: ${VALID_TARGET_TYPES}") +set_property(CACHE QUDA_TARGET_TYPE PROPERTY STRINGS CUDA HIP) + +string(TOUPPER ${QUDA_TARGET_TYPE} CHECK_TARGET_TYPE) +list(FIND VALID_TARGET_TYPES ${CHECK_TARGET_TYPE} TARGET_TYPE_VALID) + +if(TARGET_TYPE_VALID LESS 0) + message(SEND_ERROR "Please specify a valid QUDA_TARGET_TYPE type! Valid target types are:" "${VALID_TARGET_TYPES}") +endif() + +if( ${CHECK_TARGET_TYPE} STREQUAL "CUDA") + set(QUDA_TARGET_LIBRARY quda_cuda_target) +endif() + +if( ${CHECK_TARGET_TYPE} STREQUAL "HIP") + set(QUDA_TARGET_LIBRARY quda_hip_target) +endif() # # PROJECT is QUDA # -project("QUDA" VERSION 1.0.0 LANGUAGES) +if(GIT_FOUND) + execute_process( + COMMAND ${GIT_EXECUTABLE} show + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + RESULT_VARIABLE IS_GIT_REPOSIITORY + OUTPUT_QUIET ERROR_QUIET) + if(${IS_GIT_REPOSIITORY} EQUAL 0) + execute_process( + COMMAND ${GIT_EXECUTABLE} describe --abbrev=0 + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE GITTAG OUTPUT_STRIP_TRAILING_WHITESPACE) + # we use git rev-list and pipe that through wc here. 
Newer git versions support --count as option to rev-list but + # that might not always be available + execute_process( + COMMAND ${GIT_EXECUTABLE} rev-list ${GITTAG}..HEAD + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + COMMAND wc -l + OUTPUT_VARIABLE GITCOUNT OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND ${GIT_EXECUTABLE} describe --long --dirty + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE GITVERSION OUTPUT_STRIP_TRAILING_WHITESPACE) + endif() +endif(GIT_FOUND) + +project( + "QUDA" + VERSION 1.1.0 + LANGUAGES) message(STATUS "") message(STATUS "${PROJECT_NAME} ${PROJECT_VERSION} (${GITVERSION}) **") @@ -79,6 +116,7 @@ message(STATUS "cmake version: ${CMAKE_VERSION}") message(STATUS "Source location: ${CMAKE_SOURCE_DIR}") message(STATUS "Build location: ${CMAKE_BINARY_DIR}") message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") +message(STATUS "QUDA target: ${QUDA_TARGET_TYPE}") # ###################################################################################################################### # QUDA OPTIONS likely to be changed by users @@ -86,118 +124,173 @@ message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") if(DEFINED ENV{QUDA_GPU_ARCH}) set(QUDA_DEFAULT_GPU_ARCH $ENV{QUDA_GPU_ARCH}) else() - set(QUDA_DEFAULT_GPU_ARCH sm_35) + set(QUDA_DEFAULT_GPU_ARCH sm_70) endif() if(NOT QUDA_GPU_ARCH) message(STATUS "Building QUDA for GPU ARCH " "${QUDA_DEFAULT_GPU_ARCH}") endif() -message(STATUS "") set(QUDA_GPU_ARCH ${QUDA_DEFAULT_GPU_ARCH} - CACHE STRING "set the GPU architecture (sm_20, sm_21, sm_30, sm_35, sm_37, sm_50, sm_52, sm_60, sm_70, sm_75)") -set_property(CACHE QUDA_GPU_ARCH PROPERTY STRINGS sm_20 sm_21 sm_30 sm_35 sm_37 sm_50 sm_52 sm_60 sm_70 sm_75) + CACHE STRING "set the GPU architecture (sm_35, sm_37, sm_60, sm_70, sm_80)") +set_property(CACHE QUDA_GPU_ARCH PROPERTY STRINGS sm_35 sm_37 sm_60 sm_70 sm_80) # build options -set(QUDA_DIRAC_WILSON ON CACHE BOOL "build Wilson Dirac operators") -set(QUDA_DIRAC_CLOVER ON CACHE BOOL "build 
clover Dirac operators") -set(QUDA_DIRAC_DOMAIN_WALL ON CACHE BOOL "build domain wall Dirac operators") -set(QUDA_DIRAC_STAGGERED ON CACHE BOOL "build staggered Dirac operators") -set(QUDA_DIRAC_TWISTED_MASS ON CACHE BOOL "build twisted mass Dirac operators") -set(QUDA_DIRAC_TWISTED_CLOVER ON CACHE BOOL "build twisted clover Dirac operators") -set(QUDA_DIRAC_NDEG_TWISTED_MASS OFF CACHE BOOL "build non-degenerate twisted mass Dirac operators") -set(QUDA_FORCE_GAUGE OFF CACHE BOOL "build code for (1-loop Symanzik) gauge force") -set(QUDA_FORCE_HISQ OFF CACHE BOOL "build code for hisq fermion force") -set(QUDA_GAUGE_TOOLS OFF CACHE BOOL "build auxiliary gauge-field tools") -set(QUDA_GAUGE_ALG OFF CACHE BOOL "build gauge-fixing and pure-gauge algorithms") -set(QUDA_CONTRACT OFF CACHE BOOL "build code for bilinear contraction") -set(QUDA_COVDEV OFF CACHE BOOL "build code for covariant derivative") +option(QUDA_DIRAC_DEFAULT_OFF "default value for QUDA_DIRAC_ setting" $ENV{QUDA_DIRAC_DEFAULT_OFF}) +mark_as_advanced(QUDA_DIRAC_DEFAULT_OFF) +if(QUDA_DIRAC_DEFAULT_OFF) + set(QUDA_DIRAC_DEFAULT OFF) +else() + set(QUDA_DIRAC_DEFAULT ON) +endif() + +option(QUDA_DIRAC_WILSON "build Wilson Dirac operators" ${QUDA_DIRAC_DEFAULT}) +option(QUDA_DIRAC_CLOVER "build clover Dirac operators" ${QUDA_DIRAC_DEFAULT}) +option(QUDA_DIRAC_DOMAIN_WALL "build domain wall Dirac operators" ${QUDA_DIRAC_DEFAULT}) +option(QUDA_DIRAC_STAGGERED "build staggered Dirac operators" ${QUDA_DIRAC_DEFAULT}) +option(QUDA_DIRAC_TWISTED_MASS "build twisted mass Dirac operators" ${QUDA_DIRAC_DEFAULT}) +option(QUDA_DIRAC_TWISTED_CLOVER "build twisted clover Dirac operators" ${QUDA_DIRAC_DEFAULT}) +option(QUDA_DIRAC_CLOVER_HASENBUSCH "build clover Hasenbusch twist operators" ${QUDA_DIRAC_DEFAULT}) +option(QUDA_DIRAC_NDEG_TWISTED_MASS "build non-degenerate twisted mass Dirac operators" ${QUDA_DIRAC_DEFAULT}) +option(QUDA_FORCE_GAUGE "build code for (1-loop Symanzik) gauge force" OFF) +option(QUDA_FORCE_HISQ 
"build code for hisq fermion force" OFF) +option(QUDA_GAUGE_TOOLS "build auxiliary gauge-field tools" OFF) +option(QUDA_GAUGE_ALG "build gauge-fixing and pure-gauge algorithms" OFF) +option(QUDA_CONTRACT "build code for bilinear contraction" OFF) +option(QUDA_COVDEV "build code for covariant derivative" OFF) +option(QUDA_LAPLACE "build laplace operator" OFF) # Dynamic inversion saves memory but decreases the flops -set(QUDA_DYNAMIC_CLOVER OFF CACHE BOOL "Dynamically invert the clover term for twisted-clover") -set(QUDA_QIO OFF CACHE BOOL "build QIO code for binary I/O") +option(QUDA_DYNAMIC_CLOVER "Dynamically invert the clover term for twisted-clover" OFF) +option(QUDA_QIO "build QIO code for binary I/O" OFF) # Multi-GPU options -set(QUDA_QMP OFF CACHE BOOL "set to 'yes' to build the QMP multi-GPU code") -set(QUDA_MPI OFF CACHE BOOL "set to 'yes' to build the MPI multi-GPU code") +option(QUDA_QMP "build the QMP multi-GPU code" OFF) +option(QUDA_MPI "build the MPI multi-GPU code" OFF) -# BLAS library -set(QUDA_MAGMA OFF CACHE BOOL "build magma interface") +# Magma library +option(QUDA_MAGMA "build magma interface" OFF) # ARPACK -set(QUDA_ARPACK OFF CACHE BOOL "build arpack interface") -set(QUDA_ARPACK_LOGGING OFF CACHE BOOL "enable ARPACK logging (not availible for NG)") +option(QUDA_ARPACK "build arpack interface" OFF) +option(QUDA_ARPACK_LOGGING "enable ARPACK logging (not availible for NG)" OFF) + +# OpenBLAS +option(QUDA_OPENBLAS "enable OpenBLAS" OFF) # Interface options -set(QUDA_INTERFACE_QDP ON CACHE BOOL "build qdp interface") -set(QUDA_INTERFACE_MILC ON CACHE BOOL "build milc interface") -set(QUDA_INTERFACE_CPS OFF CACHE BOOL "build cps interface") -set(QUDA_INTERFACE_QDPJIT OFF CACHE BOOL "build qdpjit interface") -set(QUDA_INTERFACE_BQCD OFF CACHE BOOL "build bqcd interface") -set(QUDA_INTERFACE_TIFR OFF CACHE BOOL "build tifr interface") +option(QUDA_INTERFACE_QDP "build qdp interface" ON) +option(QUDA_INTERFACE_MILC "build milc interface" ON) 
+option(QUDA_INTERFACE_CPS "build cps interface" OFF) +option(QUDA_INTERFACE_QDPJIT "build qdpjit interface" OFF) +option(QUDA_INTERFACE_BQCD "build bqcd interface" OFF) +option(QUDA_INTERFACE_TIFR "build tifr interface" OFF) +option(QUDA_INTERFACE_ALL "enable all data-orders triggered by the various interfaces" OFF) # QDPJIT -set(QUDA_QDPJIT OFF CACHE BOOL "build QDP-JIT support?") -set(QUDA_QDPJITHOME "" CACHE PATH "path to QDPJIT installation") +option(QUDA_QDPJIT "build QDP-JIT support?" OFF) # Locations for QIO / QMP -set(QUDA_QIOHOME "" CACHE PATH "path to QIO") -set(QUDA_QMPHOME "" CACHE PATH "path to QMP") -set(QUDA_LIMEHOME "" CACHE PATH "path to LIME") -set(QUDA_ARPACK_HOME "" CACHE PATH "path to arpack / parpack") -set(QUDA_MAGMAHOME "" CACHE PATH "path to MAGMA, if not set, pkg-config will be attempted") -set(QUDA_MAGMA_LIBS "" CACHE STRING "additional linker flags required to link against magma") +set(QUDA_QIOHOME + "" + CACHE PATH "path to QIO") +set(QUDA_QMPHOME + "" + CACHE PATH "path to QMP") +set(QUDA_LIMEHOME + "" + CACHE PATH "path to LIME") +set(QUDA_QDPJITHOME + "" + CACHE PATH "path to QDPJIT installation") +set(QUDA_ARPACK_HOME + "" + CACHE PATH "path to arpack / parpack") +set(QUDA_OPENBLAS_HOME + "" + CACHE PATH "path to OpenBLAS") +set(QUDA_MAGMAHOME + "" + CACHE PATH "path to MAGMA, if not set, pkg-config will be attempted") +set(QUDA_MAGMA_LIBS + "" + CACHE STRING "additional linker flags required to link against magma") # ###################################################################################################################### -# QUDA ADVANCED OPTIONS that ususally should not be changed by users +# QUDA ADVANCED OPTIONS that usually should not be changed by users # ###################################################################################################################### -set(QUDA_BUILD_ALL_TESTS ON CACHE BOOL "build tests by default") -if(DEFINED ENV{QUDA_BUILD_SHAREDLIB}) - set(DEFSHARED 
$ENV{QUDA_BUILD_SHAREDLIB}) -else() - set(DEFSHARED "OFF") +option(QUDA_BUILD_ALL_TESTS "build tests by default" ON) +option(QUDA_INSTALL_ALL_TESTS "install tests by default" ON) +option(QUDA_BUILD_SHAREDLIB "build quda as a shared lib" ON) +option(QUDA_PROPAGATE_CXX_FLAGS "propagate CXX_FLAGS to CUDA host compiler (for cmake >= 3.8)" ON) +option(QUDA_FLOAT8 "enable float-8 ordered fixed-point fields?" ON) +option(QUDA_NVML "use NVML to report CUDA graphics driver version" OFF) +option(QUDA_NUMA_NVML "experimental use of NVML to set numa affinity" OFF) +option(QUDA_VERBOSE_BUILD "display kernel register usage" OFF) +option(QUDA_BUILD_NATIVE_LAPACK "build the native blas/lapack library according to QUDA_TARGET" ON) + +set(QUDA_MAX_MULTI_BLAS_N + "4" + CACHE STRING "maximum value to initialize template for multi-blas /-reduce") +if(QUDA_MAX_MULTI_BLAS_N GREATER 32) + message(SEND_ERROR "Maximum QUDA_MAX_MULTI_BLAS_N is 32.") endif() -set(QUDA_BUILD_SHAREDLIB ${DEFSHARED} CACHE BOOL "build quda as a shared lib") -set(QUDA_PROPAGATE_CXX_FLAGS ON CACHE BOOL "propagate CXX_FLAGS to CUDA host compiler (for cmake >= 3.8)") -set(QUDA_TEX ON CACHE BOOL "enable texture reads?") -set(QUDA_NVML OFF CACHE BOOL "use NVML to report CUDA graphics driver version") -set(QUDA_NUMA_NVML OFF CACHE BOOL "experimental use of NVML to set numa affinity") -set(QUDA_VERBOSE_BUILD OFF CACHE BOOL "display kernel register useage") -set(QUDA_MAX_MULTI_BLAS_N "4" CACHE STRING "maximum value to initialize template for multi-blas /-reduce") set(QUDA_PRECISION "14" - CACHE STRING "which precisions to instantiate in QUDA (4-bit number - double, single, half, char)") -set(QUDA_RECONSTRUCT "7" CACHE STRING "which reconstructs to instantiate in QUDA (3-bit number - 18, 13/12, 9/8)") + CACHE STRING "which precisions to instantiate in QUDA (4-bit number - double, single, half, quarter)") +set(QUDA_RECONSTRUCT + "7" + CACHE STRING "which reconstructs to instantiate in QUDA (3-bit number - 18, 13/12, 9/8)") 
+ +set(QUDA_NVSHMEM OFF CACHE BOOL "set to 'yes' to build the NVSHMEM multi-GPU code") +set(QUDA_NVSHMEM_HOME $ENV{NVSHMEM_HOME} CACHE PATH "path to NVSHMEM") # Set CTest options -set(QUDA_CTEST_SEP_DSLASH_POLICIES - OFF - CACHE BOOL "Test Dslash policies separately in ctest instead of only autotuning them.") +option(QUDA_CTEST_SEP_DSLASH_POLICIES "Test Dslash policies separately in ctest instead of only autotuning them." OFF) +option(QUDA_CTEST_DISABLE_BENCHMARKS "Disable benchmark test" ON) + +option(QUDA_FAST_COMPILE_REDUCE "enable fast compilation in blas and reduction kernels (single warp per reduction)" OFF) +option(QUDA_FAST_COMPILE_DSLASH "enable fast compilation in dslash kernels (~20% perf impact)" OFF) + +option(QUDA_OPENMP "enable OpenMP" OFF) +set(QUDA_CXX_STANDARD + 14 + CACHE STRING "set the CXX Standard (14 or 17)") +set_property(CACHE QUDA_CXX_STANDARD PROPERTY STRINGS 14 17) + +option(QUDA_BACKWARDS "Enable stacktrace generation using backwards-cpp") -set(QUDA_OPENMP OFF CACHE BOOL "enable OpenMP") -set(QUDA_CXX_STANDARD 11 CACHE STRING "set the CXX Standard (11 or 14)") -set_property(CACHE QUDA_CXX_STANDARD PROPERTY STRINGS 11 14) +# QIO legacy support +option(QUDA_DOWNLOAD_QIO_LEGACY "download qio2-5-0 branch of QIO instead of extended (API compatible) branch" OFF) # NVTX options -set(QUDA_MPI_NVTX OFF CACHE BOOL "add nvtx markup to MPI API calls for the visual profiler") -set(QUDA_INTERFACE_NVTX OFF CACHE BOOL "add nvtx markup to interface calls for the visual profiler") +option(QUDA_MPI_NVTX "add NVTX markup to MPI API calls" OFF) +option(QUDA_INTERFACE_NVTX "add NVTC markup to interface calls" OFF) # features in development -set(QUDA_SSTEP OFF CACHE BOOL "build s-step linear solvers") -set(QUDA_MULTIGRID OFF CACHE BOOL "build multigrid solvers") -set(QUDA_BLOCKSOLVER OFF CACHE BOOL "build block solvers") -set(QUDA_USE_EIGEN OFF CACHE BOOL "use EIGEN library (where optional)") -set(QUDA_DOWNLOAD_EIGEN ON CACHE BOOL "Download Eigen") 
-set(QUDA_DOWNLOAD_USQCD OFF CACHE BOOL "Download USQCD software as requested by QUDA_QMP / QUDA_QIO") -set(QUDA_DOWNLOAD_ARPACK OFF CACHE BOOL "Download ARPACK-NG software as requested by QUDA_ARPACK") +option(QUDA_SSTEP "build s-step linear solvers" OFF) +option(QUDA_MULTIGRID "build multigrid solvers" OFF) +option(QUDA_BLOCKSOLVER "build block solvers" OFF) +option(QUDA_USE_EIGEN "use EIGEN library (where optional)" ON) +option(QUDA_DOWNLOAD_EIGEN "Download Eigen" ON) +option(QUDA_DOWNLOAD_USQCD "Download USQCD software as requested by QUDA_QMP / QUDA_QIO" OFF) +option(QUDA_DOWNLOAD_ARPACK "Download ARPACK-NG software as requested by QUDA_ARPACK" OFF) +option(QUDA_DOWNLOAD_OPENBLAS "Download OpenBLAS software as requested by QUDA_OPENBLAS" OFF) +option(QUDA_JITIFY "build QUDA using Jitify" OFF) +option(QUDA_DOWNLOAD_NVSHMEM "Download NVSHMEM" OFF) + +set(QUDA_GDRCOPY_HOME "/usr/local/gdrcopy" CACHE STRING "path to gdrcopy used when QUDA_DOWNLOAD_NVSHMEM is enabled") -option(QUDA_GENERATE_DOXYGEN "generate doxygen documentation") -set(QUDA_JITIFY OFF CACHE BOOL "build QUDA using Jitify") +option(QUDA_GENERATE_DOXYGEN "generate doxygen documentation") +# mark as advanced mark_as_advanced(QUDA_BUILD_ALL_TESTS) -mark_as_advanced(QUDA_BUILD_SHAREDLIB) +mark_as_advanced(QUDA_INSTALL_ALL_TESTS) mark_as_advanced(QUDA_PROPAGATE_CXX_FLAGS) -mark_as_advanced(QUDA_TEX) +mark_as_advanced(QUDA_HETEROGENEOUS_ATOMIC) +mark_as_advanced(QUDA_FLOAT8) +mark_as_advanced(QUDA_FAST_COMPILE_REDUCE) +mark_as_advanced(QUDA_FAST_COMPILE_DSLASH) mark_as_advanced(QUDA_NVML) mark_as_advanced(QUDA_NUMA_NVML) mark_as_advanced(QUDA_VERBOSE_BUILD) @@ -209,79 +302,50 @@ mark_as_advanced(QUDA_CTEST_LAUNCH) mark_as_advanced(QUDA_CTEST_LAUNCH_ARGS) mark_as_advanced(QUDA_OPENMP) +mark_as_advanced(QUDA_BACKWARDS) + +mark_as_advanced(QUDA_DOWNLOAD_QIO_LEGACY) + +mark_as_advanced(QUDA_DOWNLOAD_NVSHMEM) +mark_as_advanced(QUDA_DOWNLOAD_NVSHMEM_TAR) +mark_as_advanced(QUDA_GDRCOPY_HOME) 
mark_as_advanced(QUDA_MPI_NVTX) mark_as_advanced(QUDA_INTERFACE_NVTX) +mark_as_advanced(QUDA_INTERFACE_ALL) mark_as_advanced(QUDA_SSTEP) mark_as_advanced(QUDA_USE_EIGEN) -mark_as_advanced(QUDA_BLOCKSOVER) +mark_as_advanced(QUDA_BLOCKSOLVER) mark_as_advanced(QUDA_CXX_STANDARD) mark_as_advanced(QUDA_JITIFY) mark_as_advanced(QUDA_ARPACK_LOGGING) -# ###################################################################################################################### -# everything below here is processing the setup -# ###################################################################################################################### -# we need to check for some packages -find_package(PythonInterp) +# some checks for invalid combinations -# ###################################################################################################################### -# QUDA depends on Eigen this part makes sure we can download eigen if it is not found -if(QUDA_DOWNLOAD_EIGEN) - set(EIGEN_VERSION 3.3.7) - - set(EIGEN_DOWNLOAD_LOCATION ${CMAKE_SOURCE_DIR}/externals/eigen/${EIGEN_VERSION}.tar.bz2) - set(EIGEN_URL http://bitbucket.org/eigen/eigen/get/${EIGEN_VERSION}.tar.bz2) - set(EIGEN_SHA 9f13cf90dedbe3e52a19f43000d71fdf72e986beb9a5436dddcd61ff9d77a3ce) - if(NOT EXISTS ${EIGEN_DOWNLOAD_LOCATION}) - message(STATUS "Checking for Eigen tarball and downloading if necessary.") - endif() - file(DOWNLOAD ${EIGEN_URL} ${EIGEN_DOWNLOAD_LOCATION} EXPECTED_HASH SHA256=${EIGEN_SHA} STATUS EIGEN_DOWNLOADED) - list(GET EIGEN_DOWNLOADED 0 EIGEN_DOWNLOADED_CODE) - list(GET EIGEN_DOWNLOADED 1 EIGEN_DOWNLOADED_MSG) - if(${EIGEN_DOWNLOADED_CODE}) - message( - SEND_ERROR - "Could not download Eigen automatically (${EIGEN_DOWNLOADED_MSG}). Please download eigen from ${EIGEN_URL} and save it to ${EIGEN_DOWNLOAD_LOCATION} and try running cmake again." - ) - endif() +if(QUDA_MPI AND QUDA_QMP) + message( + SEND_ERROR + "Specifying QUDA_QMP and QUDA_MPI might result in undefined behavior. 
If you intend to use QMP set QUDA_MPI=OFF.") +endif() - include(ExternalProject) - ExternalProject_Add(Eigen - URL ${CMAKE_SOURCE_DIR}/externals/eigen/${EIGEN_VERSION}.tar.bz2 - URL_HASH SHA256=${EIGEN_SHA} - PREFIX ${CMAKE_CURRENT_BINARY_DIR}/externals/eigen/ - CONFIGURE_COMMAND "" - BUILD_COMMAND - COMMAND "" - INSTALL_COMMAND "") - ExternalProject_Get_Property(Eigen source_dir) - set(EIGEN_INCLUDE_DIRS ${source_dir}) -else() - # fall back to using find_package - find_package(Eigen QUIET) - if(NOT EIGEN_FOUND) - message( - FATAL_ERROR - "QUDA requires Eigen (http://eigen.tuxfamily.org). Please either set EIGEN_INCLUDE_DIRS to path to eigen3 include directory, e.g. /usr/local/include/eigen3 or set QUDA_DOWNLOAD_EIGEN to ON to enable automatic download of the necessary components." - ) - endif() +if(QUDA_NVSHMEM AND NOT (QUDA_QMP OR QUDA_MPI)) +message( + SEND_ERROR + "Specifying QUDA_NVSHMEM requires either QUDA_QMP or QUDA_MPI.") endif() -include_directories(SYSTEM ${EIGEN_INCLUDE_DIRS}) -# Now we hopefully found some way to get eigen to work -# Linux: CMAKE_HOST_SYSTEM_PROCESSOR "x86_64" Mac: CMAKE_HOST_SYSTEM_PROCESSOR "x86_64" Power: +# COMPILER FLAGS Linux: CMAKE_HOST_SYSTEM_PROCESSOR "x86_64" Mac: CMAKE_HOST_SYSTEM_PROCESSOR "x86_64" Power: # CMAKE_HOST_SYSTEM_PROCESSOR "ppc64le" # We need to use different optimization flags depending on whether we are on x86 or power Note: This only applies to the -# RELASE build type this is just a quick fix and we should probably use +# RELEASE build type this is just a quick fix and we should probably use # https://cmake.org/cmake/help/latest/module/CheckCXXCompilerFlag.html set(CPU_ARCH ${CMAKE_HOST_SYSTEM_PROCESSOR}) if(${CPU_ARCH} STREQUAL "x86_64") - set(CXX_OPT "-march=native") + set(CXX_OPT "-mtune=native") elseif(${CPU_ARCH} STREQUAL "ppc64le") set(CXX_OPT "-Ofast -mcpu=native -mtune=native") endif() @@ -289,55 +353,197 @@ endif() set(CMAKE_CXX_STANDARD ${QUDA_CXX_STANDARD}) # define CXX FLAGS 
set(CMAKE_CXX_FLAGS_DEVEL - "${OpenMP_CXX_FLAGS} -g -O3 -Wall ${CLANG_FORCE_COLOR}" + "-g -O3 -Wall" CACHE STRING "Flags used by the C++ compiler during regular development builds.") set(CMAKE_CXX_FLAGS_STRICT - "${OpenMP_CXX_FLAGS} -O3 -Wall -Werror ${CLANG_NOERROR}" + "-O3 -Wall -Werror" CACHE STRING "Flags used by the C++ compiler during strict jenkins builds.") set(CMAKE_CXX_FLAGS_RELEASE - "${OpenMP_CXX_FLAGS} -O3 -w ${CXX_OPT} " + "-O3 -w ${CXX_OPT} " CACHE STRING "Flags used by the C++ compiler during release builds.") set(CMAKE_CXX_FLAGS_HOSTDEBUG - "${OpenMP_CXX_FLAGS} -Wall -Wno-unknown-pragmas -g -fno-inline -DHOST_DEBUG ${CLANG_FORCE_COLOR}" + "-Wall -Wno-unknown-pragmas -g -fno-inline" CACHE STRING "Flags used by the C++ compiler during host-debug builds.") set(CMAKE_CXX_FLAGS_DEVICEDEBUG - "${OpenMP_CXX_FLAGS} -Wall -Wno-unknown-pragmas -DDEVICE_DEBUG ${CLANG_FORCE_COLOR}" + "-Wall -Wno-unknown-pragmas" CACHE STRING "Flags used by the C++ compiler during device-debug builds.") set(CMAKE_CXX_FLAGS_DEBUG - "${OpenMP_CXX_FLAGS} -Wall -Wno-unknown-pragmas -g -fno-inline -DHOST_DEBUG -DDEVICE_DEBUG ${CLANG_FORCE_COLOR}" + "-Wall -Wno-unknown-pragmas -g -fno-inline" CACHE STRING "Flags used by the C++ compiler during full (host+device) debug builds.") -set( - CMAKE_CXX_FLAGS_SANITIZE - "${OpenMP_CXX_FLAGS} -Wall -Wno-unknown-pragmas -g -fno-inline -DHOST_DEBUG -fsanitize=address,undefined ${CLANG_FORCE_COLOR}" - CACHE STRING "Flags used by the C++ compiler during santizer debug builds.") +set(CMAKE_CXX_FLAGS_SANITIZE + "-Wall -Wno-unknown-pragmas -g -fno-inline -fsanitize=address,undefined" + CACHE STRING "Flags used by the C++ compiler during santizer debug builds.") enable_language(CXX) # define C FLAGS -set(CMAKE_C_FLAGS_DEVEL "-Wall -g -O3" CACHE STRING "Flags used by the C compiler during regular development builds.") +set(CMAKE_C_FLAGS_DEVEL + "-Wall -g -O3" + CACHE STRING "Flags used by the C compiler during regular development builds.") 
set(CMAKE_C_FLAGS_STRICT - "-Wall -O3 -Werror -Wno-error=unused-private-field" + "-Wall -O3 -Werror" CACHE STRING "Flags used by the C compiler during strict jenkins builds.") -set(CMAKE_C_FLAGS_RELEASE "-Wall -O3 -w" CACHE STRING "Flags used by the C compiler during release builds.") +set(CMAKE_C_FLAGS_RELEASE + "-Wall -O3" + CACHE STRING "Flags used by the C compiler during release builds.") set(CMAKE_C_FLAGS_HOSTDEBUG - "-Wall -Wno-unknown-pragmas -g -fno-inline -DHOST_DEBUG" + "-Wall -Wno-unknown-pragmas -g -fno-inline" CACHE STRING "Flags used by the C compiler during host-debug builds.") -set(CMAKE_C_FLAGS_DEVICEDEBUG "-Wall -DDEVICE_DEBUG" CACHE STRING "Flags used by the C compiler during device-debug builds.") +set(CMAKE_C_FLAGS_DEVICEDEBUG + "-Wall -Wno-unknown-pragmas" + CACHE STRING "Flags used by the C compiler during device-debug builds.") set(CMAKE_C_FLAGS_DEBUG - "-Wall -g -fno-inline -DHOST_DEBUG -DDEVICE_DEBUG" + "-Wall -Wno-unknown-pragmas -g -fno-inline" CACHE STRING "Flags used by the C compiler during full (host+device) debug builds.") set(CMAKE_C_FLAGS_SANITIZE - "-Wall -g -fno-inline -DHOST_DEBUG -fsanitize=address,undefined" + "-Wall -Wno-unknown-pragmas -g -fno-inline -fsanitize=address,undefined" CACHE STRING "Flags used by the C compiler during sanitizer debug builds.") enable_language(C) -# do all the build definitions -# -set(CMAKE_EXE_LINKER_FLAGS_SANITIZE ${CMAKE_EXE_LINKER_FLAGS_SANITIZE} "-fsanitize=address,undefined") +if(QUDA_INTERFACE_TIFR + OR QUDA_INTERFACE_BQCD + OR QUDA_ARPACK + OR QUDA_OPENBLAS) + set(BUILD_FORTRAN_INTERFACE ON) + enable_language(Fortran) +endif() + +# define LINKER FLAGS +set(CMAKE_EXE_LINKER_FLAGS_SANITIZE + "-fsanitize=address,undefined" + CACHE STRING "Flags used by the linker during sanitizer debug builds.") + +# define CUDA flags +set(CMAKE_CUDA_HOST_COMPILER + "${CMAKE_CXX_COMPILER}" + CACHE FILEPATH "Host compiler to be used by nvcc") +set(CMAKE_CUDA_STANDARD ${QUDA_CXX_STANDARD}) 
+set(CMAKE_CUDA_STANDARD_REQUIRED True) +mark_as_advanced(CMAKE_CUDA_HOST_COMPILER) + +include(CheckLanguage) +check_language(CUDA) + +if(${CMAKE_CUDA_COMPILER} MATCHES "nvcc") + set(QUDA_CUDA_BUILD_TYPE "NVCC") + message(STATUS "CUDA Build Type: ${QUDA_CUDA_BUILD_TYPE}") +endif() + +if(${CMAKE_CUDA_COMPILER} MATCHES "clang") + if(CMAKE_VERSION VERSION_LESS 3.18) + message(ERROR "Building QUDA with clang as CMAKE_CUDA_COMPILER requires CMake 3.18+") + endif() + set(QUDA_CUDA_BUILD_TYPE "Clang") + message(STATUS "CUDA Build Type: ${QUDA_CUDA_BUILD_TYPE}") +endif() + +set(CMAKE_CUDA_FLAGS_DEVEL + "-g -O3 " + CACHE STRING "Flags used by the CUDA compiler during regular development builds.") +set(CMAKE_CUDA_FLAGS_STRICT + "-g -O3" + CACHE STRING "Flags used by the CUDA compiler during strict jenkins builds.") +set(CMAKE_CUDA_FLAGS_RELEASE + "-O3 -w" + CACHE STRING "Flags used by the CUDA compiler during release builds.") +set(CMAKE_CUDA_FLAGS_HOSTDEBUG + "-g" + CACHE STRING "Flags used by the C++ compiler during host-debug builds.") +set(CMAKE_CUDA_FLAGS_DEVICEDEBUG + "-G" + CACHE STRING "Flags used by the C++ compiler during device-debug builds.") +set(CMAKE_CUDA_FLAGS_DEBUG + "-g -G" + CACHE STRING "Flags used by the C++ compiler during full (host+device) debug builds.") +set(CMAKE_CUDA_FLAGS_SANITIZE + "-g " + CACHE STRING "Flags used by the C++ compiler during sanitizer debug builds.") + +# This is needed now GPU ARCH +set(GITVERSION ${GITVERSION}-${QUDA_GPU_ARCH}) +string(REGEX REPLACE sm_ "" COMP_CAP ${QUDA_GPU_ARCH}) +set(CMAKE_CUDA_ARCHITECTURES ${COMP_CAP}) +set(COMP_CAP "${COMP_CAP}0") + +enable_language(CUDA) +message(STATUS "CUDA Compiler is" ${CMAKE_CUDA_COMPILER}) +message(STATUS "Compiler ID is " ${CMAKE_CUDA_COMPILER_ID}) + +# CUDA Wrapper for finding libs etc +if(CMAKE_VERSION VERSION_LESS 3.17) + find_package(CUDAWrapper) +else() + # for cmake 3.17+ we rely on + find_package(CUDAToolkit) +endif() + + +if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 
"11.0" AND CMAKE_CUDA_COMPILER_ID MATCHES "NVIDIA") + set(QUDA_HETEROGENEOUS_ATOMIC_SUPPORT ON) + message(STATUS "Heterogeneous atomics supported: ${QUDA_HETEROGENEOUS_ATOMIC_SUPPORT}") +endif() +include(CMakeDependentOption) +CMAKE_DEPENDENT_OPTION(QUDA_HETEROGENEOUS_ATOMIC "enable heterogeneous atomic support (CUDA >= 11.0)?" ON "QUDA_HETEROGENEOUS_ATOMIC_SUPPORT" OFF) + +if((QUDA_HETEROGENEOUS_ATOMIC OR QUDA_NVSHMEM) AND ${CMAKE_BUILD_TYPE} STREQUAL "SANITIZE") + message(SEND_ERROR "QUDA_HETEROGENEOUS_ATOMIC=ON AND/OR QUDA_NVSHMEM=ON do not support SANITIZE build)") +endif() +if(QUDA_HETEROGENEOUS_ATOMIC AND QUDA_JITIFY) + message(SEND_ERROR "QUDA_HETEROGENEOUS_ATOMIC=ON does not support JITIFY)") +endif() + +if(QUDA_NVSHMEM AND (${COMP_CAP} LESS "700")) + message(SEND_ERROR "QUDA_NVSHMEM=ON requires at least QUDA_GPU_ARCH=sm_70") +endif() + + +# ###################################################################################################################### +# QUDA depends on Eigen this part makes sure we can download eigen if it is not found +if(QUDA_DOWNLOAD_EIGEN) + set(EIGEN_VERSION 3.3.9) + set(EIGEN_DOWNLOAD_LOCATION ${CMAKE_SOURCE_DIR}/externals/eigen/${EIGEN_VERSION}.tar.bz2) + set(EIGEN_URL https://gitlab.com/libeigen/eigen/-/archive/${EIGEN_VERSION}/eigen-${EIGEN_VERSION}.tar.bz2) + set(EIGEN_SHA 0fa5cafe78f66d2b501b43016858070d52ba47bd9b1016b0165a7b8e04675677) + if(NOT EXISTS ${EIGEN_DOWNLOAD_LOCATION}) + message(STATUS "Checking for Eigen tarball and downloading if necessary.") + endif() + file( + DOWNLOAD ${EIGEN_URL} ${EIGEN_DOWNLOAD_LOCATION} + EXPECTED_HASH SHA256=${EIGEN_SHA} + STATUS EIGEN_DOWNLOADED) + list(GET EIGEN_DOWNLOADED 0 EIGEN_DOWNLOADED_CODE) + list(GET EIGEN_DOWNLOADED 1 EIGEN_DOWNLOADED_MSG) + if(${EIGEN_DOWNLOADED_CODE}) + message( + SEND_ERROR + "Could not download Eigen automatically (${EIGEN_DOWNLOADED_MSG}). Please download eigen from ${EIGEN_URL} and save it to ${EIGEN_DOWNLOAD_LOCATION} and try running cmake again." 
+ ) + endif() + + include(ExternalProject) + ExternalProject_Add( + Eigen + URL ${CMAKE_SOURCE_DIR}/externals/eigen/${EIGEN_VERSION}.tar.bz2 + URL_HASH SHA256=${EIGEN_SHA} + PREFIX ${CMAKE_CURRENT_BINARY_DIR}/externals/eigen/ + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "") + ExternalProject_Get_Property(Eigen source_dir) + set(EIGEN_INCLUDE_DIRS ${source_dir}) +else() + # fall back to using find_package + find_package(Eigen QUIET) + if(NOT EIGEN_FOUND) + message( + FATAL_ERROR + "QUDA requires Eigen (http://eigen.tuxfamily.org). Please either set EIGEN_INCLUDE_DIRS to path to eigen3 include directory, e.g. /usr/local/include/eigen3 or set QUDA_DOWNLOAD_EIGEN to ON to enable automatic download of the necessary components." + ) + endif() +endif() if(QUDA_MPI OR QUDA_QMP) - add_definitions(-DMULTI_GPU) # if we are using MPI and no MPI__COMPILER was specified on the command line check for MPICXX and MPICC # environment variables if((NOT MPI_CXX_COMPILER) AND DEFINED ENV{MPICXX}) @@ -359,44 +565,13 @@ if(QUDA_MPI OR QUDA_QMP) if(mpimessage) message( "Found MPIFORT/MPICC/MPICXX environment variables. If this is not what you want please use -DMPI__COMPILER and consult the cmake FindMPI documentation." - ) + ) endif() # we need to enable Fortran if we want to detect MPI_Fortran_COMPILER - if(QUDA_ARPACK) + if(QUDA_ARPACK OR QUDA_OPENBLAS) enable_language(Fortran) endif() find_package(MPI) -else() - set(COMM_OBJS comm_single.cpp) -endif() - -if(QUDA_QDPJIT) - if(NOT QUDA_QMP) - message(WARNING "Specifying QUDA_QDPJIT requires use of QUDA_QMP. 
Please set QUDA_QMP=ON and set QUDA_QMPHOME.") - endif() - add_definitions(-DUSE_QDPJIT) - include_directories(SYSTEM ${QUDA_QDPJITHOME}/include) - execute_process(COMMAND ${QUDA_QDPJITHOME}/bin/qdp \+\+-config --ldflags - OUTPUT_VARIABLE QDP_LDFLAGS - OUTPUT_STRIP_TRAILING_WHITESPACE) - execute_process(COMMAND ${QUDA_QDPJITHOME}/bin/qdp \+\+-config --libs - OUTPUT_VARIABLE QDP_LIBS - OUTPUT_STRIP_TRAILING_WHITESPACE) - find_library(QDP_LIB qdp PATH ${QUDA_QDPJITHOME}/lib) - find_library(QIO_LIB qio ${QUDA_QDPJITHOME}/lib/) - find_library(LIME_LIB lime ${QUDA_QDPJITHOME}/lib/) -endif() - -if(QUDA_MPI AND QUDA_QMP) - message( - WARNING - "Specifying QUDA_QMP and QUDA_MPI might result in undefined behavior. If you intend to use QMP set QUDA_MPI=OFF.") -endif() - -if(QUDA_MPI) - add_definitions(-DMPI_COMMS) - set(COMM_OBJS comm_mpi.cpp) - include_directories(SYSTEM ${MPI_CXX_INCLUDE_PATH}) endif() if(QUDA_DOWNLOAD_USQCD) @@ -405,153 +580,224 @@ if(QUDA_DOWNLOAD_USQCD) find_program(AUTORECONF_EXE NAMES autoreconf) endif() -if(QUDA_QMP) - if(QUDA_DOWNLOAD_USQCD) - ExternalProject_Add(QMP - GIT_REPOSITORY https://github.com/usqcd-software/qmp.git - GIT_TAG qmp2-5-1 - GIT_SHALLOW YES - PREFIX usqcd - CONFIGURE_COMMAND CC=${MPI_C_COMPILER} CXX=${MPI_CXX_COMPILER} /configure - "INSTALL=${INSTALL_EXE} -C" - --with-qmp-comms-type=MPI - --prefix= - BUILD_COMMAND ${MAKE_EXE} - INSTALL_COMMAND ${MAKE_EXE} install +if(QUDA_NVSHMEM) + if(QUDA_DOWNLOAD_NVSHMEM) + # workarounf potential UCX interaction issue with CUDA 11.3+ and UCX in NVSHMEM 2.1.2 + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS "11.3") + set(QUDA_DOWNLOAD_NVSHMEM_TAR "https://developer.download.nvidia.com/compute/redist/nvshmem/2.1.2/source/nvshmem_src_2.1.2-0.txz" CACHE STRING "location of NVSHMEM tarball") + else() + set(QUDA_DOWNLOAD_NVSHMEM_TAR "https://developer.download.nvidia.com/compute/redist/nvshmem/2.2.1/source/nvshmem_src_2.2.1-0.txz" CACHE STRING "location of NVSHMEM tarball") + endif() + 
get_filename_component(NVSHMEM_CUDA_HOME ${CUDAToolkit_INCLUDE_DIRS} DIRECTORY) + find_path(GDRCOPY_HOME NAME gdrapi.h PATHS "/usr/local/gdrcopy" ${QUDA_GDRCOPY_HOME} PATH_SUFFIXES "include") + mark_as_advanced(GDRCOPY_HOME) + if(NOT GDRCOPY_HOME) + message(SEND_ERROR "QUDA_DOWNLOAD_NVSHMEM requires gdrcopy to be installed. Please set QUDA_GDRCOPY_HOME to the location of your gdrcopy installation.") + endif() + get_filename_component(NVSHMEM_GDRCOPY_HOME ${GDRCOPY_HOME} DIRECTORY) + ExternalProject_Add(NVSHMEM + URL ${QUDA_DOWNLOAD_NVSHMEM_TAR} + PREFIX nvshmem + CONFIGURE_COMMAND "" + BUILD_IN_SOURCE ON + BUILD_COMMAND make -j8 MPICC=${MPI_C_COMPILER} CUDA_HOME=${NVSHMEM_CUDA_HOME} NVSHMEM_PREFIX= NVSHMEM_MPI_SUPPORT=1 GDRCOPY_HOME=${NVSHMEM_GDRCOPY_HOME} install + INSTALL_COMMAND "" LOG_INSTALL ON LOG_BUILD ON LOG_DOWNLOAD ON - # LOG_MERGED_STDOUTERR ON - # LOG_OUTPUT_ON_FAILURE ON ) + ExternalProject_Get_Property(NVSHMEM INSTALL_DIR) + set(QUDA_NVSHMEM_HOME ${INSTALL_DIR} CACHE PATH "path to NVSHMEM" FORCE) + set(NVSHMEM_LIBS ${INSTALL_DIR}/lib/libnvshmem.a) + set(NVSHMEM_INCLUDE ${INSTALL_DIR}/include/) + else() + if("${QUDA_NVSHMEM_HOME}" STREQUAL "") + message( FATAL_ERROR "QUDA_NVSHMEM_HOME must be defined if QUDA_NVSHMEM is set" ) + endif() + find_library(NVSHMEM_LIBS NAMES nvshmem PATHS "${QUDA_NVSHMEM_HOME}/lib/" ) + find_path(NVSHMEM_INCLUDE NAMES nvshmem.h PATHS "${QUDA_NVSHMEM_HOME}/include/" ) + endif() + + mark_as_advanced(NVSHMEM_LIBS) + mark_as_advanced(NVSHMEM_INCLUDE) + add_library(nvshmem_lib STATIC IMPORTED) + set_target_properties(nvshmem_lib PROPERTIES IMPORTED_LOCATION ${NVSHMEM_LIBS}) + set_target_properties(nvshmem_lib PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + set_target_properties(nvshmem_lib PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF) + set_target_properties(nvshmem_lib PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES CUDA) +endif() + + +if(QUDA_QDPJIT) + if(NOT QUDA_QMP) + message(WARNING "Specifying QUDA_QDPJIT requires use of QUDA_QMP. 
Please set QUDA_QMP=ON and set QUDA_QMPHOME.") + endif() + execute_process(COMMAND ${QUDA_QDPJITHOME}/bin/qdp\+\+-config --ldflags + OUTPUT_VARIABLE QDP_LDFLAGS OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND ${QUDA_QDPJITHOME}/bin/qdp\+\+-config --libs + OUTPUT_VARIABLE QDP_LIBS OUTPUT_STRIP_TRAILING_WHITESPACE) + find_library(QDP_LIB qdp PATH ${QUDA_QDPJITHOME}/lib) + find_library(QIO_LIB qio ${QUDA_QDPJITHOME}/lib/) + find_library(LIME_LIB lime ${QUDA_QDPJITHOME}/lib/) +endif() + +if(QUDA_QMP) + find_library(QMP_FOUND qmp HINTS ${QUDA_QMPHOME}/lib NO_DEFAULT_PATH) + mark_as_advanced(QMP_FOUND) + set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${QMP_FOUND}) + if(QUDA_DOWNLOAD_USQCD AND NOT QMP_FOUND) + ExternalProject_Add( + QMP + GIT_REPOSITORY https://github.com/usqcd-software/qmp.git + GIT_TAG qmp2-5-3 + GIT_SHALLOW YES + PREFIX usqcd + CONFIGURE_COMMAND CC=${MPI_C_COMPILER} CXX=${MPI_CXX_COMPILER} /configure "INSTALL=${INSTALL_EXE} -C" + "CFLAGS=-Wall -O3 -std=c99 " --with-qmp-comms-type=MPI --prefix= + BUILD_COMMAND ${MAKE_EXE} + INSTALL_COMMAND ${MAKE_EXE} install + LOG_INSTALL ON + LOG_BUILD ON + LOG_DOWNLOAD ON) ExternalProject_Get_Property(QMP INSTALL_DIR) - set(QUDA_QMPHOME ${INSTALL_DIR}) + set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${INSTALL_DIR}) + set(QUDA_QMPHOME + ${INSTALL_DIR} + CACHE PATH "path to QMP" FORCE) set(QUDA_QMP_LDFLAGS "-L${QUDA_QMPHOME}/lib" CACHE STRING "LDFLAGS for QMP - should be derived from qmp-config --ldflags") - set(QUDA_QMP_LIBS "-lqmp" CACHE STRING "LIBS for QMP - should be derived from qmp-config --libs") - ExternalProject_Add_Step(QMP reconf - COMMAND ${AUTORECONF_EXE} -fi - WORKING_DIRECTORY - DEPENDERS configure - DEPENDEES download) + set(QUDA_QMP_LIBS + "-lqmp" + CACHE STRING "LIBS for QMP - should be derived from qmp-config --libs") + ExternalProject_Add_Step( + QMP reconf + COMMAND ${AUTORECONF_EXE} -fi + WORKING_DIRECTORY + DEPENDERS configure + DEPENDEES download) 
else() if("${QUDA_QMPHOME}" STREQUAL "") message(FATAL_ERROR "QUDA_QMPHOME must be defined if QUDA_QMP is ON and QUDA_DOWNLOAD_USQCD is OFF") endif() - execute_process(COMMAND ${QUDA_QMPHOME}/bin/qmp-config --cflags - OUTPUT_VARIABLE QUDA_QMP_CFLAGS - OUTPUT_STRIP_TRAILING_WHITESPACE) - execute_process(COMMAND ${QUDA_QMPHOME}/bin/qmp-config --ldflags - OUTPUT_VARIABLE QUDA_QMP_LDFLAGS_INTERNAL - OUTPUT_STRIP_TRAILING_WHITESPACE) - execute_process(COMMAND ${QUDA_QMPHOME}/bin/qmp-config --libs - OUTPUT_VARIABLE QUDA_QMP_LIBS_INTERNAL - OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND ${QUDA_QMPHOME}/bin/qmp-config --cflags OUTPUT_VARIABLE QUDA_QMP_CFLAGS + OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND ${QUDA_QMPHOME}/bin/qmp-config --ldflags OUTPUT_VARIABLE QUDA_QMP_LDFLAGS_INTERNAL + OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND ${QUDA_QMPHOME}/bin/qmp-config --libs OUTPUT_VARIABLE QUDA_QMP_LIBS_INTERNAL + OUTPUT_STRIP_TRAILING_WHITESPACE) set(QUDA_QMP_LDFLAGS ${QUDA_QMP_LDFLAGS_INTERNAL} CACHE STRING "LDFLAGS for QMP - should be derived from qmp-config --ldflags") - set(QUDA_QMP_LIBS ${QUDA_QMP_LIBS_INTERNAL} CACHE STRING "LIBS for QMP - should be derived from qmp-config --libs") + set(QUDA_QMP_LIBS + ${QUDA_QMP_LIBS_INTERNAL} + CACHE STRING "LIBS for QMP - should be derived from qmp-config --libs") endif() - - add_definitions(-DQMP_COMMS) - - include_directories(SYSTEM ${QUDA_QMPHOME}/include) - include_directories(SYSTEM ${MPI_CXX_INCLUDE_PATH}) - set(COMM_OBJS comm_qmp.cpp) endif() if(QUDA_QIO) if(NOT QUDA_QMP) message(FATAL_ERROR "Use of QIO (via QUDA_QIO=ON) requires QMP. 
Please set QUDA_QMP=ON.") endif() - if(QUDA_DOWNLOAD_USQCD) - ExternalProject_Add(QIO - GIT_REPOSITORY https://github.com/usqcd-software/qio.git - GIT_TAG qio2-5-0 - GIT_SHALLOW YES - PREFIX usqcd - CONFIGURE_COMMAND CC=${MPI_C_COMPILER} CXX=${MPI_CXX_COMPILER} /configure - "INSTALL=${INSTALL_EXE} -C" - --with-qmp=${QUDA_QMPHOME} - --prefix= - BUILD_COMMAND make - INSTALL_COMMAND make install - DEPENDS QMP - LOG_INSTALL ON - LOG_BUILD ON - LOG_DOWNLOAD ON - # LOG_MERGED_STDOUTERR ON - # LOG_OUTPUT_ON_FAILURE ON - ) + find_library(QIO_FOUND qio HINTS ${QUDA_QIOHOME}/lib NO_DEFAULT_PATH) + mark_as_advanced(QIO_FOUND) + set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${QIO_FOUND}) + if(QUDA_DOWNLOAD_USQCD AND NOT QIO_FOUND) + set(QIO_BRANCH + "extended") + if(QUDA_DOWNLOAD_QIO_LEGACY) + set(QIO_BRANCH + "qio2-5-0") + endif() + ExternalProject_Add( + QIO + GIT_REPOSITORY https://github.com/usqcd-software/qio.git + GIT_TAG "${QIO_BRANCH}" + GIT_SHALLOW YES + PREFIX usqcd + CONFIGURE_COMMAND CC=${MPI_C_COMPILER} CXX=${MPI_CXX_COMPILER} /configure "INSTALL=${INSTALL_EXE} -C" + "CFLAGS=-Wall -O3 -std=c99 " --with-qmp=${QUDA_QMPHOME} --prefix= + --enable-largefile --disable-qmp-route --enable-dml-output-buffering --enable-dml-bufsize=33554432 + BUILD_COMMAND make + INSTALL_COMMAND make install + DEPENDS QMP + LOG_INSTALL ON + LOG_BUILD ON + LOG_DOWNLOAD ON) ExternalProject_Get_Property(QIO INSTALL_DIR) - set(QUDA_QIOHOME ${INSTALL_DIR}) - set(QUDA_LIME_HOME ${INSTALL_DIR}) - - ExternalProject_Add_Step(QIO reconf - COMMAND ${AUTORECONF_EXE} -fi - WORKING_DIRECTORY - DEPENDERS configure - DEPENDEES download) + set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${INSTALL_DIR}) + set(QUDA_QIOHOME + ${INSTALL_DIR} + CACHE PATH "path to QIO" FORCE) + set(QUDA_LIMEHOME + ${INSTALL_DIR} + CACHE PATH "path to LIME" FORCE) + ExternalProject_Add_Step( + QIO reconf + COMMAND ${AUTORECONF_EXE} -fi + WORKING_DIRECTORY + DEPENDERS configure + DEPENDEES download) 
set(QUDA_QIO_LDFLAGS "-L${QUDA_QIOHOME}/lib" CACHE STRING "LDFLAGS for QMP - should be derived from qmp-config --ldflags") - set(QUDA_QIO_LIBS "-lqio -llime" CACHE STRING "LIBS for QMP - should be derived from qmp-config --libs") + set(QUDA_QIO_LIBS + "-lqio -llime" + CACHE STRING "LIBS for QMP - should be derived from qmp-config --libs") else() if("${QUDA_QIOHOME}" STREQUAL "" OR "${QUDA_LIMEHOME}" STREQUAL "") message( FATAL_ERROR "QUDA_QIOHOME and QUDA_LIMEHOME must be defined when QUDA_QIO is ON and QUDA_DOWNLOAD_USQCD is OFF") endif() - execute_process(COMMAND ${QUDA_QIOHOME}/bin/qio-config --cflags - OUTPUT_VARIABLE QUDA_QIO_CFLAGS - OUTPUT_STRIP_TRAILING_WHITESPACE) - execute_process(COMMAND ${QUDA_QIOHOME}/bin/qio-config --ldflags - OUTPUT_VARIABLE QUDA_QIO_LDFLAGS_INTERNAL - OUTPUT_STRIP_TRAILING_WHITESPACE) - execute_process(COMMAND ${QUDA_QIOHOME}/bin/qio-config --libs - OUTPUT_VARIABLE QUDA_QIO_LIBS_INTERNAL - OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND ${QUDA_QIOHOME}/bin/qio-config --cflags OUTPUT_VARIABLE QUDA_QIO_CFLAGS + OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND ${QUDA_QIOHOME}/bin/qio-config --ldflags OUTPUT_VARIABLE QUDA_QIO_LDFLAGS_INTERNAL + OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND ${QUDA_QIOHOME}/bin/qio-config --libs OUTPUT_VARIABLE QUDA_QIO_LIBS_INTERNAL + OUTPUT_STRIP_TRAILING_WHITESPACE) set(QUDA_QIO_LDFLAGS ${QUDA_QIO_LDFLAGS_INTERNAL} CACHE STRING "LDFLAGS for QMP - should be derived from qmp-config --ldflags") - set(QUDA_QIO_LIBS ${QUDA_QIO_LIBS_INTERNAL} CACHE STRING "LIBS for QMP - should be derived from qmp-config --libs") + set(QUDA_QIO_LIBS + ${QUDA_QIO_LIBS_INTERNAL} + CACHE STRING "LIBS for QMP - should be derived from qmp-config --libs") endif() - add_definitions(-DHAVE_QIO) - set(QIO_UTIL qio_util.cpp qio_field.cpp layout_hyper.c) +endif() - include_directories(SYSTEM ${QUDA_QIOHOME}/include) - include_directories(SYSTEM ${QUDA_LIMEHOME}/include) +if(QUDA_OPENMP) + 
find_package(OpenMP) endif() if(QUDA_MAGMA) - add_definitions(-DMAGMA_LIB -DADD_ -DMAGMA_SETAFFINITY -DGPUSHMEM=300 -DHAVE_CUBLAS -DMAGMA_LIB) - find_package(OpenMP) + add_library(MAGMA::MAGMA INTERFACE IMPORTED) + target_compile_definitions(MAGMA::MAGMA INTERFACE MAGMA_LIB ADD_ MAGMA_SETAFFINITY GPUSHMEM=300 HAVE_CUBLAS) if("${QUDA_MAGMAHOME}" STREQUAL "") find_package(PkgConfig REQUIRED) pkg_check_modules(MAGMA magma) - include_directories(SYSTEM ${MAGMA_INCLUDEDIR}) + target_include_directories(MAGMA::MAGMA SYSTEM INTERFACE ${MAGMA_INCLUDEDIR}) message("${MAGMA_INCLUDEDIR}") find_library(MAGMA ${MAGMA_LIBRARIES} PATH ${MAGMA_LIBRARY_DIRS}) else() # prefer static library find_library(MAGMA libmagma.a magma ${QUDA_MAGMAHOME}/lib/) # append additional libraries required by magma - list(APPEND MAGMA ${CUDA_cublas_LIBRARY}) - list(APPEND MAGMA ${CUDA_cusparse_LIBRARY}) - list(APPEND MAGMA ${QUDA_MAGMA_LIBS}) + target_link_libraries(MAGMA::MAGMA INTERFACE ${CUDA_cublas_LIBRARY}) + target_link_libraries(MAGMA::MAGMA INTERFACE ${CUDA_cusparse_LIBRARY}) + target_link_libraries(MAGMA::MAGMA INTERFACE ${QUDA_MAGMA_LIBS}) # and any additional OpenMP linker flags - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}") - include_directories(SYSTEM ${QUDA_MAGMAHOME}/include) + target_include_directories(MAGMA::MAGMA SYSTEM INTERFACE ${QUDA_MAGMAHOME}/include) endif() + target_link_libraries(MAGMA::MAGMA INTERFACE ${MAGMA}) + find_package(OpenMP) + target_link_libraries(MAGMA::MAGMA INTERFACE OpenMP::OpenMP_CXX) endif(QUDA_MAGMA) # This selects arpack or parpack for Multi GPU if(QUDA_ARPACK) enable_language(Fortran) - add_definitions(-DARPACK_LIB) if(QUDA_MPI OR QUDA_QMP) set(ARPACK_MPI ON) @@ -561,46 +807,32 @@ if(QUDA_ARPACK) if(QUDA_DOWNLOAD_ARPACK) include(GNUInstallDirs) - - ExternalProject_Add(ARPACK-NG - GIT_REPOSITORY https://github.com/opencollab/arpack-ng.git - GIT_TAG 3.7.0 - GIT_SHALLOW YES - PREFIX arpack-ng - # CONFIGURE_COMMAND 
CC=${MPI_C_COMPILER} CXX=${MPI_CXX_COMPILER} /configure --with- - # qmp-comms-type=MPI --prefix= - CMAKE_ARGS -DMPI=${ARPACK_MPI} -DCMAKE_INSTALL_PREFIX= - CMAKE_GENERATOR "Unix Makefiles" - # BUILD_COMMAND make - # INSTALL_COMMAND make install - # LOG_INSTALL ON - # LOG_BUILD ON - # LOG_DOWNLOAD ON - # LOG_MERGED_STDOUTERR ON - # LOG_OUTPUT_ON_FAILURE ON - ) + ExternalProject_Add( + ARPACK-NG + GIT_REPOSITORY https://github.com/opencollab/arpack-ng.git + GIT_TAG 3.7.0 + GIT_SHALLOW YES + PREFIX arpack-ng + CMAKE_ARGS -DMPI=${ARPACK_MPI} -DCMAKE_INSTALL_PREFIX= + CMAKE_GENERATOR "Unix Makefiles") ExternalProject_Get_Property(ARPACK-NG INSTALL_DIR) set(QUDA_ARPACK_HOME ${INSTALL_DIR}) add_library(arpack-ng STATIC IMPORTED) add_dependencies(arpack-ng ARPACK-NG) find_package(BLAS REQUIRED) find_package(LAPACK REQUIRED) - # target_link_libraries(arpack-ng INTERFACE -L${QUDA_ARPACK_HOME}/lib -larpack) target_link_libraries(arpack-ng INTERFACE ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) set_target_properties(arpack-ng PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES Fortran) - set_target_properties(arpack-ng - PROPERTIES IMPORTED_LOCATION ${QUDA_ARPACK_HOME}/${CMAKE_INSTALL_LIBDIR}/libarpack.a) + set_target_properties(arpack-ng PROPERTIES IMPORTED_LOCATION + ${QUDA_ARPACK_HOME}/${CMAKE_INSTALL_LIBDIR}/libarpack.a) if(QUDA_MPI OR QUDA_QMP) add_library(parpack-ng STATIC IMPORTED) - # target_link_libraries(arpack-ng INTERFACE -L${QUDA_ARPACK_HOME}/lib -lparpack) target_link_libraries(parpack-ng INTERFACE arpack-ng MPI::MPI_Fortran) set_target_properties(parpack-ng PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES Fortran) - set_target_properties(parpack-ng - PROPERTIES IMPORTED_LOCATION ${QUDA_ARPACK_HOME}/${CMAKE_INSTALL_LIBDIR}/libparpack.a) - # include_directories(SYSTEM ${QUDA_ARPACK_HOME}/PARPACK/SRC/MPI) include_directories(SYSTEM - # ${QUDA_ARPACK_HOME}/PARPACK/UTIL/MPI) + set_target_properties(parpack-ng PROPERTIES IMPORTED_LOCATION + 
${QUDA_ARPACK_HOME}/${CMAKE_INSTALL_LIBDIR}/libparpack.a) endif() - + else(QUDA_DOWNLOAD_ARPACK) find_package(PkgConfig REQUIRED) @@ -611,7 +843,7 @@ if(QUDA_ARPACK) else() find_library(ARPACK ${ARPACK_LIBRARIES} PATH ${ARPACK_LIBRARY_DIRS}) endif() - + # Link the parallel library if required if(QUDA_MPI OR QUDA_QMP) pkg_check_modules(PARPACK QUIET parpack) @@ -622,254 +854,62 @@ if(QUDA_ARPACK) endif() endif() endif(QUDA_DOWNLOAD_ARPACK) - - if(QUDA_ARPACK_LOGGING) - # ARPACK-NG does not suppport logging - we must warn the user - message(WARNING "Specifying QUDA_ARPACK_LOGGING with ARPACK-NG package will cause link failures. Please ensure that QUDA_ARPACK_LOGGING=OFF if downloading ARPACK-NG or using system installed ARPACK-NG") - add_definitions(-DARPACK_LOGGING) - endif() endif(QUDA_ARPACK) -# set which precisions to enable -add_definitions(-DQUDA_PRECISION=${QUDA_PRECISION}) - -# set which precisions to enable -add_definitions(-DQUDA_RECONSTRUCT=${QUDA_RECONSTRUCT}) - -if(QUDA_SSTEP) - add_definitions(-DSSTEP) -endif() - -if(QUDA_MULTIGRID) - add_definitions(-DCUBLAS_LIB) - add_definitions(-DGPU_MULTIGRID) -endif(QUDA_MULTIGRID) - -if(QUDA_BLOCKSOLVER) - add_definitions(-DBLOCKSOLVER) -endif() - -if(QUDA_JITIFY) - add_definitions(-DJITIFY) - find_package(LibDL) -endif() - -if(QUDA_USE_EIGEN) - add_definitions(-DEIGEN) -endif() - -if(QUDA_DIRAC_WILSON) - add_definitions(-DGPU_WILSON_DIRAC) -endif(QUDA_DIRAC_WILSON) - -if(QUDA_DIRAC_DOMAIN_WALL) - add_definitions(-DGPU_DOMAIN_WALL_DIRAC) -endif(QUDA_DIRAC_DOMAIN_WALL) - -if(QUDA_DIRAC_STAGGERED) - add_definitions(-DGPU_STAGGERED_DIRAC) -endif(QUDA_DIRAC_STAGGERED) - -if(QUDA_DIRAC_CLOVER) - add_definitions(-DGPU_CLOVER_DIRAC -DGPU_WILSON_DIRAC -DGPU_GAUGE_TOOLS) -endif(QUDA_DIRAC_CLOVER) - -if(QUDA_DIRAC_TWISTED_MASS) - add_definitions(-DGPU_TWISTED_MASS_DIRAC -DGPU_WILSON_DIRAC) -endif(QUDA_DIRAC_TWISTED_MASS) - -if(QUDA_DIRAC_TWISTED_CLOVER) - add_definitions(-DGPU_TWISTED_CLOVER_DIRAC -DGPU_CLOVER_DIRAC 
-DGPU_TWISTED_MASS_DIRAC -DGPU_WILSON_DIRAC - -DGPU_GAUGE_TOOLS) -endif(QUDA_DIRAC_TWISTED_CLOVER) - -if(QUDA_DIRAC_NDEG_TWISTED_MASS) - add_definitions(-DGPU_NDEG_TWISTED_MASS_DIRAC -DGPU_TWISTED_MASS_DIRAC -DGPU_WILSON_DIRAC) -endif(QUDA_DIRAC_NDEG_TWISTED_MASS) - -if(QUDA_DIRAC_STAGGERED) - add_definitions(-DGPU_FATLINK -DGPU_UNITARIZE) -endif(QUDA_DIRAC_STAGGERED) - -if(QUDA_FORCE_GAUGE) - add_definitions(-DGPU_GAUGE_FORCE -DGPU_GAUGE_TOOLS) -endif(QUDA_FORCE_GAUGE) - -if(QUDA_FORCE_HISQ) - add_definitions(-DGPU_HISQ_FORCE -DGPU_STAGGERED_OPROD -DGPU_GAUGE_TOOLS) -endif(QUDA_FORCE_HISQ) - -if(QUDA_GAUGE_TOOLS) - add_definitions(-DGPU_GAUGE_TOOLS) -endif(QUDA_GAUGE_TOOLS) - -if(QUDA_GAUGE_ALG) - add_definitions(-DGPU_GAUGE_ALG) - add_definitions(-DGPU_GAUGE_TOOLS) - add_definitions(-DGPU_UNITARIZE) - list(APPEND QUDA_LIBS ${CUDA_cufft_LIBRARY} ${CUDA_curand_LIBRARY}) -endif(QUDA_GAUGE_ALG) - -if(QUDA_MPI_NVTX) - list(APPEND COMM_OBJS nvtx_pmpi.c) - set(QUDA_NVTX ON) -endif(QUDA_MPI_NVTX) - -if(QUDA_INTERFACE_NVTX) - add_definitions(-DINTERFACE_NVTX) - set(QUDA_NVTX ON) -endif(QUDA_INTERFACE_NVTX) - -if(QUDA_NVTX) - find_path(NVTX3 "nvtx3/nvToolsExt.h" PATHS ${CUDA_TOOLKIT_INCLUDE} NO_DEFAULT_PATH) - if(NVTX3) - add_definitions(-DQUDA_NVTX_VERSION=3) - else(NVTX) - list(APPEND QUDA_LIBS ${CUDA_nvToolsExt_LIBRARY}) - endif(NVTX3) -endif(QUDA_NVTX) - -if(QUDA_INTERFACE_QDP) - add_definitions(-DBUILD_QDP_INTERFACE) -endif(QUDA_INTERFACE_QDP) - -if(QUDA_INTERFACE_MILC) - add_definitions(-DBUILD_MILC_INTERFACE) -endif(QUDA_INTERFACE_MILC) - -if(QUDA_INTERFACE_CPS) - add_definitions(-DBUILD_CPS_INTERFACE) -endif(QUDA_INTERFACE_CPS) - -if(QUDA_INTERFACE_QDPJIT) - add_definitions(-DBUILD_QDPJIT_INTERFACE) -endif(QUDA_INTERFACE_QDPJIT) - -if(QUDA_INTERFACE_BQCD) - add_definitions(-DBUILD_BQCD_INTERFACE) -endif(QUDA_INTERFACE_BQCD) - -if(QUDA_INTERFACE_TIFR) - add_definitions(-DBUILD_TIFR_INTERFACE) -endif(QUDA_INTERFACE_TIFR) - -if(QUDA_NUMA_NVML) - 
add_definitions(-DNUMA_NVML) - set(NUMA_AFFINITY_OBJS numa_affinity.cpp) - find_package(NVML REQUIRED) - include_directories(SYSTEM NVML_INCLUDE_DIR) -endif(QUDA_NUMA_NVML) - -if(QUDA_CONTRACT) - add_definitions(-DGPU_CONTRACT) -endif(QUDA_CONTRACT) - -if(QUDA_COVDEV) - add_definitions(-DGPU_COVDEV) -endif(QUDA_COVDEV) - -# define FORTRAN FLAGS -set(CMAKE_F_FLAGS -std=c99 CACHE STRING "Fortran Flags") - -# derive whether we need to build the fortran interface -if(QUDA_INTERFACE_TIFR OR QUDA_INTERFACE_BQCD OR QUDA_ARPACK) - set(BUILD_FORTRAN_INTERFACE ON) +if(QUDA_OPENBLAS) enable_language(Fortran) -endif() - -# CUDA stuff - -set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE FILEPATH "Host compiler to be used by nvcc") -set(CMAKE_CUDA_STANDARD ${QUDA_CXX_STANDARD}) -set(CMAKE_CUDA_STANDARD_REQUIRED True) -mark_as_advanced(CMAKE_CUDA_HOST_COMPILER) -# NVCC FLAGS independent off build type - -set(QUDA_NVCC_FLAGS "-ftz=true -prec-div=false -prec-sqrt=false") -set(CMAKE_CUDA_FLAGS - "-Wno-deprecated-gpu-targets -arch=${QUDA_GPU_ARCH}" - CACHE STRING "Flags used by the CUDA compiler" FORCE) -if(QUDA_VERBOSE_BUILD) - set(CMAKE_CUDA_FLAGS - "-Wno-deprecated-gpu-targets -arch=${QUDA_GPU_ARCH} --ptxas-options=-v" - CACHE STRING "Flags used by the CUDA compiler" FORCE) -endif(QUDA_VERBOSE_BUILD) - -# define CUDA flags when CMake >= 3.8 -set(CMAKE_CUDA_DISABLE_XCOMPILER_WARNINGS - "-Wno-unknown-pragmas,-Wno-unused-function,-Wno-unused-local-typedef,-Wno-unused-private-field") -set(CMAKE_CUDA_FLAGS_DEVEL - "${QUDA_NVCC_FLAGS} -lineinfo -g -O3 -Xcompiler ${CMAKE_CUDA_DISABLE_XCOMPILER_WARNINGS}" - CACHE STRING "Flags used by the CUDA compiler during regular development builds.") -set(CMAKE_CUDA_FLAGS_STRICT - "${CMAKE_CUDA_FLAGS_DEVEL}" - CACHE STRING "Flags used by the CUDA compiler during strict jenkins builds.") -set(CMAKE_CUDA_FLAGS_RELEASE - "${QUDA_NVCC_FLAGS} -O3 -w" - CACHE STRING "Flags used by the CUDA compiler during release builds.") 
-set(CMAKE_CUDA_FLAGS_HOSTDEBUG - "${QUDA_NVCC_FLAGS} -g -lineinfo -DHOST_DEBUG" - CACHE STRING "Flags used by the C++ compiler during host-debug builds.") -set(CMAKE_CUDA_FLAGS_DEVICEDEBUG - "${QUDA_NVCC_FLAGS} -G -DDEVICE_DEBUG" - CACHE STRING "Flags used by the C++ compiler during device-debug builds.") -set(CMAKE_CUDA_FLAGS_DEBUG - "${QUDA_NVCC_FLAGS} -g -DHOST_DEBUG -DDEVICE_DEBUG -G" - CACHE STRING "Flags used by the C++ compiler during full (host+device) debug builds.") -set(CMAKE_CUDA_FLAGS_SANITIZE - "${QUDA_NVCC_FLAGS} -g -lineinfo -DHOST_DEBUG -Xcompiler -fsanitize=address,-fsanitize=undefined" - CACHE STRING "Flags used by the C++ compiler during sanitizer debug builds.") - -# CUDA Wrapper for finding libs etc -find_package(CUDAWrapper) - -# We need threads -find_package(Threads REQUIRED) - -# COMPILER OPTIONS and BUILD types -include_directories(${CMAKE_CURRENT_SOURCE_DIR}) -include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) -include_directories(include) -include_directories(lib) -# if(QUDA_JITIFY) -include_directories(${CMAKE_CURRENT_BINARY_DIR}/include) -# endif() + if(QUDA_DOWNLOAD_OPENBLAS) + include(GNUInstallDirs) + ExternalProject_Add( + OPENBLAS + GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git + GIT_TAG v0.3.10 + GIT_SHALLOW YES + PREFIX openblas + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX= + CMAKE_GENERATOR "Unix Makefiles") + ExternalProject_Get_Property(OPENBLAS INSTALL_DIR) + set(QUDA_OPENBLAS_HOME ${INSTALL_DIR}) + add_library(openblas STATIC IMPORTED) + add_dependencies(openblas OPENBLAS) + set_target_properties(openblas PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES Fortran) + set_target_properties(openblas PROPERTIES IMPORTED_LOCATION + ${QUDA_OPENBLAS_HOME}/${CMAKE_INSTALL_LIBDIR}/libopenblas.a) + + else(QUDA_DOWNLOAD_OPENBLAS) + find_package(PkgConfig REQUIRED) -# QUDA_HASH for tunecache -if(NOT GITVERSION) - set(GITVERSION ${PROJECT_VERSION}) + pkg_check_modules(OPENBLAS QUIET openblas) + if(NOT OPENBLAS_FOUND OR QUDA_OPENBLAS_HOME) + 
find_library(OPENBLAS openblas PATH ${QUDA_OPENBLAS_HOME}) + else() + find_library(OPENBLAS ${OPENBLAS_LIBRARIES} PATH ${OPENBLAS_LIBRARY_DIRS}) + endif() + endif(QUDA_DOWNLOAD_OPENBLAS) +endif(QUDA_OPENBLAS) + + +# BACKWARDS +if(QUDA_BACKWARDS) + include(FetchContent) + FetchContent_Declare( + backward-cpp + GIT_REPOSITORY https://github.com/bombela/backward-cpp.git + GIT_TAG v1.5 + GIT_SHALLOW ON) + FetchContent_GetProperties(backward-cpp) + if(NOT backward-cpp_POPULATED) + FetchContent_Populate(backward-cpp) + endif() + include(${backward-cpp_SOURCE_DIR}/BackwardConfig.cmake) endif() -file(STRINGS ${CUDA_TOOLKIT_INCLUDE}/cuda.h CUDA_VERSIONLONG REGEX "\#define CUDA_VERSION") -string(REPLACE "\#define CUDA_VERSION " "" CUDA_VERSIONLONG ${CUDA_VERSIONLONG}) -string(STRIP CUDA_VERSIONLONG ${CUDA_VERSIONLONG}) -set(HASH cpu_arch=${CPU_ARCH},gpu_arch=${QUDA_GPU_ARCH},cuda_version=${CUDA_VERSIONLONG}) # this allows simplified running of clang-tidy if(${CMAKE_BUILD_TYPE} STREQUAL "DEVEL") set(CMAKE_EXPORT_COMPILE_COMMANDS ON) endif() -# build up git version add -debug to GITVERSION if we build with debug options enabled -string(REGEX MATCH [Dd][Ee][Bb][Uu][Gg] DEBUG_BUILD ${CMAKE_BUILD_TYPE}) -if(DEBUG_BUILD) - if(GITVERSION) - set(GITVERSION ${GITVERSION}-debug) - else() - set(GITVERSION debug) - endif() -endif() - -# GPU ARCH -set(GITVERSION ${GITVERSION}-${QUDA_GPU_ARCH}) -string(REGEX REPLACE sm_ "" COMP_CAP ${QUDA_GPU_ARCH}) -set(COMP_CAP "${COMP_CAP}0") - -if(${CUDA_VERSION} STREQUAL "10.2") - set(CMAKE_CUDA_FLAGS - "${CMAKE_CUDA_FLAGS} -Xcicc \"--Xllc -dag-vectorize-ops=1\"") -endif() - # make the compiler flags an advanced option for all user defined build types (cmake defined build types are advanced by # default ) mark_as_advanced(CMAKE_CUDA_FLAGS_DEVEL) @@ -897,9 +937,12 @@ mark_as_advanced(CMAKE_C_FLAGS_DEVICEDEBUG) mark_as_advanced(CMAKE_C_FLAGS_SANITIZE) mark_as_advanced(CMAKE_F_FLAGS) -set(BUILDNAME ${HASH}) 
+mark_as_advanced(CMAKE_EXE_LINKER_FLAGS_SANITIZE) + +# enable ctest include(CTest) -# add tests and quda library + +# add tests, utils, reference, and quda library add_subdirectory(lib) add_subdirectory(tests) add_subdirectory(doc) diff --git a/LICENSE b/LICENSE index fd94b8f63d..9be63f1e23 100644 --- a/LICENSE +++ b/LICENSE @@ -1,5 +1,5 @@ -Copyright (c) 2009-2017, QUDA Developers +Copyright (c) 2009-2019, QUDA Developers Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -240,7 +240,36 @@ following license: Additional Notices - QUDA utilizes Maxim Milakov's int_fastdiv library for fast run-time - integer division. This is distributed under the Apache License, - Version 2.0. See declaration at top of int_fastdiv.h for license - specifics. +QUDA utilizes Maxim Milakov's int_fastdiv library for fast run-time +integer division. This is distributed under the Apache License, +Version 2.0. See declaration at top of int_fastdiv.h for license +specifics. + +QUDA uses CLI11 for command line parsing. THE CLI11.hpp file is provided under +following license: + + CLI11 1.8 Copyright (c) 2017-2019 University of Cincinnati, developed by Henry + Schreiner under NSF AWARD 1414736. All rights reserved. + + Redistribution and use in source and binary forms of CLI11, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + 3. 
Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/NEWS b/NEWS index c588621729..144eabc281 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,93 @@ +Version 1.1.0 - October 2021 + +- Add support for NVSHMEM communication for the Dslash operators, for + significantly improved strong scaling. See + https://github.com/lattice/quda/wiki/Multi-GPU-with-NVSHMEM for more + details. + +- Addition of the MSPCG preconditioned CG solver for Möbius + fermions. See + https://github.com/lattice/quda/wiki/The-Multi-Splitting-Preconditioned-Conjugate-Gradient-(MSPCG),-an-application-of-the-additive-Schwarz-Method + for more details. + +- Addition of the Exact One Flavor Algorithm (EOFA) for Möbius + fermions. See + https://github.com/lattice/quda/wiki/The-Exact-One-Flavor-Algorithm-(EOFA) + for more details. + +- Addition of a fully GPU native Implicitly Restarted Arnoldi + eigensolver (as opposed to partially relying on ARPACK). See + https://github.com/lattice/quda/wiki/QUDA%27s-eigensolvers#implicitly-restarted-arnoldi-eigensolver + for more details. 
+
+- Significantly reduced latency for reduction kernels through the use
+  of heterogeneous atomics. Requires CUDA 11.0+.
+
+- Addition of support for a split-grid multi-RHS solver. See
+  https://github.com/lattice/quda/wiki/Split-Grid for more details.
+
+- Continued work on enhancing and refining the staggered multigrid
+  algorithm. The MILC interface can now drive the staggered multigrid
+  solver.
+
+- Multigrid setup can now use tensor cores on Volta, Turing and Ampere
+  GPUs to accelerate the calculation. Enable with the
+  `QudaMultigridParam::use_mma` parameter.
+
+- Improved support of managed memory through the addition of a
+  prefetch API. This can dramatically improve the performance of the
+  multigrid setup when oversubscribing the memory.
+
+- Improved the performance of using MILC RHMC with QUDA.
+
+- Add support for a new internal data order FLOAT8. This is the
+  default data order for nSpin=4 half and quarter precision fields,
+  though the prior FLOAT4 order can be enabled with the cmake option
+  QUDA_FLOAT8=OFF.
+
+- Removal of the singularity from the reconstruct-8 and reconstruct-9
+  compressed gauge field ordering. This enables support for free
+  fields with these orderings.
+
+- The clover parameter convention has been codified: one can either
+  1.) pass in QudaInvertParam::kappa and QudaInvertParam::csw
+  separately, and QUDA will infer the necessary clover coefficient, or
+  2.) pass an explicit value of QudaInvertParam::clover_coeff
+  (e.g. CHROMA's use case) and that will override the above inference.
+
+- QUDA now includes fast-compilation options (QUDA_FAST_COMPILE_DSLASH
+  and QUDA_FAST_COMPILE_REDUCE) which enable much faster build times
+  for development at the expense of reduced performance.
+
+- Add support for compiling QUDA using clang for both the host and
+ +- While the bulk of the work associated with making QUDA portable to + different architectures will form the soul of QUDA 2.0, some of the + initial refactoring associated with this has been applied. + +- Significant cleanup of the tests directory to reduce boiler plate. + +- General improvements to the cmake build system using modern cmake + features. We now require cmake 3.15. + +- Extended the ctest list to include some optional benchmarks. + +- Fix a long-standing issue with multi-node Kepler GPU and Intel dual + socket systems. + +- Improved ASAN integration: SANITIZE builds now work out of the box + with no need to set the ASAN_OPTIONS environment variable. + +- Add support for the extended QIO branch (now required for MILC). + +- Bump QMP version to 2.5.3. + +- Updated to Eigen 3.3.9. + +- Multiple bug fixes and clean up to the library. Many of these are + listed here: https://github.com/lattice/quda/milestone/24?closed=1 + Version 1.0.0 - 10 January 2020 - Add support for CUDA 10.2: QUDA 1.0.0 is supported on CUDA 7.5-10.2 diff --git a/README.md b/README.md index ecf981e570..865347291d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# QUDA 1.0.0 +# QUDA 1.1.0 ## Overview @@ -29,21 +29,27 @@ QUDA includes an implementations of adaptive multigrid for the Wilson, clover-improved, twisted-mass and twisted-clover fermion actions. We note however that this is undergoing continued evolution and improvement and we highly recommend using adaptive multigrid use the -latest develop branch. More details can be found at -https://github.com/lattice/quda/wiki/Multigrid-Solver. +latest develop branch. More details can be found [here] +(https://github.com/lattice/quda/wiki/Multigrid-Solver). Support for eigen-vector deflation solvers is also included through -the Thick Restarted Lanczos Method (TRLM). For more details we refer -the user to the wiki -(https://github.com/lattice/quda/wiki/Deflated-Solvers). 
+the Thick Restarted Lanczos Method (TRLM), and we offer an Implicitly
+Restarted Arnoldi for observing non-hermitian operator spectra.
+For more details we refer the user to the wiki:
+[QUDA's eigensolvers]
+(https://github.com/lattice/quda/wiki/QUDA%27s-eigensolvers)
+[Deflating coarse grid solves in Multigrid]
+(https://github.com/lattice/quda/wiki/Multigrid-Solver#multigrid-inverter--lanczos)
 
 ## Software Compatibility:
 
 The library has been tested under Linux (CentOS 7 and Ubuntu 18.04)
-using releases 7.5 through 10.2 of the CUDA toolkit. Earlier versions
+using releases 10.1 through 11.4 of the CUDA toolkit. Earlier versions
 of the CUDA toolkit will not work, and we highly recommend the use of
-10.x. QUDA has been tested in conjunction with x86-64, IBM
-POWER8/POWER9 and ARM CPUs. CMake 3.11 or greater to required to build QUDA.
+11.x. QUDA has been tested in conjunction with x86-64, IBM
+POWER8/POWER9 and ARM CPUs. Both GCC and Clang host compilers are
+supported, with the minimum recommended versions being 7.x and 6, respectively.
+CMake 3.15 or greater is required to build QUDA.
 
 See also Known Issues below.
 
@@ -59,25 +65,25 @@ capability" of your card, either from NVIDIA's documentation or by
 running the deviceQuery example in the CUDA SDK, and pass the
 appropriate value to the `QUDA_GPU_ARCH` variable in cmake.
 
-QUDA 1.0.0, supports devices of compute capability 3.0 or greater.
-While QUDA is no longer supported on the older Fermi architecture, it
-may continue to work (assuming the user disables the use of textures
-(QUDA_TEX=OFF).
+QUDA 1.1.0 supports devices of compute capability 3.0 or greater.
+QUDA is no longer supported on the older Tesla (1.x) and Fermi (2.x)
+architectures.
 
 See also "Known Issues" below.
 
 ## Installation:
 
-The recommended method for compiling QUDA is to use cmake, and build
-QUDA in a separate directory from the source directory. 
For -instructions on how to build QUDA using cmake see this page -https://github.com/lattice/quda/wiki/Building-QUDA-with-cmake. Note that -this requires cmake version 3.11 or later. You can obtain cmake from -https://cmake.org/download/. On Linux the binary tar.gz archives unpack -into a cmake directory and usually run fine from that directory. +It is recommended to build QUDA in a separate directory from the +source directory. For instructions on how to build QUDA using cmake +see this page +https://github.com/lattice/quda/wiki/Building-QUDA-with-cmake. Note +that this requires cmake version 3.15 or later. You can obtain cmake +from https://cmake.org/download/. On Linux the binary tar.gz archives +unpack into a cmake directory and usually run fine from that +directory. -The basic steps for building cmake are: +The basic steps for building with cmake are: 1. Create a build dir, outside of the quda source directory. 2. In your build-dir run `cmake ` @@ -94,16 +100,26 @@ or specify e.g. -DQUDA_GPU_ARCH=sm_60 for a Pascal GPU in step 2. ### Multi-GPU support -QUDA supports using multiple GPUs through MPI and QMP. -To enable multi-GPU support either set `QUDA_MPI` or `QUDA_QMP` to ON when configuring QUDA through cmake. +QUDA supports using multiple GPUs through MPI and QMP, together with +the optional use of NVSHMEM GPU-initiated communication for improved +strong scaling of the Dirac operators. To enable multi-GPU support +either set `QUDA_MPI` or `QUDA_QMP` to ON when configuring QUDA +through cmake. -Note that in any case cmake will automatically try to detect your MPI installation. If you need to specify a particular MPI please set `MPI_C_COMPILER` and `MPI_CXX_COMPILER` in cmake. -See also https://cmake.org/cmake/help/v3.9/module/FindMPI.html for more help. +Note that in any case cmake will automatically try to detect your MPI +installation. If you need to specify a particular MPI please set +`MPI_C_COMPILER` and `MPI_CXX_COMPILER` in cmake. 
See also +https://cmake.org/cmake/help/v3.9/module/FindMPI.html for more help. For QMP please set `QUDA_QMP_HOME` to the installation directory of QMP. For more details see https://github.com/lattice/quda/wiki/Multi-GPU-Support +To enable NVSHMEM support set `QUDA_NVSHMEM` to ON, and set the +location of the local NVSHMEM installation with `QUDA_NVSHMEM_HOME`. +For more details see +https://github.com/lattice/quda/wiki/Multi-GPU-with-NVSHMEM + ### External dependencies The eigen-vector solvers (eigCG and incremental eigCG) by default will @@ -113,7 +129,7 @@ details). MAGMA is available from http://icl.cs.utk.edu/magma/index.html. MAGMA is enabled using the cmake option `QUDA_MAGMA=ON`. -Version 1.0.0 of QUDA includes interface for the external (P)ARPACK +Version 1.1.0 of QUDA includes interface for the external (P)ARPACK library for eigenvector computing. (P)ARPACK is available, e.g., from https://github.com/opencollab/arpack-ng. (P)ARPACK is enabled using CMake option `QUDA_ARPACK=ON`. Note that with a multi-GPU option, the @@ -168,7 +184,7 @@ communication and exterior update). ## Using the Library: Include the header file include/quda.h in your application, link against -lib/libquda.a, and study tests/invert_test.cpp (for Wilson, clover, +lib/libquda.so, and study tests/invert_test.cpp (for Wilson, clover, twisted-mass, or domain wall fermions) or tests/staggered_invert_test.cpp (for asqtad/HISQ fermions) for examples of the solver interface. The various solver options are enumerated in @@ -188,7 +204,7 @@ used on all GPUs and binary reproducibility. ## Getting Help: -Please visit http://lattice.github.com/quda for contact information. Bug +Please visit http://lattice.github.io/quda for contact information. Bug reports are especially welcome. @@ -209,7 +225,7 @@ Performance Computing, Networking, Storage and Analysis (SC), 2011 When taking advantage of adaptive multigrid, please also cite: -M. A. Clark, A. Strelchenko, M. Cheng, A. Gambhir, and R. Brower, +M. 
A. Clark, B. Joo, A. Strelchenko, M. Cheng, A. Gambhir, and R. Brower, "Accelerating Lattice QCD Multigrid on GPUs Using Fine-Grained Parallelization," International Conference for High Performance Computing, Networking, Storage and Analysis (SC), 2016 @@ -220,10 +236,14 @@ When taking advantage of block CG, please also cite: M. A. Clark, A. Strelchenko, A. Vaquero, M. Wagner, and E. Weinberg, "Pushing Memory Bandwidth Limitations Through Efficient Implementations of Block-Krylov Space Solvers on GPUs," -To be published in Comput. Phys. Commun. (2018) [arXiv:1710.09745 [hep-lat]]. +Comput. Phys. Commun. 233 (2018), 29-40 [arXiv:1710.09745 [hep-lat]]. + +When taking advantage of the Möbius MSPCG solver, please also cite: -Several other papers that might be of interest are listed at -http://lattice.github.com/quda . +Jiqun Tu, M. A. Clark, Chulwoo Jung, Robert Mawhinney, "Solving DWF +Dirac Equation Using Multi-splitting Preconditioned Conjugate Gradient +with Tensor Cores on NVIDIA GPUs," published in the Platform of +Advanced Scientific Computing (PASC21) [arXiv:2104.05615[hep-lat]]. ## Authors: @@ -237,27 +257,29 @@ http://lattice.github.com/quda . 
* Kate Clark (NVIDIA) * Michael Cheng (Boston University) * Carleton DeTar (Utah University) -* Justin Foley (NIH) -* Joel Giedt (Rensselaer Polytechnic Institute) +* Justin Foley (NIH) * Arjun Gambhir (William and Mary) +* Joel Giedt (Rensselaer Polytechnic Institute) * Steven Gottlieb (Indiana University) * Kyriakos Hadjiyiannakou (Cyprus) -* Dean Howarth (Boston University) -* Balint Joo (Jefferson Laboratory) +* Dean Howarth (Lawrence Livermore Lab, Lawrence Berkeley Lab) +* Balint Joo (OLCF, Oak Ridge National Laboratory, formerly Jefferson Lab) * Hyung-Jin Kim (Samsung Advanced Institute of Technology) * Bartek Kostrzewa (Bonn) -* Eloy Romero (William and Mary) +* James Osborn (Argonne National Laboratory) * Claudio Rebbi (Boston University) -* Guochun Shi (NCSA) +* Eloy Romero (William and Mary) * Hauke Sandmeyer (Bielefeld) * Mario Schröck (INFN) +* Guochun Shi (NCSA) * Alexei Strelchenko (Fermi National Accelerator Laboratory) -* Jiqun Tu (Columbia) +* Jiqun Tu (NVIDIA) * Alejandro Vaquero (Utah University) * Mathias Wagner (NVIDIA) * Andre Walker-Loud (Lawrence Berkley Laboratory) * Evan Weinberg (NVIDIA) -* Frank Winter (Jlab) +* Frank Winter (Jefferson Lab) +* Yi-Bo Yang (Chinese Academy of Sciences) Portions of this software were developed at the Innovative Systems Lab, diff --git a/cmake/FindCUDALibs.cmake b/cmake/FindCUDALibs.cmake index c7b5adf333..f283e7fa51 100644 --- a/cmake/FindCUDALibs.cmake +++ b/cmake/FindCUDALibs.cmake @@ -183,4 +183,5 @@ if(NOT CUDA_VERSION VERSION_LESS "7.0") endif() find_cuda_helper_libs(cuda) +set(CUDA_cuda_driver_LIBRARY ${CUDA_cuda_LIBRARY}) find_cuda_helper_libs(nvToolsExt) diff --git a/cmake/FindCUDAWrapper.cmake b/cmake/FindCUDAWrapper.cmake index 7d71e02a8c..0346c3ed59 100644 --- a/cmake/FindCUDAWrapper.cmake +++ b/cmake/FindCUDAWrapper.cmake @@ -5,54 +5,15 @@ # wrapper calls to help port cuda_add_executable / cuda_add_library over to # the new cmake cuda first class support -# FindCUDAWrapper.cmake - -#Very 
important the first step is to enable the CUDA language. -enable_language(CUDA) - # Find the CUDA_INCLUDE_DIRS and CUDA_TOOLKIT_INCLUDE like FindCUDA does -find_path(CUDA_TOOLKIT_INCLUDE +find_path(CUDAToolkit_INCLUDE_DIRS device_functions.h # Header included in toolkit PATHS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} PATH_SUFFIXES include ../include NO_DEFAULT_PATH ) -mark_as_advanced(CUDA_TOOLKIT_INCLUDE) +mark_as_advanced(CUDAToolkit_INCLUDE_DIRS) set(CUDA_TOOLKIT_TARGET_DIR_INTERNAL "${CUDA_TOOLKIT_TARGET_DIR}" CACHE INTERNAL "This is the value of the last time CUDA_TOOLKIT_TARGET_DIR was set successfully." FORCE) -set(CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) - - -# Setup CUDA_LIBRARIES -set(CUDA_LIBRARIES ${CMAKE_CUDA_IMPLICIT_LINK_LIBRARIES}) -if(APPLE) - # We need to add the default path to the driver (libcuda.dylib) as an rpath, so that - # the static cuda runtime can find it at runtime. - list(APPEND CUDA_LIBRARIES -Wl,-rpath,/usr/local/cuda/lib) -endif() - -# wrapper for cuda_add_library -# Issues: -# -function(cuda_add_library) - add_library(${ARGV}) - target_include_directories(${ARGV0} PUBLIC - ${CUDA_INCLUDE_DIRS}) - target_link_libraries(${ARGV0} PUBLIC ${CUDA_LIBRARIES}) - set_target_properties(${ARGV0} PROPERTIES LINKER_LANGUAGE CUDA) -endfunction() - - -# wrapper for cuda_add_library -# Issues: -# -function(cuda_add_executable) - add_executable(${ARGV}) - target_include_directories(${ARGV0} PUBLIC - ${CUDA_INCLUDE_DIRS}) - target_link_libraries(${ARGV0} ${CUDA_LIBRARIES}) - set_target_properties(${ARGV0} PROPERTIES LINKER_LANGUAGE CUDA) -endfunction() - find_package(CUDALibs) \ No newline at end of file diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt index 58fb5536c7..ea70cf76fc 100644 --- a/doc/CMakeLists.txt +++ b/doc/CMakeLists.txt @@ -1,18 +1,23 @@ -# add doxygen add doxygen documentation -# note that cmake 3.9 introduced a nicer way to do this but we don't want to require cmake 3.9 by default yet +# add doxygen add doxygen documentation 
note that cmake 3.9 introduced a nicer way to do this but we don't want to +# require cmake 3.9 by default yet option(QUDA_GENERATE_DOXYGEN "generate doxygen documentation") if(QUDA_GENERATE_DOXYGEN) -find_package(Doxygen ) + find_package(Doxygen) -if(DOXYGEN_FOUND) -if(DOXYGEN_DOT_FOUND) - get_filename_component(DOXYGEN_DOT_PATH ${DOXYGEN_DOT_EXECUTABLE} DIRECTORY) -endif() -set(DOXYGEN_OUT ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile) -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in ${DOXYGEN_OUT} @ONLY) + if(DOXYGEN_FOUND) + if(DOXYGEN_DOT_FOUND) + get_filename_component(DOXYGEN_DOT_PATH ${DOXYGEN_DOT_EXECUTABLE} DIRECTORY) + endif() + set(DOXYGEN_OUT ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in ${DOXYGEN_OUT} @ONLY) -add_custom_target(doc COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_OUT} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMENT "Generating doxygen documentation" VERBATIM) -endif() + add_custom_target( + doc + COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_OUT} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + COMMENT "Generating doxygen documentation" + VERBATIM) + endif() endif() diff --git a/include/atomic.cuh b/include/atomic.cuh index 60640d2464..6f5c2e7479 100644 --- a/include/atomic.cuh +++ b/include/atomic.cuh @@ -152,3 +152,19 @@ static inline __device__ float atomicMax(float *addr, float val){ return __uint_as_float(old); } + +/** + @brief Implementation of single-precision atomic max specialized + for positive-definite numbers. Here we take advantage of the + property that when positive floating point numbers are + reinterpretted as unsigned integers, they have the same unique + sorted order. 
+ + @param addr Address that stores the atomic variable to be updated + @param val Value to be added to the atomic +*/ +static inline __device__ float atomicAbsMax(float *addr, float val){ + uint32_t val_ = __float_as_uint(val); + uint32_t *addr_ = reinterpret_cast(addr); + return atomicMax(addr_, val_); +} diff --git a/include/blas_cublas.h b/include/blas_cublas.h deleted file mode 100644 index 97e4551b51..0000000000 --- a/include/blas_cublas.h +++ /dev/null @@ -1,32 +0,0 @@ -#include - -#pragma once - -namespace quda { - namespace cublas { - - /** - @brief Create the CUBLAS context - */ - void init(); - - /** - @brief Destroy the CUBLAS context - */ - void destroy(); - - /** - Batch inversion the matrix field using an LU decomposition method. - @param[out] Ainv Matrix field containing the inverse matrices - @param[in] A Matrix field containing the input matrices - @param[in] n Dimension each matrix - @param[in] batch Problem batch size - @param[in] precision Precision of the input/output data - @param[in] Location of the input/output data - @return Number of flops done in this computation - */ - long long BatchInvertMatrix(void *Ainv, void* A, const int n, const int batch, QudaPrecision precision, QudaFieldLocation location); - - } // namespace cublas - -} // namespace quda diff --git a/include/blas_helper.cuh b/include/blas_helper.cuh index c2781d1038..43ffdf0533 100644 --- a/include/blas_helper.cuh +++ b/include/blas_helper.cuh @@ -7,53 +7,16 @@ #include #endif -// these definitions are used to avoid calling -// std::complex::real/imag which have C++11 ABI incompatibility -// issues with certain versions of GCC - -#define REAL(a) (*((double *)&a)) -#define IMAG(a) (*((double *)&a + 1)) - namespace quda { - inline void checkSpinor(const ColorSpinorField &a, const ColorSpinorField &b) - { - if (a.Length() != b.Length()) errorQuda("lengths do not match: %lu %lu", a.Length(), b.Length()); - if (a.Stride() != b.Stride()) errorQuda("strides do not match: %d %d", 
a.Stride(), b.Stride()); - } - - inline void checkLength(const ColorSpinorField &a, const ColorSpinorField &b) - { - if (a.Length() != b.Length()) errorQuda("lengths do not match: %lu %lu", a.Length(), b.Length()); - if (a.Stride() != b.Stride()) errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); - } - -#ifdef QUAD_SUM -#define QudaSumFloat doubledouble -#define QudaSumFloat2 doubledouble2 -#define QudaSumFloat3 doubledouble3 - template <> struct scalar { - typedef doubledouble type; - }; - template <> struct scalar { - typedef doubledouble type; - }; - template <> struct scalar { - typedef doubledouble type; - }; - template <> struct scalar { - typedef doubledouble type; - }; - template <> struct vector { - typedef doubledouble2 type; + template struct memory_access { + static constexpr bool X = X_; + static constexpr bool Y = Y_; + static constexpr bool Z = Z_; + static constexpr bool W = W_; + static constexpr bool V = V_; }; -#else -#define QudaSumFloat double -#define QudaSumFloat2 double2 -#define QudaSumFloat3 double3 -#define QudaSumFloat4 double4 -#endif __host__ __device__ inline double set(double &x) { return x; } __host__ __device__ inline double2 set(double2 &x) { return x; } @@ -97,4 +60,427 @@ namespace quda } #endif + // Vector types used for AoS load-store on CPU + template <> struct VectorType { + using type = vector_type; + }; + template <> struct VectorType { + using type = vector_type; + }; + template <> struct VectorType { + using type = vector_type; + }; + template <> struct VectorType { + using type = vector_type; + }; + template <> struct VectorType { + using type = vector_type; + }; + template <> struct VectorType { + using type = vector_type; + }; + template <> struct VectorType { + using type = vector_type; + }; + template <> struct VectorType { + using type = vector_type; + }; + + namespace blas + { + + template struct SpinorNorm { + using norm_t = float; + norm_t *norm; + unsigned int cb_norm_offset; + + SpinorNorm() : 
norm(nullptr), cb_norm_offset(0) {} + + SpinorNorm(const ColorSpinorField &x) : + norm((norm_t *)x.Norm()), + cb_norm_offset(x.NormBytes() / (2 * sizeof(norm_t))) + { + } + + SpinorNorm(const SpinorNorm &sn) : norm(sn.norm), cb_norm_offset(sn.cb_norm_offset) {} + + SpinorNorm &operator=(const SpinorNorm &src) + { + if (&src != this) { + norm = src.norm; + cb_norm_offset = src.cb_norm_offset; + } + return *this; + } + + void set(const ColorSpinorField &x) + { + norm = (norm_t *)x.Norm(); + cb_norm_offset = x.NormBytes() / (2 * sizeof(norm_t)); + } + + __device__ __host__ inline norm_t load_norm(const int i, const int parity = 0) const + { + return norm[cb_norm_offset * parity + i]; + } + + template + __device__ __host__ inline norm_t store_norm(const vector_type, n> &v, int x, int parity) + { + norm_t max_[n]; + // two-pass to increase ILP (assumes length divisible by two, e.g. complex-valued) +#pragma unroll + for (int i = 0; i < n; i++) max_[i] = fmaxf(fabsf((norm_t)v[i].real()), fabsf((norm_t)v[i].imag())); + norm_t scale = 0.0; +#pragma unroll + for (int i = 0; i < n; i++) scale = fmaxf(max_[i], scale); + norm[x + parity * cb_norm_offset] = scale; + +#ifdef __CUDA_ARCH__ + return __fdividef(fixedMaxValue::value, scale); +#else + return fixedMaxValue::value / scale; +#endif + } + + norm_t *Norm() { return norm; } + }; + + template struct SpinorNorm { + using norm_t = float; + SpinorNorm() {} + SpinorNorm(const ColorSpinorField &x) {} + SpinorNorm(const SpinorNorm &sn) {} + SpinorNorm &operator=(const SpinorNorm &src) { return *this; } + void set(const ColorSpinorField &x) {} + __device__ __host__ inline norm_t load_norm(const int i, const int parity = 0) const { return 1.0; } + template + __device__ __host__ inline norm_t store_norm(const vector_type, n> &v, int x, int parity) + { + return 1.0; + } + void backup(char **norm_h, size_t norm_bytes) {} + void restore(char **norm_h, size_t norm_bytes) {} + norm_t *Norm() { return nullptr; } + }; + + /** + @param 
RegType Register type used in kernel + @param InterType Intermediate format - RegType precision with StoreType ordering + @param StoreType Type used to store field in memory + @param N Length of vector of RegType elements that this Spinor represents + */ + template struct Spinor : SpinorNorm::value> { + using SN = SpinorNorm::value>; + using Vector = typename VectorType::type; + store_t *spinor; + int stride; + unsigned int cb_offset; + + Spinor() : SN(), spinor(nullptr), stride(0), cb_offset(0) {} + + Spinor(const ColorSpinorField &x) : + SN(x), + spinor(static_cast(const_cast(x).V())), + stride(x.Stride()), + cb_offset(x.Bytes() / (2 * sizeof(store_t) * N)) + { + } + + Spinor(const Spinor &st) : SN(st), spinor(st.spinor), stride(st.stride), cb_offset(st.cb_offset) {} + + Spinor &operator=(const Spinor &src) + { + if (&src != this) { + SN::operator=(src); + spinor = src.spinor; + stride = src.stride; + cb_offset = src.cb_offset; + } + return *this; + } + + void set(const ColorSpinorField &x) + { + SN::set(x); + spinor = static_cast(const_cast(x).V()); + stride = x.Stride(); + cb_offset = x.Bytes() / (2 * sizeof(store_t) * N); + } + + template + __device__ __host__ inline void load(vector_type, n> &v, int x, int parity = 0) const + { + constexpr int len = 2 * n; // real-valued length + float nrm = isFixed::value ? 
SN::load_norm(x, parity) : 0.0; + + vector_type v_; + + constexpr int M = len / N; +#pragma unroll + for (int i = 0; i < M; i++) { + // first load from memory + Vector vecTmp = vector_load(spinor, parity * cb_offset + x + stride * i); + // now copy into output and scale +#pragma unroll + for (int j = 0; j < N; j++) copy_and_scale(v_[i * N + j], reinterpret_cast(&vecTmp)[j], nrm); + } + + for (int i = 0; i < n; i++) { v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]); } + } + + template + __device__ __host__ inline void save(const vector_type, n> &v, int x, int parity = 0) + { + constexpr int len = 2 * n; // real-valued length + vector_type v_; + + if (isFixed::value) { + real scale_inv = SN::template store_norm(v, x, parity); +#pragma unroll + for (int i = 0; i < n; i++) { + v_[2 * i + 0] = scale_inv * v[i].real(); + v_[2 * i + 1] = scale_inv * v[i].imag(); + } + } else { +#pragma unroll + for (int i = 0; i < n; i++) { + v_[2 * i + 0] = v[i].real(); + v_[2 * i + 1] = v[i].imag(); + } + } + + constexpr int M = len / N; +#pragma unroll + for (int i = 0; i < M; i++) { + Vector vecTmp; + // first do scalar copy converting into storage type +#pragma unroll + for (int j = 0; j < N; j++) copy_scaled(reinterpret_cast(&vecTmp)[j], v_[i * N + j]); + // second do vectorized copy into memory + vector_store(spinor, parity * cb_offset + x + stride * i, vecTmp); + } + } + }; + + // n_vector defines the granularity of load/store, e.g., sets the + // size of vector we load from memory + template constexpr int n_vector() { return 0; } + + // native ordering + template <> constexpr int n_vector() { return 2; } + template <> constexpr int n_vector() { return 2; } + + template <> constexpr int n_vector() { return 2; } + template <> constexpr int n_vector() { return 2; } + + template <> constexpr int n_vector() { return 4; } + template <> constexpr int n_vector() { return 4; } + + template <> constexpr int n_vector() { return 4; } + template <> constexpr int n_vector() { return 2; } + 
+#ifdef FLOAT8 + template <> constexpr int n_vector() { return 8; } +#else + template <> constexpr int n_vector() { return 4; } +#endif + template <> constexpr int n_vector() { return 2; } + +#ifdef FLOAT8 + template <> constexpr int n_vector() { return 8; } +#else + template <> constexpr int n_vector() { return 4; } +#endif + template <> constexpr int n_vector() { return 2; } + + // Just use float-2/float-4 ordering on CPU when not site unrolling + template <> constexpr int n_vector() { return 2; } + template <> constexpr int n_vector() { return 2; } + template <> constexpr int n_vector() { return 4; } + template <> constexpr int n_vector() { return 4; } + + // AoS ordering is used on CPU uses when we are site unrolling + template <> constexpr int n_vector() { return 24; } + template <> constexpr int n_vector() { return 6; } + template <> constexpr int n_vector() { return 24; } + template <> constexpr int n_vector() { return 6; } + template <> constexpr int n_vector() { return 24; } + template <> constexpr int n_vector() { return 6; } + template <> constexpr int n_vector() { return 24; } + template <> constexpr int n_vector() { return 6; } + +#if defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ <= 9 +#define constexpr +#endif + + template