diff --git a/Modules/FindCUDA.cmake b/Modules/FindCUDA.cmake index b11105077..86f89d8bc 100644 --- a/Modules/FindCUDA.cmake +++ b/Modules/FindCUDA.cmake @@ -199,6 +199,24 @@ # specified by CUDA_64_BIT_DEVICE_CODE. Note that this is a function # instead of a macro. # +# CUDA_SELECT_NVCC_ARCH_FLAGS(out_variable [target_CUDA_architectures]) +# -- Selects GPU arch flags for nvcc based on target_CUDA_architectures +# target_CUDA_architectures : Auto | Common | All | LIST(ARCH_AND_PTX ...) +# - "Auto" detects local machine GPU compute arch at runtime. +# - "Common" and "All" cover common and entire subsets of architectures +# ARCH_AND_PTX : NAME | NUM.NUM | NUM.NUM(NUM.NUM) | NUM.NUM+PTX +# NAME: Fermi Kepler Maxwell Kepler+Tegra Kepler+Tesla Maxwell+Tegra Pascal +# NUM: Any number. Only those pairs are currently accepted by NVCC though: +# 2.0 2.1 3.0 3.2 3.5 3.7 5.0 5.2 5.3 6.0 6.2 +# Returns LIST of flags to be added to CUDA_NVCC_FLAGS in ${out_variable} +# Additionally, sets ${out_variable}_readable to the resulting numeric list +# Example: +# CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS 3.0 3.5+PTX 5.2(5.0) Maxwell) +# LIST(APPEND CUDA_NVCC_FLAGS ${ARCH_FLAGS}) +# +# More info on CUDA architectures: https://en.wikipedia.org/wiki/CUDA +# Note that this is a function instead of a macro. +# # CUDA_WRAP_SRCS ( cuda_target format generated_files file0 file1 ... # [STATIC | SHARED | MODULE] [OPTIONS ...] ) # -- This is where all the magic happens. 
CUDA_ADD_EXECUTABLE, @@ -562,6 +580,7 @@ macro(cuda_unset_include_and_libraries) unset(CUDA_nvcuvenc_LIBRARY CACHE) unset(CUDA_nvcuvid_LIBRARY CACHE) unset(CUDA_USE_STATIC_CUDA_RUNTIME CACHE) + unset(CUDA_GPU_DETECT_OUTPUT CACHE) endmacro() # Check to see if the CUDA_TOOLKIT_ROOT_DIR and CUDA_SDK_ROOT_DIR have changed, @@ -577,21 +596,21 @@ if(NOT "${CUDA_TOOLKIT_TARGET_DIR}" STREQUAL "${CUDA_TOOLKIT_TARGET_DIR_INTERNAL cuda_unset_include_and_libraries() endif() -if(NOT "${CUDA_SDK_ROOT_DIR}" STREQUAL "${CUDA_SDK_ROOT_DIR_INTERNAL}") - # No specific variables to catch. Use this kind of code before calling - # find_package(CUDA) to clean up any variables that may depend on this path. +# +# End of unset() +# - # unset(MY_SPECIAL_CUDA_SDK_INCLUDE_DIR CACHE) - # unset(MY_SPECIAL_CUDA_SDK_LIBRARY CACHE) -endif() +# +# Start looking for things +# # Search for the cuda distribution. -if(NOT CUDA_TOOLKIT_ROOT_DIR) - +if(NOT CUDA_TOOLKIT_ROOT_DIR AND NOT CMAKE_CROSSCOMPILING) # Search in the CUDA_BIN_PATH first. find_path(CUDA_TOOLKIT_ROOT_DIR NAMES nvcc nvcc.exe PATHS + ENV CUDA_TOOLKIT_ROOT ENV CUDA_PATH ENV CUDA_BIN_PATH PATH_SUFFIXES bin bin64 @@ -611,6 +630,7 @@ if(NOT CUDA_TOOLKIT_ROOT_DIR) string(REGEX REPLACE "[/\\\\]?bin[64]*[/\\\\]?$" "" CUDA_TOOLKIT_ROOT_DIR ${CUDA_TOOLKIT_ROOT_DIR}) # We need to force this back into the cache. set(CUDA_TOOLKIT_ROOT_DIR ${CUDA_TOOLKIT_ROOT_DIR} CACHE PATH "Toolkit location." 
FORCE) + set(CUDA_TOOLKIT_TARGET_DIR ${CUDA_TOOLKIT_ROOT_DIR}) endif() if (NOT EXISTS ${CUDA_TOOLKIT_ROOT_DIR}) @@ -622,8 +642,45 @@ if(NOT CUDA_TOOLKIT_ROOT_DIR) endif () endif () +if(CMAKE_CROSSCOMPILING) + SET (CUDA_TOOLKIT_ROOT $ENV{CUDA_TOOLKIT_ROOT}) + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7-a") + # Support for NVPACK + set (CUDA_TOOLKIT_TARGET_NAME "armv7-linux-androideabi") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm") + # Support for arm cross compilation + set(CUDA_TOOLKIT_TARGET_NAME "armv7-linux-gnueabihf") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + # Support for aarch64 cross compilation + if (ANDROID_ARCH_NAME STREQUAL "arm64") + set(CUDA_TOOLKIT_TARGET_NAME "aarch64-linux-androideabi") + else() + set(CUDA_TOOLKIT_TARGET_NAME "aarch64-linux") + endif (ANDROID_ARCH_NAME STREQUAL "arm64") + endif() + + if (EXISTS "${CUDA_TOOLKIT_ROOT}/targets/${CUDA_TOOLKIT_TARGET_NAME}") + set(CUDA_TOOLKIT_TARGET_DIR "${CUDA_TOOLKIT_ROOT}/targets/${CUDA_TOOLKIT_TARGET_NAME}" CACHE PATH "CUDA Toolkit target location.") + SET (CUDA_TOOLKIT_ROOT_DIR ${CUDA_TOOLKIT_ROOT}) + mark_as_advanced(CUDA_TOOLKIT_TARGET_DIR) + endif() + + # add known CUDA target root path to the set of directories we search for programs, libraries and headers + set( CMAKE_FIND_ROOT_PATH "${CUDA_TOOLKIT_TARGET_DIR};${CMAKE_FIND_ROOT_PATH}") + macro( cuda_find_host_program ) + find_host_program( ${ARGN} ) + endmacro() +else() + # for non-cross-compile, find_host_program == find_program and CUDA_TOOLKIT_TARGET_DIR == CUDA_TOOLKIT_ROOT_DIR + macro( cuda_find_host_program ) + find_program( ${ARGN} ) + endmacro() + SET (CUDA_TOOLKIT_TARGET_DIR ${CUDA_TOOLKIT_ROOT_DIR}) +endif() + + # CUDA_NVCC_EXECUTABLE -find_program(CUDA_NVCC_EXECUTABLE +cuda_find_host_program(CUDA_NVCC_EXECUTABLE NAMES nvcc PATHS "${CUDA_TOOLKIT_ROOT_DIR}" ENV CUDA_PATH @@ -632,7 +689,7 @@ find_program(CUDA_NVCC_EXECUTABLE NO_DEFAULT_PATH ) # Search default search paths, after we search our own set of paths.
-find_program(CUDA_NVCC_EXECUTABLE nvcc) +cuda_find_host_program(CUDA_NVCC_EXECUTABLE nvcc) mark_as_advanced(CUDA_NVCC_EXECUTABLE) if(CUDA_NVCC_EXECUTABLE AND NOT CUDA_VERSION) @@ -648,33 +705,14 @@ else() string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR "${CUDA_VERSION}") endif() + # Always set this convenience variable set(CUDA_VERSION_STRING "${CUDA_VERSION}") -# Support for arm cross compilation with CUDA 5.5 -if(CUDA_VERSION VERSION_GREATER "5.0" AND CMAKE_CROSSCOMPILING AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm" AND EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf") - set(CUDA_TOOLKIT_TARGET_DIR "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf" CACHE PATH "Toolkit target location.") -# Support for aarch64 cross compilation with CUDA 7.0 -elseif(CUDA_VERSION VERSION_GREATER "6.5" AND CMAKE_CROSSCOMPILING AND CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" AND EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/aarch64-linux") - set(CUDA_TOOLKIT_TARGET_DIR "${CUDA_TOOLKIT_ROOT_DIR}/targets/aarch64-linux" CACHE PATH "Toolkit target location.") -else() - set(CUDA_TOOLKIT_TARGET_DIR "${CUDA_TOOLKIT_ROOT_DIR}" CACHE PATH "Toolkit target location.") -endif() -mark_as_advanced(CUDA_TOOLKIT_TARGET_DIR) - -# Target CPU architecture -if(CUDA_VERSION VERSION_GREATER "5.0" AND CMAKE_CROSSCOMPILING AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm") - set(_cuda_target_cpu_arch_initial "ARM") -else() - set(_cuda_target_cpu_arch_initial "") -endif() -set(CUDA_TARGET_CPU_ARCH ${_cuda_target_cpu_arch_initial} CACHE STRING "Specify the name of the class of CPU architecture for which the input files must be compiled.") -mark_as_advanced(CUDA_TARGET_CPU_ARCH) - # CUDA_TOOLKIT_INCLUDE find_path(CUDA_TOOLKIT_INCLUDE device_functions.h # Header included in toolkit - PATHS "${CUDA_TOOLKIT_TARGET_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}" + PATHS ${CUDA_TOOLKIT_TARGET_DIR} ENV CUDA_PATH ENV CUDA_INC_PATH PATH_SUFFIXES include @@ -704,19 +742,21 @@ 
macro(cuda_find_library_local_first_with_path_ext _var _names _doc _path_ext ) # (lib/Win32) and the old path (lib). find_library(${_var} NAMES ${_names} - PATHS "${CUDA_TOOLKIT_TARGET_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}" + PATHS "${CUDA_TOOLKIT_TARGET_DIR}" ENV CUDA_PATH ENV CUDA_LIB_PATH PATH_SUFFIXES ${_cuda_64bit_lib_dir} "${_path_ext}lib/Win32" "${_path_ext}lib" "${_path_ext}libWin32" DOC ${_doc} NO_DEFAULT_PATH ) - # Search default search paths, after we search our own set of paths. - find_library(${_var} - NAMES ${_names} - PATHS "/usr/lib/nvidia-current" - DOC ${_doc} - ) + if (NOT CMAKE_CROSSCOMPILING) + # Search default search paths, after we search our own set of paths. + find_library(${_var} + NAMES ${_names} + PATHS "/usr/lib/nvidia-current" + DOC ${_doc} + ) + endif() endmacro() macro(cuda_find_library_local_first _var _names _doc) @@ -737,7 +777,8 @@ if(CUDA_VERSION VERSION_EQUAL "3.0") CUDA_CUDARTEMU_LIBRARY ) endif() -if(NOT CUDA_VERSION VERSION_LESS "5.5") + +if(CUDA_USE_STATIC_CUDA_RUNTIME AND NOT CUDA_VERSION VERSION_LESS "5.5") cuda_find_library_local_first(CUDA_cudart_static_LIBRARY cudart_static "static CUDA runtime library") mark_as_advanced(CUDA_cudart_static_LIBRARY) endif() @@ -773,12 +814,12 @@ if(CUDA_USE_STATIC_CUDA_RUNTIME) else() unset(CMAKE_THREAD_PREFER_PTHREAD) endif() - if (NOT APPLE) - # Here is librt that has things such as, clock_gettime, shm_open, and shm_unlink. - find_library(CUDA_rt_LIBRARY rt) - if (NOT CUDA_rt_LIBRARY) - message(WARNING "Expecting to find librt for libcudart_static, but didn't find it.") - endif() + endif() + if (NOT APPLE AND CUDA_VERSION VERSION_LESS "7.0") + # Before CUDA 7.0, there was librt that has things such as, clock_gettime, shm_open, and shm_unlink. 
+ find_library(CUDA_rt_LIBRARY rt) + if (NOT CUDA_rt_LIBRARY) + message(WARNING "Expecting to find librt for libcudart_static, but didn't find it.") endif() endif() endif() @@ -988,6 +1029,7 @@ endmacro() cuda_find_helper_file(parse_cubin cmake) cuda_find_helper_file(make2cmake cmake) cuda_find_helper_file(run_nvcc cmake) +include("${CMAKE_CURRENT_LIST_DIR}/FindCUDA/select_compute_arch.cmake") ############################################################################## # Separate the OPTIONS out from the sources diff --git a/Modules/FindCUDA/select_compute_arch.cmake b/Modules/FindCUDA/select_compute_arch.cmake new file mode 100644 index 000000000..d51683193 --- /dev/null +++ b/Modules/FindCUDA/select_compute_arch.cmake @@ -0,0 +1,195 @@ +# Synopsis: +# CUDA_SELECT_NVCC_ARCH_FLAGS(out_variable [target_CUDA_architectures]) +# -- Selects GPU arch flags for nvcc based on target_CUDA_architectures +# target_CUDA_architectures : Auto | Common | All | LIST(ARCH_AND_PTX ...) +# - "Auto" detects local machine GPU compute arch at runtime. +# - "Common" and "All" cover common and entire subsets of architectures +# ARCH_AND_PTX : NAME | NUM.NUM | NUM.NUM(NUM.NUM) | NUM.NUM+PTX +# NAME: Fermi Kepler Maxwell Kepler+Tegra Kepler+Tesla Maxwell+Tegra Pascal +# NUM: Any number. 
#        Only those pairs are currently accepted by NVCC though:
#        2.0 2.1 3.0 3.2 3.5 3.7 5.0 5.2 5.3 6.0 6.1 6.2
#      Returns LIST of flags to be added to CUDA_NVCC_FLAGS in ${out_variable}
#      Additionally, sets ${out_variable}_readable to the resulting numeric list
#      Example:
#       CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS 3.0 3.5+PTX 5.2(5.0) Maxwell)
#        LIST(APPEND CUDA_NVCC_FLAGS ${ARCH_FLAGS})
#
#      More info on CUDA architectures: https://en.wikipedia.org/wiki/CUDA
#

# Architecture names that the "All" selector expands to.
set(CUDA_KNOWN_GPU_ARCHITECTURES "Fermi" "Kepler" "Maxwell")

# Architectures that the "Common" selector expands to. Also used as the
# fallback when "Auto" detection fails.
set(CUDA_COMMON_GPU_ARCHITECTURES "3.0" "3.5" "5.0")

if(CUDA_VERSION VERSION_GREATER "6.5")
  list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Kepler+Tegra" "Kepler+Tesla" "Maxwell+Tegra")
  list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2")
endif()

if(CUDA_VERSION VERSION_GREATER "7.5")
  list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Pascal")
  list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "6.0" "6.1" "6.1+PTX")
else()
  list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2+PTX")
endif()


################################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled)
# Usage:
#   CUDA_DETECT_INSTALLED_GPUS(OUT_VARIABLE)
#
# Writes a small CUDA program into ${PROJECT_BINARY_DIR}, compiles and runs it
# with "${CUDA_NVCC_EXECUTABLE} --run", and stores what it prints (one
# "major.minor" entry per device, space separated) in the internal cache entry
# CUDA_GPU_DETECT_OUTPUT, so the probe runs only until it first succeeds.
#
#   OUT_VARIABLE - name of the variable set in the caller's scope: the detected
#                  architectures, or CUDA_COMMON_GPU_ARCHITECTURES when the
#                  probe failed or printed nothing.
#
function(CUDA_DETECT_INSTALLED_GPUS OUT_VARIABLE)
  if(NOT CUDA_GPU_DETECT_OUTPUT)
    set(cufile "${PROJECT_BINARY_DIR}/detect_cuda_archs.cu")

    # The probe calls std::printf and the CUDA runtime API, so both headers
    # must be named explicitly for it to compile.
    file(WRITE "${cufile}" ""
      "#include <cuda_runtime.h>\n"
      "#include <cstdio>\n"
      "int main()\n"
      "{\n"
      "  int count = 0;\n"
      "  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
      "  if (count == 0) return -1;\n"
      "  for (int device = 0; device < count; ++device)\n"
      "  {\n"
      "    cudaDeviceProp prop;\n"
      "    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
      "      std::printf(\"%d.%d \", prop.major, prop.minor);\n"
      "  }\n"
      "  return 0;\n"
      "}\n")

    execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "--run" "${cufile}"
                    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
                    RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

    if(nvcc_res EQUAL 0)
      # Rewrite 2.1 as "2.1(2.0)" so that sm_21 code is built from compute_20.
      string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
      set(CUDA_GPU_DETECT_OUTPUT ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_gpus tool" FORCE)
    endif()
  endif()

  if(NOT CUDA_GPU_DETECT_OUTPUT)
    message(STATUS "Automatic GPU detection failed. Building for common architectures.")
    set(${OUT_VARIABLE} ${CUDA_COMMON_GPU_ARCHITECTURES} PARENT_SCOPE)
  else()
    set(${OUT_VARIABLE} ${CUDA_GPU_DETECT_OUTPUT} PARENT_SCOPE)
  endif()
endfunction()


################################################################################################
# Function for selecting GPU arch flags for nvcc based on CUDA architectures from parameter list
# Usage:
#   CUDA_SELECT_NVCC_ARCH_FLAGS(out_variable [list of CUDA compute archs])
#
#   out_variable - name of the variable that receives the list of "-gencode"
#                  flags in the caller's scope; ${out_variable}_readable
#                  receives a space separated summary (sm_XX / compute_XX).
#   remaining    - "Auto" (default when nothing is given), "Common", "All", or
#                  a list of entries of the form NAME, NUM.NUM,
#                  NUM.NUM(NUM.NUM), each optionally suffixed with "+PTX".
#
# For a "BIN(VIRTUAL)" entry the binary is compiled for sm_BIN from
# compute_VIRTUAL; with "+PTX" the PTX is emitted for VIRTUAL.
# Unknown names are reported with message(SEND_ERROR).
function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
  set(CUDA_ARCH_LIST "${ARGN}")

  # CMake passes every unquoted "(" and ")" as an argument of its own, so an
  # unquoted 5.2(5.0) arrives as "5.2;(;5.0;)". Glue such groups back together.
  string(REGEX REPLACE ";\\(;([^;()]+);\\)" "(\\1)" CUDA_ARCH_LIST "${CUDA_ARCH_LIST}")

  if("X${CUDA_ARCH_LIST}" STREQUAL "X")
    set(CUDA_ARCH_LIST "Auto")
  endif()

  set(cuda_arch_bin)
  set(cuda_arch_ptx)

  if("${CUDA_ARCH_LIST}" STREQUAL "All")
    set(CUDA_ARCH_LIST ${CUDA_KNOWN_GPU_ARCHITECTURES})
  elseif("${CUDA_ARCH_LIST}" STREQUAL "Common")
    set(CUDA_ARCH_LIST ${CUDA_COMMON_GPU_ARCHITECTURES})
  elseif("${CUDA_ARCH_LIST}" STREQUAL "Auto")
    CUDA_DETECT_INSTALLED_GPUS(CUDA_ARCH_LIST)
    message(STATUS "Autodetected CUDA architecture(s): ${CUDA_ARCH_LIST}")
  endif()

  # Now process the list and look for names
  string(REGEX REPLACE "[ \t]+" ";" CUDA_ARCH_LIST "${CUDA_ARCH_LIST}")
  if(CUDA_ARCH_LIST)
    list(REMOVE_DUPLICATES CUDA_ARCH_LIST)
  endif()

  foreach(arch_name ${CUDA_ARCH_LIST})
    # Reset per-entry state so nothing leaks in from the previous entry.
    set(arch_bin)
    set(arch_ptx)
    set(add_ptx FALSE)

    # Check to see if we are compiling PTX
    if(arch_name MATCHES "(.*)\\+PTX$")
      set(add_ptx TRUE)
      set(arch_name ${CMAKE_MATCH_1})
    endif()

    if(arch_name MATCHES "^([0-9]+\\.[0-9])\\(([0-9]+\\.[0-9])\\)$")
      # BIN(VIRTUAL): keep the pair for the binary, PTX comes from VIRTUAL.
      set(arch_bin "${arch_name}")
      set(arch_ptx "${CMAKE_MATCH_2}")
    elseif(arch_name MATCHES "^([0-9]+\\.[0-9])$")
      set(arch_bin "${CMAKE_MATCH_1}")
      set(arch_ptx "${arch_bin}")
    else()
      # Look for it in our list of known architectures
      if("${arch_name}" STREQUAL "Fermi")
        set(arch_bin 2.0 "2.1(2.0)")
        # There is no compute_21 virtual architecture.
        set(arch_ptx 2.0)
      elseif("${arch_name}" STREQUAL "Kepler+Tegra")
        set(arch_bin 3.2)
      elseif("${arch_name}" STREQUAL "Kepler+Tesla")
        set(arch_bin 3.7)
      elseif("${arch_name}" STREQUAL "Kepler")
        set(arch_bin 3.0 3.5)
        set(arch_ptx 3.5)
      elseif("${arch_name}" STREQUAL "Maxwell+Tegra")
        set(arch_bin 5.3)
      elseif("${arch_name}" STREQUAL "Maxwell")
        set(arch_bin 5.0 5.2)
        set(arch_ptx 5.2)
      elseif("${arch_name}" STREQUAL "Pascal")
        set(arch_bin 6.0 6.1)
        set(arch_ptx 6.1)
      else()
        message(SEND_ERROR "Unknown CUDA Architecture Name ${arch_name} in CUDA_SELECT_NVCC_ARCH_FLAGS")
      endif()
    endif()

    if(NOT arch_bin)
      message(SEND_ERROR "arch_bin wasn't set for some reason")
    endif()
    list(APPEND cuda_arch_bin ${arch_bin})

    if(add_ptx)
      # Names without a dedicated PTX version fall back to their binary list.
      if(NOT arch_ptx)
        set(arch_ptx ${arch_bin})
      endif()
      list(APPEND cuda_arch_ptx ${arch_ptx})
    endif()
  endforeach()

  # remove dots and convert to lists
  string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}")
  string(REGEX REPLACE "\\." "" cuda_arch_ptx "${cuda_arch_ptx}")
  string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}")
  string(REGEX MATCHALL "[0-9]+"   cuda_arch_ptx "${cuda_arch_ptx}")

  if(cuda_arch_bin)
    list(REMOVE_DUPLICATES cuda_arch_bin)
  endif()
  if(cuda_arch_ptx)
    list(REMOVE_DUPLICATES cuda_arch_ptx)
  endif()

  set(nvcc_flags "")
  set(nvcc_archs_readable "")

  # Tell NVCC to add binaries for the specified GPUs
  foreach(arch ${cuda_arch_bin})
    if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
      # User explicitly specified PTX for the concrete BIN
      list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
      list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
    else()
      # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
      list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
      list(APPEND nvcc_archs_readable sm_${arch})
    endif()
  endforeach()

  # Tell NVCC to add PTX intermediate code for the specified architectures
  foreach(arch ${cuda_arch_ptx})
    list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch})
    list(APPEND nvcc_archs_readable compute_${arch})
  endforeach()

  string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}")
  set(${out_variable}          ${nvcc_flags}          PARENT_SCOPE)
  set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE)
endfunction()