Cycles: Restore cycles_cubin_cc to working order
authorRay Molenkamp <github@lazydodo.com>
Thu, 26 Mar 2020 17:41:44 +0000 (11:41 -0600)
committerRay Molenkamp <github@lazydodo.com>
Thu, 26 Mar 2020 17:41:44 +0000 (11:41 -0600)
Reviewed by: brecht pmoursnv
Differential Revision: https://developer.blender.org/D7136

intern/cycles/CMakeLists.txt
intern/cycles/app/cycles_cubin_cc.cpp
intern/cycles/kernel/CMakeLists.txt
intern/cycles/kernel/kernel_compat_cuda.h
intern/cycles/kernel/kernel_compat_optix.h
intern/cycles/util/util_static_assert.h

index 024a94ea437fd8d410d5ed81bbac3ee0629c669e..121c8bdad6ef03dab8ae5949f17d5f9aaf71b577 100644 (file)
@@ -313,9 +313,7 @@ if(WITH_CYCLES_CUDA_BINARIES AND (NOT WITH_CYCLES_CUBIN_COMPILER))
       set(MAX_MSVC 1910)
     elseif(${CUDA_VERSION} EQUAL "9.1")
       set(MAX_MSVC 1911)
-    elseif(${CUDA_VERSION} EQUAL "10.0")
-      set(MAX_MSVC 1999)
-    elseif(${CUDA_VERSION} EQUAL "10.1")
+    elseif(${CUDA_VERSION} LESS "11.0")
       set(MAX_MSVC 1999)
     endif()
     if(NOT MSVC_VERSION LESS ${MAX_MSVC} OR CMAKE_C_COMPILER_ID MATCHES "Clang")
@@ -332,7 +330,7 @@ if(WITH_CYCLES_CUDA_BINARIES AND (NOT WITH_CYCLES_CUBIN_COMPILER))
 endif()
 
 # NVRTC gives wrong rendering result in CUDA 10.0, so we must use NVCC.
-if(WITH_CYCLES_CUDA_BINARIES AND WITH_CYCLES_CUBIN_COMPILER)
+if(WITH_CYCLES_CUDA_BINARIES AND WITH_CYCLES_CUBIN_COMPILER AND NOT WITH_CYCLES_CUBIN_COMPILER_OVERRRIDE)
   if(NOT (${CUDA_VERSION} VERSION_LESS 10.0))
     message(STATUS "cycles_cubin_cc not supported for CUDA 10.0+, using nvcc instead.")
     set(WITH_CYCLES_CUBIN_COMPILER OFF)
index 9d4d52b34acdf1e76fb97cb6713a024e8589b775..7631cb9bed569bb174da1e2e08cc079efec61ac8 100644 (file)
@@ -43,7 +43,8 @@ template<typename T> std::string to_string(const T &n)
 
 class CompilationSettings {
  public:
-  CompilationSettings() : target_arch(0), bits(64), verbose(false), fast_math(false)
+  CompilationSettings()
+      : target_arch(0), bits(64), verbose(false), fast_math(false), ptx_only(false)
   {
   }
 
@@ -57,12 +58,13 @@ class CompilationSettings {
   int bits;
   bool verbose;
   bool fast_math;
+  bool ptx_only;
 };
 
 static bool compile_cuda(CompilationSettings &settings)
 {
-  const char *headers[] = {"stdlib.h", "float.h", "math.h", "stdio.h"};
-  const char *header_content[] = {"\n", "\n", "\n", "\n"};
+  const char *headers[] = {"stdlib.h", "float.h", "math.h", "stdio.h", "stddef.h"};
+  const char *header_content[] = {"\n", "\n", "\n", "\n", "\n"};
 
   printf("Building %s\n", settings.input_file.c_str());
 
@@ -83,6 +85,8 @@ static bool compile_cuda(CompilationSettings &settings)
   options.push_back("-D__KERNEL_CUDA_VERSION__=" + std::to_string(cuewNvrtcVersion()));
   options.push_back("-arch=compute_" + std::to_string(settings.target_arch));
   options.push_back("--device-as-default-execution-space");
+  options.push_back("-DCYCLES_CUBIN_CC");
+  options.push_back("--std=c++11");
   if (settings.fast_math)
     options.push_back("--use_fast_math");
 
@@ -134,10 +138,14 @@ static bool compile_cuda(CompilationSettings &settings)
     fprintf(stderr, "Error: nvrtcGetPTX failed (%d)\n\n", (int)result);
     return false;
   }
-
-  /* Write a file in the temp folder with the ptx code. */
-  settings.ptx_file = OIIO::Filesystem::temp_directory_path() + "/" +
-                      OIIO::Filesystem::unique_path();
+  if (settings.ptx_only) {
+    settings.ptx_file = settings.output_file;
+  }
+  else {
+    /* Write a file in the temp folder with the ptx code. */
+    settings.ptx_file = OIIO::Filesystem::temp_directory_path() + "/" +
+                        OIIO::Filesystem::unique_path();
+  }
   FILE *f = fopen(settings.ptx_file.c_str(), "wb");
   fwrite(&ptx_code[0], 1, ptx_size, f);
   fclose(f);
@@ -249,6 +257,9 @@ static bool parse_parameters(int argc, const char **argv, CompilationSettings &s
              "-D %L",
              &settings.defines,
              "Add additional defines",
+             "-ptx",
+             &settings.ptx_only,
+             "emit PTX code",
              "-v",
              &settings.verbose,
              "Use verbose logging",
@@ -303,8 +314,10 @@ int main(int argc, const char **argv)
     exit(EXIT_FAILURE);
   }
 
-  if (!link_ptxas(settings)) {
-    exit(EXIT_FAILURE);
+  if (!settings.ptx_only) {
+    if (!link_ptxas(settings)) {
+      exit(EXIT_FAILURE);
+    }
   }
 
   return 0;
index 0dd0da65f822c007e0db7fa571f6c9925046276b..3264b5afea2d3adfd11ac9871b55e488a6af934e 100644 (file)
@@ -452,7 +452,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
       endif()
 
       add_custom_command(
-        OUTPUT ${cuda_cubin}
+        OUTPUT ${cuda_file}
         COMMAND ${CUBIN_CC_ENV}
             "$<TARGET_FILE:cycles_cubin_cc>"
             -target ${CUDA_ARCH}
@@ -461,7 +461,6 @@ if(WITH_CYCLES_CUDA_BINARIES)
             -v
             -cuda-toolkit-dir "${CUDA_TOOLKIT_ROOT_DIR}"
         DEPENDS ${kernel_sources} cycles_cubin_cc)
-      set(cuda_file ${cuda_cubin})
     else()
       add_custom_command(
         OUTPUT ${cuda_file}
@@ -517,7 +516,6 @@ if(WITH_CYCLES_DEVICE_OPTIX)
       -I "${OPTIX_INCLUDE_DIR}"
       -I "${CMAKE_CURRENT_SOURCE_DIR}/.."
       -I "${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda"
-      -arch=sm_30
       --use_fast_math
       -o ${output})
 
@@ -525,25 +523,62 @@ if(WITH_CYCLES_DEVICE_OPTIX)
       set(cuda_flags ${cuda_flags}
         -D __KERNEL_DEBUG__)
     endif()
+    if(WITH_CYCLES_CUBIN_COMPILER)
 
-    add_custom_command(
-      OUTPUT
-        ${output}
-      DEPENDS
-        ${input}
-        ${SRC_HEADERS}
-        ${SRC_KERNELS_CUDA_HEADERS}
-        ${SRC_KERNELS_OPTIX_HEADERS}
-        ${SRC_BVH_HEADERS}
-        ${SRC_SVM_HEADERS}
-        ${SRC_GEOM_HEADERS}
-        ${SRC_CLOSURE_HEADERS}
-        ${SRC_UTIL_HEADERS}
-      COMMAND
-        ${CUDA_NVCC_EXECUTABLE} --ptx ${cuda_flags} ${input}
-      WORKING_DIRECTORY
-        "${CMAKE_CURRENT_SOURCE_DIR}")
+      # Needed to find libnvrtc-builtins.so. Can't do it from inside
+      # cycles_cubin_cc since the env variable is read before main()
+      if(APPLE)
+        set(CUBIN_CC_ENV ${CMAKE_COMMAND}
+          -E env DYLD_LIBRARY_PATH="${CUDA_TOOLKIT_ROOT_DIR}/lib")
+      elseif(UNIX)
+        set(CUBIN_CC_ENV ${CMAKE_COMMAND}
+          -E env LD_LIBRARY_PATH="${CUDA_TOOLKIT_ROOT_DIR}/lib64")
+      endif()
 
+      add_custom_command(
+        OUTPUT ${output}
+        DEPENDS
+          ${input}
+          ${SRC_HEADERS}
+          ${SRC_KERNELS_CUDA_HEADERS}
+          ${SRC_KERNELS_OPTIX_HEADERS}
+          ${SRC_BVH_HEADERS}
+          ${SRC_SVM_HEADERS}
+          ${SRC_GEOM_HEADERS}
+          ${SRC_CLOSURE_HEADERS}
+          ${SRC_UTIL_HEADERS}
+        COMMAND ${CUBIN_CC_ENV}
+            "$<TARGET_FILE:cycles_cubin_cc>"
+            -target 30
+            -ptx
+            -i ${CMAKE_CURRENT_SOURCE_DIR}/${input}
+            ${cuda_flags}
+            -v
+            -cuda-toolkit-dir "${CUDA_TOOLKIT_ROOT_DIR}"
+        DEPENDS ${kernel_sources} cycles_cubin_cc)
+    else()
+      add_custom_command(
+        OUTPUT
+          ${output}
+        DEPENDS
+          ${input}
+          ${SRC_HEADERS}
+          ${SRC_KERNELS_CUDA_HEADERS}
+          ${SRC_KERNELS_OPTIX_HEADERS}
+          ${SRC_BVH_HEADERS}
+          ${SRC_SVM_HEADERS}
+          ${SRC_GEOM_HEADERS}
+          ${SRC_CLOSURE_HEADERS}
+          ${SRC_UTIL_HEADERS}
+        COMMAND
+          ${CUDA_NVCC_EXECUTABLE}
+          --ptx
+          -arch=sm_30
+          ${cuda_flags}
+          ${input}
+        WORKING_DIRECTORY
+          "${CMAKE_CURRENT_SOURCE_DIR}")
+    endif()
     list(APPEND optix_ptx ${output})
 
     delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${output}" ${CYCLES_INSTALL_PATH}/lib)
index 4f508d7cdaac34527ac656473d17a728536e07b6..3c5a10540d5c4f429cb73644235d3e567e45ddd6 100644 (file)
@@ -37,8 +37,11 @@ typedef unsigned long long uint64_t;
 typedef unsigned short half;
 typedef unsigned long long CUtexObject;
 
-#define FLT_MIN 1.175494350822287507969e-38f
-#define FLT_MAX 340282346638528859811704183484516925440.0f
+#ifdef CYCLES_CUBIN_CC
+#  define FLT_MIN 1.175494350822287507969e-38f
+#  define FLT_MAX 340282346638528859811704183484516925440.0f
+#  define FLT_EPSILON 1.192092896e-07F
+#endif
 
 __device__ half __float2half(const float f)
 {
index 61b9d87a020c5b8939bb5519cff50a6406091796..7068acc3a322562989b09068e1e610d7ec79f35f 100644 (file)
@@ -35,9 +35,11 @@ typedef unsigned int uint32_t;
 typedef unsigned long long uint64_t;
 typedef unsigned short half;
 typedef unsigned long long CUtexObject;
-
-#define FLT_MIN 1.175494350822287507969e-38f
-#define FLT_MAX 340282346638528859811704183484516925440.0f
+#ifdef CYCLES_CUBIN_CC
+#  define FLT_MIN 1.175494350822287507969e-38f
+#  define FLT_MAX 340282346638528859811704183484516925440.0f
+#  define FLT_EPSILON 1.192092896e-07F
+#endif
 
 __device__ half __float2half(const float f)
 {
index 2dca1d64a76045764777465fe7a01080022306ef..d809f2e06d79659cdd67b35709ba79e01d48f951 100644 (file)
@@ -24,9 +24,9 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifdef __KERNEL_OPENCL__
+#if defined(__KERNEL_OPENCL__) || defined(CYCLES_CUBIN_CC)
 #  define static_assert(statement, message)
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_OPENCL__ */
 
 #define static_assert_align(st, align) \
   static_assert((sizeof(st) % (align) == 0), "Structure must be strictly aligned")  // NOLINT