Cycles: prepare to make CUDA 5.0 the official version we use
author Brecht Van Lommel <brechtvanlommel@pandora.be>
Wed, 19 Jun 2013 17:54:23 +0000 (17:54 +0000)
committer Brecht Van Lommel <brechtvanlommel@pandora.be>
Wed, 19 Jun 2013 17:54:23 +0000 (17:54 +0000)
* Add CUDA compiler version detection to cmake/scons/runtime
* Remove noinline in kernel_shader.h and reenable --use_fast_math if CUDA 5.x
  is used, these were workarounds for CUDA 4.2 bugs
* Change max number of registers to 32 for sm 2.x (based on performance tests
  from Martijn Berger and confirmed here), and also for NVidia OpenCL.

Overall it seems that with these changes and the latest CUDA 5.0 download,
performance is as good as or better than the 2.67b release with the scenes and
graphics cards I tested.

intern/cycles/device/device_cuda.cpp
intern/cycles/device/device_opencl.cpp
intern/cycles/kernel/CMakeLists.txt
intern/cycles/kernel/SConscript
intern/cycles/kernel/kernel_jitter.h
intern/cycles/kernel/kernel_shader.h
intern/cycles/util/util_cuda.cpp
intern/cycles/util/util_cuda.h

index f32c6dde63958b7c5c2f6a63598250ff74b33e28..1f96ed0ae839dbf816701d1330bff9939c47e01d 100644 (file)
@@ -271,11 +271,53 @@ public:
                        return "";
                }
 
+               int cuda_version = cuCompilerVersion();
+
+               if(cuda_version == 0) {
+                       cuda_error_message("CUDA nvcc compiler version could not be parsed.");
+                       return "";
+               }
+
+               if(cuda_version != 50)
+                       printf("CUDA version %d.%d detected, build may succeed but only CUDA 5.0 is officially supported.\n", cuda_version/10, cuda_version%10);
+
                /* compile */
                string kernel = path_join(kernel_path, "kernel.cu");
                string include = kernel_path;
                const int machine = system_cpu_bits();
-               const int maxreg = 24;
+               string arch_flags;
+
+               /* build flags depending on CUDA version and arch */
+               if(cuda_version < 50) {
+                       /* CUDA 4.x */
+                       if(major == 1) {
+                               /* sm_1x */
+                               arch_flags = "--maxrregcount=24 --opencc-options -OPT:Olimit=0";
+                       }
+                       else if(major == 2) {
+                               /* sm_2x */
+                               arch_flags = "--maxrregcount=24";
+                       }
+                       else {
+                               /* sm_3x */
+                               arch_flags = "--maxrregcount=32";
+                       }
+               }
+               else {
+                       /* CUDA 5.x */
+                       if(major == 1) {
+                               /* sm_1x */
+                               arch_flags = "--maxrregcount=24 --opencc-options -OPT:Olimit=0 --use_fast_math";
+                       }
+                       else if(major == 2) {
+                               /* sm_2x */
+                               arch_flags = "--maxrregcount=32 --use_fast_math";
+                       }
+                       else {
+                               /* sm_3x */
+                               arch_flags = "--maxrregcount=32 --use_fast_math";
+                       }
+               }
 
                double starttime = time_dt();
                printf("Compiling CUDA kernel ...\n");
@@ -283,8 +325,10 @@ public:
                path_create_directories(cubin);
 
                string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" "
-                       "-o \"%s\" --ptxas-options=\"-v\" --maxrregcount=%d --opencc-options -OPT:Olimit=0 -I\"%s\" -DNVCC",
-                       nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), maxreg, include.c_str());
+                       "-o \"%s\" --ptxas-options=\"-v\" %s -I\"%s\" -DNVCC -D__KERNEL_CUDA_VERSION__=%d",
+                       nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), arch_flags.c_str(), include.c_str(), cuda_version);
+
+               printf("%s\n", command.c_str());
 
                if(system(command.c_str()) == -1) {
                        cuda_error_message("Failed to execute compilation command, see console for details.");
index 0fb5c7abafb7b16d2123a7901eb711c47c746f03..2ee4ffaca1721bc1876cf2f37b012d7e44530092 100644 (file)
@@ -85,7 +85,7 @@ static string opencl_kernel_build_options(const string& platform, const string *
        string build_options = " -cl-fast-relaxed-math ";
 
        if(platform == "NVIDIA CUDA")
-               build_options += "-D__KERNEL_OPENCL_NVIDIA__ -cl-nv-maxrregcount=24 -cl-nv-verbose ";
+               build_options += "-D__KERNEL_OPENCL_NVIDIA__ -cl-nv-maxrregcount=32 -cl-nv-verbose ";
 
        else if(platform == "Apple")
                build_options += "-D__KERNEL_OPENCL_APPLE__ -Wno-missing-prototypes ";
index 8b4466863e0035d164378509a6708838a59ee43f..5e9dd15b812e28b93fa1b0e1adb512b1344835ad 100644 (file)
@@ -117,32 +117,68 @@ set(SRC_UTIL_HEADERS
 # CUDA module
 
 if(WITH_CYCLES_CUDA_BINARIES)
+       # 32 bit or 64 bit
        if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
                set(CUDA_BITS 64)
        else()
                set(CUDA_BITS 32)
        endif()
 
+       # CUDA version
+       execute_process (COMMAND ${CUDA_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT)
+       string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\1" CUDA_VERSION_MAJOR ${NVCC_OUT})
+       string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR ${NVCC_OUT})
+       set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}")
+
+       # build for each arch
        set(cuda_sources kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS})
        set(cuda_cubins)
 
        foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
                set(cuda_cubin kernel_${arch}.cubin)
 
-               if(${arch} MATCHES "sm_1[0-9]")
-                       # sm_1x
-                       set(cuda_arch_flags "--maxrregcount=24 --opencc-options -OPT:Olimit=0")
-               elseif(${arch} MATCHES "sm_2[0-9]")
-                       # sm_2x
-                       set(cuda_arch_flags "--maxrregcount=24")
+               set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${CUDA_VERSION}")
+
+               # warn for other versions
+               if(CUDA_VERSION MATCHES "50")
+               else()
+                       message(STATUS "CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, build may succeed but only CUDA 5.0 is officially supported")
+               endif()
+
+               # build flags depending on CUDA version and arch
+               if(CUDA_VERSION LESS 50)
+                       # CUDA 4.x
+                       if(${arch} MATCHES "sm_1[0-9]")
+                               # sm_1x
+                               set(cuda_arch_flags "--maxrregcount=24 --opencc-options -OPT:Olimit=0")
+                       elseif(${arch} MATCHES "sm_2[0-9]")
+                               # sm_2x
+                               set(cuda_arch_flags "--maxrregcount=24")
+                       else()
+                               # sm_3x
+                               set(cuda_arch_flags "--maxrregcount=32")
+                       endif()
+
+                       set(cuda_math_flags "")
                else()
-                       # sm_3x
-                       set(cuda_arch_flags "--maxrregcount=32")
+                       # CUDA 5.x
+                       if(${arch} MATCHES "sm_1[0-9]")
+                               # sm_1x
+                               set(cuda_arch_flags "--maxrregcount=24 --opencc-options -OPT:Olimit=0")
+                       elseif(${arch} MATCHES "sm_2[0-9]")
+                               # sm_2x
+                               set(cuda_arch_flags "--maxrregcount=32")
+                       else()
+                               # sm_3x
+                               set(cuda_arch_flags "--maxrregcount=32")
+                       endif()
+
+                       set(cuda_math_flags "--use_fast_math")
                endif()
                
                add_custom_command(
                        OUTPUT ${cuda_cubin}
-                       COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} -m${CUDA_BITS} --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" ${cuda_arch_flags} -I${CMAKE_CURRENT_SOURCE_DIR}/../util -I${CMAKE_CURRENT_SOURCE_DIR}/svm -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC
+                       COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} -m${CUDA_BITS} --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" ${cuda_arch_flags} ${cuda_version_flags} ${cuda_math_flags} -I${CMAKE_CURRENT_SOURCE_DIR}/../util -I${CMAKE_CURRENT_SOURCE_DIR}/svm -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC
                        DEPENDS ${cuda_sources})
 
                delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
index 3a46d10dee190051d8ee5a2e0db928fb8a185843..353ec1ce9d842f8da04dcfabc08558abd710a117 100644 (file)
@@ -25,6 +25,8 @@
 #
 # ***** END GPL LICENSE BLOCK *****
 
+import re
+import subprocess
 import sys
 import os
 import Blender as B
@@ -60,10 +62,19 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
     svm_dir = os.path.join(source_dir, "../svm")
     closure_dir = os.path.join(source_dir, "../closure")
 
+    # get CUDA version
+    nvcc_pipe = subprocess.Popen([nvcc, "--version"],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+    output, erroroutput = nvcc_pipe.communicate()
+    cuda_major_minor = re.findall(r'release (\d+).(\d+)', output)[0]
+    cuda_version = int(cuda_major_minor[0])*10 + int(cuda_major_minor[1])
+
+    if cuda_version != 50:
+        print("CUDA version %d.%d detected, build may succeed but only CUDA 5.0 is officially supported." % (cuda_version/10, cuda_version%10))
+
     # nvcc flags
     nvcc_flags = "-m%s" % (bits)
-    nvcc_flags += " --cubin --ptxas-options=\"-v\" --maxrregcount=24"
-    nvcc_flags += " --opencc-options -OPT:Olimit=0"
+    nvcc_flags += " --cubin --ptxas-options=\"-v\""
+    nvcc_flags += " -D__KERNEL_CUDA_VERSION__=%d" % (cuda_version)
     nvcc_flags += " -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC"
     nvcc_flags += " -I \"%s\" -I \"%s\" -I \"%s\"" % (util_dir, svm_dir, closure_dir)
 
@@ -75,7 +86,31 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
     for arch in cuda_archs:
         cubin_file = os.path.join(build_dir, "kernel_%s.cubin" % arch)
 
-        command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, nvcc_flags, kernel_file, cubin_file)
+        # build flags depending on CUDA version and arch
+        if cuda_version < 50:
+            # CUDA 4.x
+            if arch.startswith("sm_1"):
+                # sm_1x
+                cuda_arch_flags = "--maxrregcount=24 --opencc-options -OPT:Olimit=0"
+            elif arch.startswith("sm_2"):
+                # sm_2x
+                cuda_arch_flags = "--maxrregcount=24"
+            else:
+                # sm_3x
+                cuda_arch_flags = "--maxrregcount=32"
+        else:
+            # CUDA 5.x
+            if arch.startswith("sm_1"):
+                # sm_1x
+                cuda_arch_flags = "--maxrregcount=24 --opencc-options -OPT:Olimit=0 --use_fast_math"
+            elif arch.startswith("sm_2"):
+                # sm_2x
+                cuda_arch_flags = "--maxrregcount=32 --use_fast_math"
+            else:
+                # sm_3x
+                cuda_arch_flags = "--maxrregcount=32 --use_fast_math"
+
+        command = "\"%s\" -arch=%s %s %s \"%s\" -o \"%s\"" % (nvcc, arch, nvcc_flags, cuda_arch_flags, kernel_file, cubin_file)
 
         kernel.Command(cubin_file, 'kernel.cu', command)
         kernel.Depends(cubin_file, dependencies)
index 17704b48cee7c8357e23b9ba0a784830e4dc5d72..3e1a18ab469fdba1a670fa5a3d8437d0dbf99d3d 100644 (file)
@@ -137,7 +137,7 @@ __device_inline float cmj_randfloat(uint i, uint p)
 }
 
 #ifdef __CMJ__
-__device_noinline float cmj_sample_1D(int s, int N, int p)
+__device float cmj_sample_1D(int s, int N, int p)
 {
        uint x = cmj_permute(s, N, p * 0x68bc21eb);
        float jx = cmj_randfloat(s, p * 0x967a889b);
@@ -146,7 +146,7 @@ __device_noinline float cmj_sample_1D(int s, int N, int p)
        return (x + jx)*invN;
 }
 
-__device_noinline void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
+__device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
 {
        int m = float_to_int(sqrtf(N));
        int n = (N + m - 1)/m;
index 7e41ee35ae044f0fcb06adbf5ec6965b62a73441..039981a031a4c0ecf4fcbd94d9d450ee23b756c1 100644 (file)
@@ -38,7 +38,12 @@ CCL_NAMESPACE_BEGIN
 /* ShaderData setup from incoming ray */
 
 #ifdef __OBJECT_MOTION__
-__device_noinline void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time)
+#if defined(__KERNEL_CUDA_VERSION__) && __KERNEL_CUDA_VERSION__ <= 42
+__device_noinline
+#else
+__device
+#endif
+void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time)
 {
        /* note that this is a separate non-inlined function to work around crash
         * on CUDA sm 2.0, otherwise kernel execution crashes (compiler bug?) */
@@ -53,7 +58,12 @@ __device_noinline void shader_setup_object_transforms(KernelGlobals *kg, ShaderD
 }
 #endif
 
-__device_noinline void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
+#if defined(__KERNEL_CUDA_VERSION__) && __KERNEL_CUDA_VERSION__ <= 42
+__device_noinline
+#else
+__device
+#endif
+void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
        const Intersection *isect, const Ray *ray)
 {
 #ifdef __INSTANCING__
@@ -260,7 +270,12 @@ __device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderData
 
 /* ShaderData setup from position sampled on mesh */
 
-__device_noinline void shader_setup_from_sample(KernelGlobals *kg, ShaderData *sd,
+#if defined(__KERNEL_CUDA_VERSION__) && __KERNEL_CUDA_VERSION__ <= 42
+__device_noinline
+#else
+__device
+#endif
+void shader_setup_from_sample(KernelGlobals *kg, ShaderData *sd,
        const float3 P, const float3 Ng, const float3 I,
        int shader, int object, int prim, float u, float v, float t, float time, int segment)
 {
index 6c9ee7c548ff790232649b35e11beed1efb05231..42ffb04a79355afac09d75f1429d30b8b622fc68 100644 (file)
@@ -16,6 +16,8 @@
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  */
 
+#include <iostream>
+
 #include <stdlib.h>
 #include <stdio.h>
 
 #include "util_path.h"
 #include "util_string.h"
 
+#ifdef _WIN32
+#define popen _popen
+#define pclose _pclose
+#endif
+
 /* function definitions */
 
 tcuInit *cuInit;
@@ -399,7 +406,15 @@ string cuCompilerPath()
        const char *defaultpaths[] = {"C:/CUDA/bin", NULL};
        const char *executable = "nvcc.exe";
 #else
-       const char *defaultpaths[] = {"/Developer/NVIDIA/CUDA-4.2/bin", "/usr/local/cuda-4.2/bin", "/usr/local/cuda/bin", NULL};
+       const char *defaultpaths[] = {
+               "/Developer/NVIDIA/CUDA-5.0/bin",
+               "/usr/local/cuda-5.0/bin",
+               "/usr/local/cuda/bin",
+               "/Developer/NVIDIA/CUDA-4.2/bin",
+               "/usr/local/cuda-4.2/bin", 
+               "/Developer/NVIDIA/CUDA-5.5/bin",
+               "/usr/local/cuda-5.5/bin",
+               NULL};
        const char *executable = "nvcc";
 #endif
 
@@ -437,5 +452,46 @@ string cuCompilerPath()
        return "";
 }
 
+/* Query the version of the nvcc compiler located by cuCompilerPath().
+ *
+ * Runs the compiler with --version and parses the number that follows the
+ * "Cuda compilation tools, release " marker in its output.
+ *
+ * Returns 10*major + minor (e.g. 50 for CUDA 5.0), or 0 when no compiler was
+ * found, it could not be run, or the version could not be parsed. */
+int cuCompilerVersion()
+{
+       string path = cuCompilerPath();
+       if(path == "")
+               return 0;
+
+       /* get --version output, quoting the path so that install locations
+        * containing spaces still run */
+       string command = "\"" + path + "\" --version";
+       FILE *pipe = popen(command.c_str(), "r");
+       if(!pipe) {
+               fprintf(stderr, "CUDA: failed to run compiler to retrieve version\n");
+               return 0;
+       }
+
+       char buf[128];
+       string output = "";
+
+       /* loop on the fgets() result rather than feof(): a read error never
+        * sets the end-of-file flag and would otherwise spin forever */
+       while(fgets(buf, sizeof(buf), pipe) != NULL)
+               output += buf;
+
+       pclose(pipe);
+
+       /* parse version number */
+       string marker = "Cuda compilation tools, release ";
+       size_t offset = output.find(marker);
+       if(offset == string::npos) {
+               fprintf(stderr, "CUDA: failed to find version number in:\n\n%s\n", output.c_str());
+               return 0;
+       }
+
+       /* everything after the marker starts with "<major>.<minor>" */
+       string versionstr = output.substr(offset + marker.size(), string::npos);
+       int major, minor;
+
+       if(sscanf(versionstr.c_str(), "%d.%d", &major, &minor) < 2) {
+               fprintf(stderr, "CUDA: failed to parse version number from:\n\n%s\n", output.c_str());
+               return 0;
+       }
+
+       return 10*major + minor;
+}
+
 CCL_NAMESPACE_END
 
index 55feb3f84cb4ab06cf17b5693694171cfb46dcd2..ba4df0de90ae7ca63af4aa79bf4e1a26d5cfc8a2 100644 (file)
@@ -32,6 +32,7 @@ CCL_NAMESPACE_BEGIN
 bool cuLibraryInit();
 bool cuHavePrecompiledKernels();
 string cuCompilerPath();
+int cuCompilerVersion();
 
 CCL_NAMESPACE_END