Merge remote-tracking branch 'origin/master' into blender2.8
authorDalai Felinto <dfelinto@gmail.com>
Mon, 3 Oct 2016 20:54:22 +0000 (20:54 +0000)
committerDalai Felinto <dfelinto@gmail.com>
Mon, 3 Oct 2016 20:54:22 +0000 (20:54 +0000)
24 files changed:
CMakeLists.txt
build_files/buildbot/slave_compile.py
intern/cycles/kernel/CMakeLists.txt
intern/cycles/kernel/bvh/bvh.h
intern/cycles/kernel/bvh/bvh_nodes.h
intern/cycles/kernel/bvh/bvh_traversal.h
intern/cycles/kernel/bvh/bvh_types.h
intern/cycles/kernel/closure/bsdf.h
intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
intern/cycles/kernel/closure/bsdf_microfacet.h
intern/cycles/kernel/closure/bsdf_microfacet_multi.h
intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
intern/cycles/kernel/closure/bssrdf.h
intern/cycles/kernel/geom/geom_curve.h
intern/cycles/kernel/kernel_accumulate.h
intern/cycles/kernel/kernel_compat_cuda.h
intern/cycles/kernel/kernel_compat_opencl.h
intern/cycles/kernel/kernel_path.h
intern/cycles/kernel/kernel_path_branched.h
intern/cycles/kernel/kernel_random.h
intern/cycles/kernel/kernel_shadow.h
intern/cycles/kernel/osl/osl_services.cpp
intern/cycles/util/util_types.h
intern/elbeem/intern/solver_main.cpp

index e061fab6b52ee75411c25f6f05f728616358565b..9331418e94e5f2532e347329a8ca295710e57c27 100644 (file)
@@ -404,7 +404,7 @@ option(WITH_CYCLES_STANDALONE_GUI   "Build Cycles standalone with GUI" OFF)
 option(WITH_CYCLES_OSL                         "Build Cycles with OSL support" ${_init_CYCLES_OSL})
 option(WITH_CYCLES_OPENSUBDIV          "Build Cycles with OpenSubdiv support" ${_init_CYCLES_OPENSUBDIV})
 option(WITH_CYCLES_CUDA_BINARIES       "Build Cycles CUDA binaries" OFF)
-set(CYCLES_CUDA_BINARIES_ARCH sm_20 sm_21 sm_30 sm_35 sm_37 sm_50 sm_52 CACHE STRING "CUDA architectures to build binaries for")
+set(CYCLES_CUDA_BINARIES_ARCH sm_20 sm_21 sm_30 sm_35 sm_37 sm_50 sm_52 sm_60 sm_61 CACHE STRING "CUDA architectures to build binaries for")
 mark_as_advanced(CYCLES_CUDA_BINARIES_ARCH)
 unset(PLATFORM_DEFAULT)
 option(WITH_CYCLES_LOGGING     "Build Cycles with logging support" ON)
index b156e4be327a6f814ac4f28eac6f90c99aa4756d..c2bfd882fde22e5d01a8ce856fb429ac44336baa 100644 (file)
@@ -72,8 +72,7 @@ if 'cmake' in builder:
         # Set up OSX architecture
         if builder.endswith('x86_64_10_6_cmake'):
             cmake_extra_options.append('-DCMAKE_OSX_ARCHITECTURES:STRING=x86_64')
-        cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE=/usr/local/cuda-hack/bin/nvcc')
-        cmake_extra_options.append('-DCUDA_NVCC8_EXECUTABLE=/usr/local/cuda8-hack/bin/nvcc')
+        cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE=/usr/local/cuda8-hack/bin/nvcc')
         cmake_extra_options.append('-DWITH_CODEC_QUICKTIME=OFF')
         cmake_extra_options.append('-DCMAKE_OSX_DEPLOYMENT_TARGET=10.6')
         build_cubins = False
@@ -94,8 +93,7 @@ if 'cmake' in builder:
             elif builder.startswith('win32'):
                 bits = 32
                 cmake_options.extend(['-G', 'Visual Studio 12 2013'])
-        cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE:FILEPATH=C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v7.5/bin/nvcc.exe')
-        cmake_extra_options.append('-DCUDA_NVCC8_EXECUTABLE:FILEPATH=C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/bin/nvcc.exe')
+        cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE:FILEPATH=C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/bin/nvcc.exe')
 
     elif builder.startswith('linux'):
         tokens = builder.split("_")
@@ -115,8 +113,7 @@ if 'cmake' in builder:
             cuda_chroot_name = 'buildbot_' + deb_name + '_x86_64'
             targets = ['player', 'blender', 'cuda']
 
-        cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE=/usr/local/cuda-7.5/bin/nvcc')
-        cmake_extra_options.append('-DCUDA_NVCC8_EXECUTABLE=/usr/local/cuda-8.0/bin/nvcc')
+        cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE=/usr/local/cuda-8.0/bin/nvcc')
 
     cmake_options.append("-C" + os.path.join(blender_dir, cmake_config_file))
 
index 06089b4014ad6386e3bcdcb56bf223e441a0f8aa..e4341c8aca172c915da81d13d192d9cc9527db3b 100644 (file)
@@ -215,11 +215,11 @@ if(WITH_CYCLES_CUDA_BINARIES)
        set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}")
 
        # warn for other versions
-       if(CUDA_VERSION MATCHES "75")
+       if(CUDA_VERSION MATCHES "80")
        else()
                message(WARNING
                        "CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, "
-                       "build may succeed but only CUDA 7.5 is officially supported")
+                       "build may succeed but only CUDA 8.0 is officially supported")
        endif()
 
        # build for each arch
@@ -251,11 +251,6 @@ if(WITH_CYCLES_CUDA_BINARIES)
                set(cuda_nvcc_command ${CUDA_NVCC_EXECUTABLE})
                set(cuda_nvcc_version ${CUDA_VERSION})
 
-               if(DEFINED CUDA_NVCC8_EXECUTABLE  AND ((${arch} STREQUAL "sm_60") OR (${arch} STREQUAL "sm_61")))
-                       set(cuda_nvcc_command ${CUDA_NVCC8_EXECUTABLE})
-                       set(cuda_nvcc_version "80")
-               endif()
-
                set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
                set(cuda_math_flags "--use_fast_math")
 
index 7cecee793c1263595732e9df678744eabe9eff9f..3679898265363ef798cdfde732b82b1b74fc7cdf 100644 (file)
@@ -157,8 +157,9 @@ CCL_NAMESPACE_BEGIN
 #undef BVH_NAME_EVAL
 #undef BVH_FUNCTION_FULL_NAME
 
+/* Note: ray is passed by value to work around a possible CUDA compiler bug. */
 ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
-                                          const Ray *ray,
+                                          const Ray ray,
                                           const uint visibility,
                                           Intersection *isect,
                                           uint *lcg_state,
@@ -169,32 +170,32 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
        if(kernel_data.bvh.have_motion) {
 #  ifdef __HAIR__
                if(kernel_data.bvh.have_curves)
-                       return bvh_intersect_hair_motion(kg, ray, isect, visibility, lcg_state, difl, extmax);
+                       return bvh_intersect_hair_motion(kg, &ray, isect, visibility, lcg_state, difl, extmax);
 #  endif /* __HAIR__ */
 
-               return bvh_intersect_motion(kg, ray, isect, visibility);
+               return bvh_intersect_motion(kg, &ray, isect, visibility);
        }
 #endif /* __OBJECT_MOTION__ */
 
 #ifdef __HAIR__
        if(kernel_data.bvh.have_curves)
-               return bvh_intersect_hair(kg, ray, isect, visibility, lcg_state, difl, extmax);
+               return bvh_intersect_hair(kg, &ray, isect, visibility, lcg_state, difl, extmax);
 #endif /* __HAIR__ */
 
 #ifdef __KERNEL_CPU__
 
 #  ifdef __INSTANCING__
        if(kernel_data.bvh.have_instancing)
-               return bvh_intersect_instancing(kg, ray, isect, visibility);
+               return bvh_intersect_instancing(kg, &ray, isect, visibility);
 #  endif /* __INSTANCING__ */
 
-       return bvh_intersect(kg, ray, isect, visibility);
+       return bvh_intersect(kg, &ray, isect, visibility);
 #else /* __KERNEL_CPU__ */
 
 #  ifdef __INSTANCING__
-       return bvh_intersect_instancing(kg, ray, isect, visibility);
+       return bvh_intersect_instancing(kg, &ray, isect, visibility);
 #  else
-       return bvh_intersect(kg, ray, isect, visibility);
+       return bvh_intersect(kg, &ray, isect, visibility);
 #  endif /* __INSTANCING__ */
 
 #endif /* __KERNEL_CPU__ */
index db2275b0ff8f7d619759be9180ca6a4e79f34548..726bef1794cc2d8948af24da6561914e0ed91414 100644 (file)
@@ -16,7 +16,7 @@
 
 // TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and
 // 3-vector which might be faster.
-ccl_device_inline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
+ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
                                                            int node_addr,
                                                            int child)
 {
@@ -30,7 +30,7 @@ ccl_device_inline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
 }
 
 #if !defined(__KERNEL_SSE2__)
-ccl_device_inline int bvh_aligned_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
                                                  const float3 P,
                                                  const float3 idir,
                                                  const float t,
@@ -77,7 +77,7 @@ ccl_device_inline int bvh_aligned_node_intersect(KernelGlobals *kg,
 #endif
 }
 
-ccl_device_inline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
+ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
                                                         const float3 P,
                                                         const float3 idir,
                                                         const float t,
@@ -139,7 +139,7 @@ ccl_device_inline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
 #endif
 }
 
-ccl_device_inline bool bvh_unaligned_node_intersect_child(
+ccl_device_forceinline bool bvh_unaligned_node_intersect_child(
         KernelGlobals *kg,
         const float3 P,
         const float3 dir,
@@ -166,7 +166,7 @@ ccl_device_inline bool bvh_unaligned_node_intersect_child(
        return tnear <= tfar;
 }
 
-ccl_device_inline bool bvh_unaligned_node_intersect_child_robust(
+ccl_device_forceinline bool bvh_unaligned_node_intersect_child_robust(
         KernelGlobals *kg,
         const float3 P,
         const float3 dir,
@@ -202,7 +202,7 @@ ccl_device_inline bool bvh_unaligned_node_intersect_child_robust(
        }
 }
 
-ccl_device_inline int bvh_unaligned_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
                                                    const float3 P,
                                                    const float3 dir,
                                                    const float3 idir,
@@ -232,7 +232,7 @@ ccl_device_inline int bvh_unaligned_node_intersect(KernelGlobals *kg,
        return mask;
 }
 
-ccl_device_inline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
+ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
                                                           const float3 P,
                                                           const float3 dir,
                                                           const float3 idir,
@@ -264,7 +264,7 @@ ccl_device_inline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
        return mask;
 }
 
-ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
                                          const float3 P,
                                          const float3 dir,
                                          const float3 idir,
@@ -295,7 +295,7 @@ ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
        }
 }
 
-ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
+ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
                                                 const float3 P,
                                                 const float3 dir,
                                                 const float3 idir,
@@ -333,7 +333,7 @@ ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
 }
 #else  /* !defined(__KERNEL_SSE2__) */
 
-int ccl_device_inline bvh_aligned_node_intersect(
+int ccl_device_forceinline bvh_aligned_node_intersect(
         KernelGlobals *kg,
         const float3& P,
         const float3& dir,
@@ -377,7 +377,7 @@ int ccl_device_inline bvh_aligned_node_intersect(
 #  endif
 }
 
-int ccl_device_inline bvh_aligned_node_intersect_robust(
+ccl_device_forceinline int bvh_aligned_node_intersect_robust(
         KernelGlobals *kg,
         const float3& P,
         const float3& dir,
@@ -441,7 +441,7 @@ int ccl_device_inline bvh_aligned_node_intersect_robust(
 #  endif
 }
 
-int ccl_device_inline bvh_unaligned_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
                                                    const float3 P,
                                                    const float3 dir,
                                                    const ssef& isect_near,
@@ -502,7 +502,7 @@ int ccl_device_inline bvh_unaligned_node_intersect(KernelGlobals *kg,
 #  endif
 }
 
-int ccl_device_inline bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
+ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
                                                           const float3 P,
                                                           const float3 dir,
                                                           const ssef& isect_near,
@@ -573,7 +573,7 @@ int ccl_device_inline bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
 #  endif
 }
 
-ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
                                          const float3& P,
                                          const float3& dir,
                                          const ssef& isect_near,
@@ -611,7 +611,7 @@ ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
        }
 }
 
-ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
+ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
                                                 const float3& P,
                                                 const float3& dir,
                                                 const ssef& isect_near,
index b1a52968a2623d7f99ada8fb04fe9a3e5c026c9f..a0e478e972b4bdf4ef2010cc5dc9ea34482e4416 100644 (file)
  *
  */
 
-#ifndef __KERNEL_GPU__
-ccl_device
-#else
-ccl_device_inline
-#endif
-bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
-                                 const Ray *ray,
-                                 Intersection *isect,
-                                 const uint visibility
+ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                                     const Ray *ray,
+                                                     Intersection *isect,
+                                                     const uint visibility
 #if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
-                                 , uint *lcg_state,
-                                 float difl,
-                                 float extmax
+                                                     , uint *lcg_state,
+                                                     float difl,
+                                                     float extmax
 #endif
-                                 )
+                                                     )
 {
        /* todo:
         * - test if pushing distance on the stack helps (for non shadow rays)
index 27729046f8d099c85ca33b0dd4e8388ff288946c..c3abe2e157d5821e98ea24f45bdd7bf648180bb1 100644 (file)
@@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Don't inline intersect functions on GPU, this is faster */
 #ifdef __KERNEL_GPU__
-#  define ccl_device_intersect ccl_device_noinline
+#  define ccl_device_intersect ccl_device_forceinline
 #else
 #  define ccl_device_intersect ccl_device_inline
 #endif
index 2ab92badc93806e328ecdfaba4339c7ca5891d4b..7e4d5fe2e377838856abfe16c0fc04457e2bc947 100644 (file)
@@ -36,7 +36,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device_inline int bsdf_sample(KernelGlobals *kg,
+ccl_device_forceinline int bsdf_sample(KernelGlobals *kg,
                                   ShaderData *sd,
                                   const ShaderClosure *sc,
                                   float randu,
@@ -147,7 +147,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
 #ifndef __KERNEL_CUDA__
 ccl_device
 #else
-ccl_device_inline
+ccl_device_forceinline
 #endif
 float3 bsdf_eval(KernelGlobals *kg,
                  ShaderData *sd,
index 9929246ae5cbe4f34101c2ff47d5bc51930650c6..1cd8246aa71fb48d995c4a33ce0e151c7cf6dd66 100644 (file)
@@ -62,7 +62,7 @@ ccl_device_inline float bsdf_ashikhmin_shirley_roughness_to_exponent(float rough
        return 2.0f / (roughness*roughness) - 2.0f;
 }
 
-ccl_device_inline float3 bsdf_ashikhmin_shirley_eval_reflect(
+ccl_device_forceinline float3 bsdf_ashikhmin_shirley_eval_reflect(
         const ShaderClosure *sc,
         const float3 I,
         const float3 omega_in,
index 7c36f05b6cc09a12bb575661ec376e105cff427d..0a8d14a00c2290641b62da04f1b2e76cf1846b96 100644 (file)
@@ -183,7 +183,7 @@ ccl_device_inline void microfacet_ggx_sample_slopes(
        *slope_y = S * z * safe_sqrtf(1.0f + (*slope_x)*(*slope_x));
 }
 
-ccl_device_inline float3 microfacet_sample_stretched(
+ccl_device_forceinline float3 microfacet_sample_stretched(
        KernelGlobals *kg, const float3 omega_i,
        const float alpha_x, const float alpha_y,
        const float randu, const float randv,
index 0a6dd4dcbdff722ee92c45bc67cfc3e43779768c..cea59adfebefc0cb06bf9f91cb2947880f7bbb3d 100644 (file)
@@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN
 /* === GGX Microfacet distribution functions === */
 
 /* Isotropic GGX microfacet distribution */
-ccl_device_inline float D_ggx(float3 wm, float alpha)
+ccl_device_forceinline float D_ggx(float3 wm, float alpha)
 {
        wm.z *= wm.z;
        alpha *= alpha;
@@ -30,7 +30,7 @@ ccl_device_inline float D_ggx(float3 wm, float alpha)
 }
 
 /* Anisotropic GGX microfacet distribution */
-ccl_device_inline float D_ggx_aniso(const float3 wm, const float2 alpha)
+ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha)
 {
        float slope_x = -wm.x/alpha.x;
        float slope_y = -wm.y/alpha.y;
@@ -40,7 +40,7 @@ ccl_device_inline float D_ggx_aniso(const float3 wm, const float2 alpha)
 }
 
 /* Sample slope distribution (based on page 14 of the supplemental implementation). */
-ccl_device_inline float2 mf_sampleP22_11(const float cosI, const float2 randU)
+ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU)
 {
        if(cosI > 0.9999f || cosI < 1e-6f) {
                const float r = sqrtf(randU.x / (1.0f - randU.x));
@@ -78,7 +78,7 @@ ccl_device_inline float2 mf_sampleP22_11(const float cosI, const float2 randU)
 }
 
 /* Visible normal sampling for the GGX distribution (based on page 7 of the supplemental implementation). */
-ccl_device_inline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float2 randU)
+ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float2 randU)
 {
        const float3 wi_11 = normalize(make_float3(alpha.x*wi.x, alpha.y*wi.y, wi.z));
        const float2 slope_11 = mf_sampleP22_11(wi_11.z, randU);
@@ -94,7 +94,7 @@ ccl_device_inline float3 mf_sample_vndf(const float3 wi, const float2 alpha, con
 /* === Phase functions: Glossy, Diffuse and Glass === */
 
 /* Phase function for reflective materials, either without a fresnel term (for compatibility) or with the conductive fresnel term. */
-ccl_device_inline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, float3 *k, float3 *weight, const float3 wm)
+ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, float3 *k, float3 *weight, const float3 wm)
 {
        if(n && k)
                *weight *= fresnel_conductor(dot(wi, wm), *n, *k);
@@ -102,7 +102,7 @@ ccl_device_inline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, floa
        return -wi + 2.0f * wm * dot(wi, wm);
 }
 
-ccl_device_inline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha, float3 *n, float3 *k)
+ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha, float3 *n, float3 *k)
 {
        if(w.z > 0.9999f)
                return make_float3(0.0f, 0.0f, 0.0f);
@@ -132,7 +132,7 @@ ccl_device_inline float3 mf_eval_phase_glossy(const float3 w, const float lambda
 }
 
 /* Phase function for rough lambertian diffuse surfaces. */
-ccl_device_inline float3 mf_sample_phase_diffuse(const float3 wm, const float randu, const float randv)
+ccl_device_forceinline float3 mf_sample_phase_diffuse(const float3 wm, const float randu, const float randv)
 {
        float3 tm, bm;
        make_orthonormals(wm, &tm, &bm);
@@ -141,14 +141,14 @@ ccl_device_inline float3 mf_sample_phase_diffuse(const float3 wm, const float ra
        return disk.x*tm + disk.y*bm + safe_sqrtf(1.0f - disk.x*disk.x - disk.y*disk.y)*wm;
 }
 
-ccl_device_inline float3 mf_eval_phase_diffuse(const float3 w, const float3 wm)
+ccl_device_forceinline float3 mf_eval_phase_diffuse(const float3 w, const float3 wm)
 {
        const float v = max(0.0f, dot(w, wm)) * M_1_PI_F;
        return make_float3(v, v, v);
 }
 
 /* Phase function for dielectric transmissive materials, including both reflection and refraction according to the dielectric fresnel term. */
-ccl_device_inline float3 mf_sample_phase_glass(const float3 wi, const float eta, const float3 wm, const float randV, bool *outside)
+ccl_device_forceinline float3 mf_sample_phase_glass(const float3 wi, const float eta, const float3 wm, const float randV, bool *outside)
 {
        float cosI = dot(wi, wm);
        float f = fresnel_dielectric_cos(cosI, eta);
@@ -162,7 +162,7 @@ ccl_device_inline float3 mf_sample_phase_glass(const float3 wi, const float eta,
        return normalize(wm*(cosI*inv_eta + cosT) - wi*inv_eta);
 }
 
-ccl_device_inline float3 mf_eval_phase_glass(const float3 w, const float lambda, const float3 wo, const bool wo_outside, const float2 alpha, const float eta)
+ccl_device_forceinline float3 mf_eval_phase_glass(const float3 w, const float lambda, const float3 wo, const bool wo_outside, const float2 alpha, const float eta)
 {
        if(w.z > 0.9999f)
                return make_float3(0.0f, 0.0f, 0.0f);
@@ -195,7 +195,7 @@ ccl_device_inline float3 mf_eval_phase_glass(const float3 w, const float lambda,
 /* === Utility functions for the random walks === */
 
 /* Smith Lambda function for GGX (based on page 12 of the supplemental implementation). */
-ccl_device_inline float mf_lambda(const float3 w, const float2 alpha)
+ccl_device_forceinline float mf_lambda(const float3 w, const float2 alpha)
 {
        if(w.z > 0.9999f)
                return 0.0f;
@@ -212,18 +212,18 @@ ccl_device_inline float mf_lambda(const float3 w, const float2 alpha)
 }
 
 /* Height distribution CDF (based on page 4 of the supplemental implementation). */
-ccl_device_inline float mf_invC1(const float h)
+ccl_device_forceinline float mf_invC1(const float h)
 {
        return 2.0f * saturate(h) - 1.0f;
 }
 
-ccl_device_inline float mf_C1(const float h)
+ccl_device_forceinline float mf_C1(const float h)
 {
        return saturate(0.5f * (h + 1.0f));
 }
 
 /* Masking function (based on page 16 of the supplemental implementation). */
-ccl_device_inline float mf_G1(const float3 w, const float C1, const float lambda)
+ccl_device_forceinline float mf_G1(const float3 w, const float C1, const float lambda)
 {
        if(w.z > 0.9999f)
                return 1.0f;
@@ -233,7 +233,7 @@ ccl_device_inline float mf_G1(const float3 w, const float C1, const float lambda
 }
 
 /* Sampling from the visible height distribution (based on page 17 of the supplemental implementation). */
-ccl_device_inline bool mf_sample_height(const float3 w, float *h, float *C1, float *G1, float *lambda, const float U)
+ccl_device_forceinline bool mf_sample_height(const float3 w, float *h, float *C1, float *G1, float *lambda, const float U)
 {
        if(w.z > 0.9999f)
                return false;
@@ -262,14 +262,14 @@ ccl_device_inline bool mf_sample_height(const float3 w, float *h, float *C1, flo
 
 /* Approximation for the albedo of the single-scattering GGX distribution,
  * the missing energy is then approximated as a diffuse reflection for the PDF. */
-ccl_device_inline float mf_ggx_albedo(float r)
+ccl_device_forceinline float mf_ggx_albedo(float r)
 {
        float albedo = 0.806495f*expf(-1.98712f*r*r) + 0.199531f;
        albedo -= ((((((1.76741f*r - 8.43891f)*r + 15.784f)*r - 14.398f)*r + 6.45221f)*r - 1.19722f)*r + 0.027803f)*r + 0.00568739f;
        return saturate(albedo);
 }
 
-ccl_device_inline float mf_ggx_pdf(const float3 wi, const float3 wo, const float alpha)
+ccl_device_forceinline float mf_ggx_pdf(const float3 wi, const float3 wo, const float alpha)
 {
        float D = D_ggx(normalize(wi+wo), alpha);
        float lambda = mf_lambda(wi, make_float2(alpha, alpha));
@@ -277,17 +277,17 @@ ccl_device_inline float mf_ggx_pdf(const float3 wi, const float3 wo, const float
        return 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f) + (1.0f - albedo) * wo.z;
 }
 
-ccl_device_inline float mf_ggx_aniso_pdf(const float3 wi, const float3 wo, const float2 alpha)
+ccl_device_forceinline float mf_ggx_aniso_pdf(const float3 wi, const float3 wo, const float2 alpha)
 {
        return 0.25f * D_ggx_aniso(normalize(wi+wo), alpha) / ((1.0f + mf_lambda(wi, alpha)) * wi.z) + (1.0f - mf_ggx_albedo(sqrtf(alpha.x*alpha.y))) * wo.z;
 }
 
-ccl_device_inline float mf_diffuse_pdf(const float3 wo)
+ccl_device_forceinline float mf_diffuse_pdf(const float3 wo)
 {
        return M_1_PI_F * wo.z;
 }
 
-ccl_device_inline float mf_glass_pdf(const float3 wi, const float3 wo, const float alpha, const float eta)
+ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, const float alpha, const float eta)
 {
        float3 wh;
        float fresnel;
index 6ebe2f6a7519584519176d07d714a21fa3dae1ab..8054fa8e84932e8ba6f8491c8e7b5da1b7cbff1b 100644 (file)
@@ -25,7 +25,7 @@
  * energy is used. In combination with MIS, that is enough to produce an unbiased result, although
  * the balance heuristic isn't necessarily optimal anymore.
  */
-ccl_device_inline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
+ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
         float3 wi,
         float3 wo,
         const bool wo_outside,
@@ -168,7 +168,7 @@ ccl_device_inline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
  * escaped the surface in wo. The function returns the throughput between wi and wo.
  * Without reflection losses due to coloring or fresnel absorption in conductors, the sampling is optimal.
  */
-ccl_device float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint *lcg_state
+ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint *lcg_state
 #ifdef MF_MULTI_GLASS
        , const float eta
 #elif defined(MF_MULTI_GLOSSY)
index 35c95768b69325d9be4891a497d5f598c98d12cc..af0bbd861a9f4679da143c775ff2705ddd9f1207 100644 (file)
@@ -141,7 +141,7 @@ ccl_device float bssrdf_cubic_pdf(const ShaderClosure *sc, float r)
 }
 
 /* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */
-ccl_device_inline float bssrdf_cubic_quintic_root_find(float xi)
+ccl_device_forceinline float bssrdf_cubic_quintic_root_find(float xi)
 {
        /* newton-raphson iteration, usually succeeds in 2-4 iterations, except
         * outside 0.02 ... 0.98 where it can go up to 10, so overall performance
@@ -255,7 +255,7 @@ ccl_device float bssrdf_burley_pdf(const ShaderClosure *sc, float r)
  * Returns scaled radius, meaning the result is to be scaled up by d.
  * Since there's no closed form solution we do Newton-Raphson method to find it.
  */
-ccl_device_inline float bssrdf_burley_root_find(float xi)
+ccl_device_forceinline float bssrdf_burley_root_find(float xi)
 {
        const float tolerance = 1e-6f;
        const int max_iteration_count = 10;
@@ -389,7 +389,7 @@ ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float
                bssrdf_burley_sample(sc, xi, r, h);
 }
 
-ccl_device_inline float bssrdf_pdf(const ShaderClosure *sc, float r)
+ccl_device_forceinline float bssrdf_pdf(const ShaderClosure *sc, float r)
 {
        if(sc->type == CLOSURE_BSSRDF_CUBIC_ID)
                return bssrdf_cubic_pdf(sc, r);
index aa9cd295452a6def2a360de42a0208975d4df2c3..84aaaab7453b86c13bcd9f1b9ad3129ea3cf8a96 100644 (file)
@@ -222,10 +222,10 @@ ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a)
 
 #ifdef __KERNEL_SSE2__
 /* Pass P and dir by reference to aligned vector */
-ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
+ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
        const float3 &P, const float3 &dir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
 #else
-ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
+ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
        float3 P, float3 dir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax)
 #endif
 {
@@ -621,7 +621,7 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
        return hit;
 }
 
-ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
+ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
        float3 P, float3 direction, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
 {
        /* define few macros to minimize code duplication for SSE */
index dd5752f4c91e90ce006fd0226320672737a20663..623c1dcaaa12392caabaf961f6cbca6752f725af 100644 (file)
@@ -54,13 +54,7 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v
        }
 }
 
-/* TODO(sergey): This is just a workaround for annoying 6.5 compiler bug. */
-#if !defined(__KERNEL_CUDA__) || __CUDA_ARCH__ < 500
-ccl_device_inline
-#else
-ccl_device_noinline
-#endif
-void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value)
+ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value)
 {
 #ifdef __PASSES__
        if(eval->use_light_pass) {
index 9a96cb9f4387284201b4bba30cc93acd514c117c..e0c7b17c6a064591b9c67d9224d3f7dd97fff0e8 100644 (file)
@@ -37,6 +37,7 @@
 /* Qualifier wrappers for different names on different devices */
 
 #define ccl_device  __device__ __inline__
+#  define ccl_device_forceinline  __device__ __forceinline__
 #if (__KERNEL_CUDA_VERSION__ == 80) && (__CUDA_ARCH__ < 500)
 #  define ccl_device_inline  __device__ __forceinline__
 #else
index 2ae89dde7c42e806b7865ec7978b779a172600b0..f076e3a7d3769041fcb0f38a3bf4e2a54e3a0548 100644 (file)
@@ -33,6 +33,7 @@
 /* in opencl all functions are device functions, so leave this empty */
 #define ccl_device
 #define ccl_device_inline ccl_device
+#define ccl_device_forceinline ccl_device
 #define ccl_device_noinline ccl_device ccl_noinline
 #define ccl_may_alias
 #define ccl_constant __constant
index 903be4f09a0461facdd4d30709a39e1dc4fe2c6a..7558fb944788fce0c0d6b0c17aec5e57af81a844 100644 (file)
@@ -69,7 +69,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
                Intersection isect;
                uint visibility = path_state_ray_visibility(kg, state);
                bool hit = scene_intersect(kg,
-                                          ray,
+                                          *ray,
                                           visibility,
                                           &isect,
                                           NULL,
@@ -655,9 +655,9 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
                        lcg_state = lcg_state_init(rng, &state, 0x51633e2d);
                }
 
-               bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax);
+               bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
 #else
-               bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f);
+               bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
 #endif
 
 #ifdef __KERNEL_DEBUG__
index e38c1a01f6b57d7684a09d57c6fe8b7c69683807..cdb07db587ac3980fa7b6b37059de74c27e342e4 100644 (file)
@@ -282,9 +282,9 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
                        lcg_state = lcg_state_init(rng, &state, 0x51633e2d);
                }
 
-               bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax);
+               bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
 #else
-               bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f);
+               bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
 #endif
 
 #ifdef __KERNEL_DEBUG__
index b534d9950c552c96749222633fd0a7219b77c9ae..4a76ffddbe71f303acabbb4d2903ff9bb034a655 100644 (file)
@@ -98,7 +98,7 @@ ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, cons
        return index;
 }
 
-ccl_device_inline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension)
+ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension)
 {
 #ifdef __CMJ__
        if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@@ -132,13 +132,7 @@ ccl_device_inline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng,
 #endif
 }
 
-/* Temporary workaround for Pascal cards, otherwise AA does not work properly. */
-#if defined(__KERNEL_GPU__) && __CUDA_ARCH__ >= 600
-__device__ __forceinline__
-#else
-ccl_device_inline
-#endif
-void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
+ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
 {
 #ifdef __CMJ__
        if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@@ -199,7 +193,7 @@ ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG
 
 /* Linear Congruential Generator */
 
-ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension)
+ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension)
 {
        /* implicit mod 2^32 */
        rng = (1103515245*(rng) + 12345);
index 95b57404a77ff8fa0e1f63df48f15a5cdf0fb903..2981f6ac566c52ac1f6d95676ebf4989052dca7e 100644 (file)
@@ -155,7 +155,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd,
        }
        else {
                Intersection isect;
-               blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
+               blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
        }
 
 #ifdef __VOLUME__
@@ -205,7 +205,7 @@ ccl_device_noinline bool shadow_blocked(KernelGlobals *kg,
        Intersection *isect = &isect_object;
 #endif
 
-       bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f);
+       bool blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f);
 
 #ifdef __TRANSPARENT_SHADOWS__
        if(blocked && kernel_data.integrator.transparent_shadows) {
@@ -221,7 +221,7 @@ ccl_device_noinline bool shadow_blocked(KernelGlobals *kg,
                                if(bounce >= kernel_data.integrator.transparent_max_bounce)
                                        return true;
 
-                               if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f))
+                               if(!scene_intersect(kg, *ray, PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f))
                                {
 #ifdef __VOLUME__
                                        /* attenuation for last line segment towards light */
index f61a9ec0fb1b776a167b41817556c23bba0b1bcd..0f3edcb7eaac8f5f52af044b517f680add5a4d54 100644 (file)
@@ -1153,7 +1153,7 @@ bool OSLRenderServices::trace(TraceOpt &options, OSL::ShaderGlobals *sg,
        tracedata->sd.osl_globals = sd->osl_globals;
 
        /* raytrace */
-       return scene_intersect(sd->osl_globals, &ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect, NULL, 0.0f, 0.0f);
+       return scene_intersect(sd->osl_globals, ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect, NULL, 0.0f, 0.0f);
 }
 
 
index 257c6ad749177f032871ac39b702f91319b76d0a..6af65f88a0255685e6f050a00f1853e0d88e3609 100644 (file)
@@ -42,6 +42,7 @@
 
 #if defined(_WIN32) && !defined(FREE_WINDOWS)
 #define ccl_device_inline static __forceinline
+#define ccl_device_forceinline static __forceinline
 #define ccl_align(...) __declspec(align(__VA_ARGS__))
 #ifdef __KERNEL_64_BIT__
 #define ccl_try_align(...) __declspec(align(__VA_ARGS__))
@@ -56,6 +57,7 @@
 #else
 
 #define ccl_device_inline static inline __attribute__((always_inline))
+#define ccl_device_forceinline static inline __attribute__((always_inline))
 #define ccl_align(...) __attribute__((aligned(__VA_ARGS__)))
 #ifndef FREE_WINDOWS64
 #define __forceinline inline __attribute__((always_inline))
index a338bb77b4c076c74562223418bcf7de6d181c89..68f7c04cd54621b5abf3c842c2070d7a0b0b4de5 100644 (file)
@@ -376,11 +376,11 @@ LbmFsgrSolver::mainLoop(const int lev)
   // main loop region
        const bool doReduce = true;
        const int gridLoopBound=1;
-       const int gDebugLevel = ::gDebugLevel;
        int calcNumInvIfCells = 0;
        LbmFloat calcInitialMass = 0;
        GRID_REGION_INIT();
 #if PARALLEL==1
+       const int gDebugLevel = ::gDebugLevel;
 #pragma omp parallel default(none) num_threads(mNumOMPThreads) \
   reduction(+: \
          calcCurrentMass,calcCurrentVolume, \
@@ -868,10 +868,8 @@ LbmFsgrSolver::mainLoop(const int lev)
                        // physical drop model
                        if(mPartUsePhysModel) {
                                LbmFloat realWorldFac = (mLevel[lev].simCellSize / mLevel[lev].timestep);
-                               LbmFloat rux = (ux * realWorldFac);
-                               LbmFloat ruy = (uy * realWorldFac);
-                               LbmFloat ruz = (uz * realWorldFac);
-                               LbmFloat rl = norm(ntlVec3Gfx(rux,ruy,ruz));
+                               LbmVec ru(ux * realWorldFac, uy * realWorldFac, uz * realWorldFac);
+                               LbmFloat rl = norm(ru);
                                basethresh *= rl;
 
                                // reduce probability in outer region?
@@ -963,14 +961,15 @@ LbmFsgrSolver::mainLoop(const int lev)
                                // average normal & velocity 
                                // -> mostly along velocity dir, many into surface
                                // fluid velocity (not normalized!)
-                               LbmVec flvelVel = LbmVec(ux,uy,uz);
+                               LbmVec flvelVel(ux,uy,uz);
                                LbmFloat flvelLen = norm(flvelVel);
                                // surface normal
-                               LbmVec normVel = LbmVec(surfaceNormal[0],surfaceNormal[1],surfaceNormal[2]);
+                               LbmVec normVel(surfaceNormal[0],surfaceNormal[1],surfaceNormal[2]);
                                normalize(normVel);
                                LbmFloat normScale = (0.01+flvelLen);
                                // jitter vector, 0.2 * flvel
-                               LbmVec jittVel = LbmVec(jx,jy,jz)*(0.05+flvelLen)*0.1;
+                               LbmVec jittVel(jx,jy,jz);
+                               jittVel *= (0.05+flvelLen)*0.1;
                                // weighten velocities
                                const LbmFloat flvelWeight = 0.9;
                                LbmVec newpartVel = normVel*normScale*(1.-flvelWeight) + flvelVel*(flvelWeight) + jittVel; 
@@ -1120,13 +1119,13 @@ LbmFsgrSolver::preinitGrids()
        const int lev = mMaxRefine;
        const bool doReduce = false;
        const int gridLoopBound=0;
-       const int gDebugLevel = ::gDebugLevel;
 
        // preinit both grids
        for(int s=0; s<2; s++) {
        
                GRID_REGION_INIT();
 #if PARALLEL==1
+       const int gDebugLevel = ::gDebugLevel;
 #pragma omp parallel default(none) num_threads(mNumOMPThreads) \
   reduction(+: \
          calcCurrentMass,calcCurrentVolume, \
@@ -1161,10 +1160,10 @@ LbmFsgrSolver::standingFluidPreinit()
        const int lev = mMaxRefine;
        const bool doReduce = false;
        const int gridLoopBound=1;
-       const int gDebugLevel = ::gDebugLevel;
 
        GRID_REGION_INIT();
 #if PARALLEL==1
+       const int gDebugLevel = ::gDebugLevel;
 #pragma omp parallel default(none) num_threads(mNumOMPThreads) \
   reduction(+: \
          calcCurrentMass,calcCurrentVolume, \