Cycles: Tweaks to support CUDA 8 toolkit
author Sergey Sharybin <sergey.vfx@gmail.com>
Mon, 1 Aug 2016 13:40:46 +0000 (15:40 +0200)
committer Sergey Sharybin <sergey.vfx@gmail.com>
Mon, 1 Aug 2016 13:54:29 +0000 (15:54 +0200)
The changes mainly give the compiler explicit hints on inlining functions,
so that inlining matches how it worked with the previous toolkit.

This makes kernels compiled with CUDA 8 render, on average, at the same
speed as previous kernels. Some scenes are somewhat faster, some of them
are somewhat slower, but the slowdown is within 1% so far.

On the positive side, it allows us to enable newer-generation cards on
the buildbots (so GTX 10x0 will be officially supported soon).

29 files changed:
intern/cycles/device/device_cuda.cpp
intern/cycles/kernel/bvh/bvh_shadow_all.h
intern/cycles/kernel/bvh/bvh_subsurface.h
intern/cycles/kernel/bvh/bvh_traversal.h
intern/cycles/kernel/bvh/bvh_volume.h
intern/cycles/kernel/bvh/bvh_volume_all.h
intern/cycles/kernel/closure/bsdf.h
intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
intern/cycles/kernel/closure/bssrdf.h
intern/cycles/kernel/geom/geom_primitive.h
intern/cycles/kernel/geom/geom_volume.h
intern/cycles/kernel/kernel_bake.h
intern/cycles/kernel/kernel_camera.h
intern/cycles/kernel/kernel_compat_cuda.h
intern/cycles/kernel/kernel_light.h
intern/cycles/kernel/kernel_path.h
intern/cycles/kernel/kernel_path_surface.h
intern/cycles/kernel/kernel_path_volume.h
intern/cycles/kernel/kernel_projection.h
intern/cycles/kernel/kernel_shader.h
intern/cycles/kernel/kernel_subsurface.h
intern/cycles/kernel/kernel_volume.h
intern/cycles/kernel/svm/svm_attribute.h
intern/cycles/kernel/svm/svm_geometry.h
intern/cycles/kernel/svm/svm_ramp.h
intern/cycles/kernel/svm/svm_ramp_util.h
intern/cycles/kernel/svm/svm_tex_coord.h
intern/cycles/util/util_math.h

index 2d404918a38be9a7a4ca0c5d9da92fb4977543e5..80f2644fa8cb92e829b66057f34ce294f5fa557e 100644 (file)
@@ -321,11 +321,11 @@ public:
                        return "";
                }
                if(cuda_version < 60) {
-                       printf("Unsupported CUDA version %d.%d detected, you need CUDA 7.5.\n", cuda_version/10, cuda_version%10);
+                       printf("Unsupported CUDA version %d.%d detected, you need CUDA 7.5 or newer.\n", cuda_version/10, cuda_version%10);
                        return "";
                }
-               else if(cuda_version != 75)
-                       printf("CUDA version %d.%d detected, build may succeed but only CUDA 7.5 is officially supported.\n", cuda_version/10, cuda_version%10);
+               else if(cuda_version != 75 && cuda_version != 80)
+                       printf("CUDA version %d.%d detected, build may succeed but only CUDA 7.5 and 8.0 are officially supported.\n", cuda_version/10, cuda_version%10);
 
                /* Compile. */
                string kernel = path_join(kernel_path, path_join("kernels", path_join("cuda", "kernel.cu")));
index 1d6fa303d3ef58a06c183ae52a96ea0d48f72f5d..e9eeff31eccbe639d8c894c9eebb4679a29c9332 100644 (file)
  *
  */
 
-ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
-                                            const Ray *ray,
-                                            Intersection *isect_array,
-                                            const uint max_hits,
-                                            uint *num_hits)
+#ifndef __KERNEL_GPU__
+ccl_device
+#else
+ccl_device_inline
+#endif
+bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                 const Ray *ray,
+                                 Intersection *isect_array,
+                                 const uint max_hits,
+                                 uint *num_hits)
 {
        /* todo:
         * - likely and unlikely for if() statements
index 18978efcfa37d285412ed8fe4292231b6e10a6d0..d9623c94b2e35de503f721421721e0a571325a35 100644 (file)
  *
  */
 
-ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
-                                            const Ray *ray,
-                                            SubsurfaceIntersection *ss_isect,
-                                            int subsurface_object,
-                                            uint *lcg_state,
-                                            int max_hits)
+#ifndef __KERNEL_GPU__
+ccl_device
+#else
+ccl_device_inline
+#endif
+void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                 const Ray *ray,
+                                 SubsurfaceIntersection *ss_isect,
+                                 int subsurface_object,
+                                 uint *lcg_state,
+                                 int max_hits)
 {
        /* todo:
         * - test if pushing distance on the stack helps (for non shadow rays)
index 68a11b65ad73fff80aec247bf1a4933f64008bb3..b1a52968a2623d7f99ada8fb04fe9a3e5c026c9f 100644 (file)
  *
  */
 
-ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
-                                            const Ray *ray,
-                                            Intersection *isect,
-                                            const uint visibility
+#ifndef __KERNEL_GPU__
+ccl_device
+#else
+ccl_device_inline
+#endif
+bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                 const Ray *ray,
+                                 Intersection *isect,
+                                 const uint visibility
 #if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
-                                            , uint *lcg_state,
-                                            float difl,
-                                            float extmax
+                                 , uint *lcg_state,
+                                 float difl,
+                                 float extmax
 #endif
-                                            )
+                                 )
 {
        /* todo:
         * - test if pushing distance on the stack helps (for non shadow rays)
index 03499e943474bcc703ebfd3586fb4b23fe066e1b..107373c17dc14cf8fcd6654e07d84bde83ffd88d 100644 (file)
  *
  */
 
-ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
-                                            const Ray *ray,
-                                            Intersection *isect,
-                                            const uint visibility)
+#ifndef __KERNEL_GPU__
+ccl_device
+#else
+ccl_device_inline
+#endif
+bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                 const Ray *ray,
+                                 Intersection *isect,
+                                 const uint visibility)
 {
        /* todo:
         * - test if pushing distance on the stack helps (for non shadow rays)
index 7eddc2891d021ad4b124007190ca9e41b3b8c971..1f6515c9862422cdb5009127cdc8e63e1c464190 100644 (file)
  *
  */
 
-ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
-                                            const Ray *ray,
-                                            Intersection *isect_array,
-                                            const uint max_hits,
-                                            const uint visibility)
+#ifndef __KERNEL_GPU__
+ccl_device
+#else
+ccl_device_inline
+#endif
+uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                 const Ray *ray,
+                                 Intersection *isect_array,
+                                 const uint max_hits,
+                                 const uint visibility)
 {
        /* todo:
         * - test if pushing distance on the stack helps (for non shadow rays)
index a251e3bdcf9e92e79e20b91d375ca5f7c47425a8..55bdf3ecbb4724c82480a7addb3a0c091a9381f0 100644 (file)
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device int bsdf_sample(KernelGlobals *kg, ShaderData *sd, const ShaderClosure *sc, float randu, float randv, float3 *eval, float3 *omega_in, differential3 *domega_in, float *pdf)
+ccl_device_inline int bsdf_sample(KernelGlobals *kg,
+                                  ShaderData *sd,
+                                  const ShaderClosure *sc,
+                                  float randu,
+                                  float randv,
+                                  float3 *eval,
+                                  float3 *omega_in,
+                                  differential3 *domega_in,
+                                  float *pdf)
 {
        int label;
 
index 8ed76bea525d1220ac1daac7e065d32d59ff4ce7..9929246ae5cbe4f34101c2ff47d5bc51930650c6 100644 (file)
@@ -62,7 +62,11 @@ ccl_device_inline float bsdf_ashikhmin_shirley_roughness_to_exponent(float rough
        return 2.0f / (roughness*roughness) - 2.0f;
 }
 
-ccl_device float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device_inline float3 bsdf_ashikhmin_shirley_eval_reflect(
+        const ShaderClosure *sc,
+        const float3 I,
+        const float3 omega_in,
+        float *pdf)
 {
        const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc;
        float3 N = bsdf->N;
index afd4a8da62af5c466f817ee626162ad3431840a8..6ebe2f6a7519584519176d07d714a21fa3dae1ab 100644 (file)
  * energy is used. In combination with MIS, that is enough to produce an unbiased result, although
  * the balance heuristic isn't necessarily optimal anymore.
  */
-ccl_device float3 MF_FUNCTION_FULL_NAME(mf_eval)(float3 wi, float3 wo, const bool wo_outside, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint* lcg_state
+ccl_device_inline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
+        float3 wi,
+        float3 wo,
+        const bool wo_outside,
+        const float3 color,
+        const float alpha_x,
+        const float alpha_y,
+         ccl_addr_space uint *lcg_state
 #ifdef MF_MULTI_GLASS
-       , const float eta
+        , const float eta
 #elif defined(MF_MULTI_GLOSSY)
-       , float3 *n, float3 *k
+        , float3 *n, float3 *k
 #endif
 )
 {
index a260ae9a31ba52e4cd2d95fd27efef416c081458..35c95768b69325d9be4891a497d5f598c98d12cc 100644 (file)
@@ -141,7 +141,7 @@ ccl_device float bssrdf_cubic_pdf(const ShaderClosure *sc, float r)
 }
 
 /* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */
-ccl_device float bssrdf_cubic_quintic_root_find(float xi)
+ccl_device_inline float bssrdf_cubic_quintic_root_find(float xi)
 {
        /* newton-raphson iteration, usually succeeds in 2-4 iterations, except
         * outside 0.02 ... 0.98 where it can go up to 10, so overall performance
@@ -255,7 +255,7 @@ ccl_device float bssrdf_burley_pdf(const ShaderClosure *sc, float r)
  * Returns scaled radius, meaning the result is to be scaled up by d.
  * Since there's no closed form solution we do Newton-Raphson method to find it.
  */
-ccl_device float bssrdf_burley_root_find(float xi)
+ccl_device_inline float bssrdf_burley_root_find(float xi)
 {
        const float tolerance = 1e-6f;
        const int max_iteration_count = 10;
@@ -389,7 +389,7 @@ ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float
                bssrdf_burley_sample(sc, xi, r, h);
 }
 
-ccl_device float bssrdf_pdf(const ShaderClosure *sc, float r)
+ccl_device_inline float bssrdf_pdf(const ShaderClosure *sc, float r)
 {
        if(sc->type == CLOSURE_BSSRDF_CUBIC_ID)
                return bssrdf_cubic_pdf(sc, r);
index 44734d1b70dd02326aa1ba740ed84f75dccab5b3..b16f0c9a99b66e9bc13ba2708289ff61e1f42b23 100644 (file)
@@ -23,7 +23,11 @@ CCL_NAMESPACE_BEGIN
 
 /* Generic primitive attribute reading functions */
 
-ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
+ccl_device_inline float primitive_attribute_float(KernelGlobals *kg,
+                                                  const ShaderData *sd,
+                                                  AttributeElement elem,
+                                                  int offset,
+                                                  float *dx, float *dy)
 {
        if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
                if(subd_triangle_patch(kg, sd) == ~0)
@@ -48,7 +52,12 @@ ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData *
        }
 }
 
-ccl_device float3 primitive_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
+ccl_device_inline float3 primitive_attribute_float3(KernelGlobals *kg,
+                                                    const ShaderData *sd,
+                                                    AttributeElement elem,
+                                                    int offset,
+                                                    float3 *dx,
+                                                    float3 *dy)
 {
        if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
                if(subd_triangle_patch(kg, sd) == ~0)
@@ -75,7 +84,7 @@ ccl_device float3 primitive_attribute_float3(KernelGlobals *kg, const ShaderData
 
 /* Default UV coordinate */
 
-ccl_device float3 primitive_uv(KernelGlobals *kg, ShaderData *sd)
+ccl_device_inline float3 primitive_uv(KernelGlobals *kg, ShaderData *sd)
 {
        AttributeElement elem_uv;
        int offset_uv = find_attribute(kg, sd, ATTR_STD_UV, &elem_uv);
@@ -144,7 +153,7 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 
 /* Motion vector for motion pass */
 
-ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
+ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
 {
        /* center position */
        float3 center;
index 2044aafc877ac68ab750b4dbec16d5e7631a5024..7c8182bc430b654a6c92a9f6975ada0a81c836d0 100644 (file)
@@ -44,7 +44,9 @@ ccl_device float4 volume_image_texture_3d(int id, float x, float y, float z)
 }
 #endif  /* __KERNEL_GPU__ */
 
-ccl_device float3 volume_normalized_position(KernelGlobals *kg, const ShaderData *sd, float3 P)
+ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg,
+                                                    const ShaderData *sd,
+                                                    float3 P)
 {
        /* todo: optimize this so it's just a single matrix multiplication when
         * possible (not motion blur), or perhaps even just translation + scale */
index 9ee0b09529e51b4ea91e3f61233a222024568e3d..bfbf73df54f0f1e6169eec7b718691c11ec04ad5 100644 (file)
@@ -18,8 +18,12 @@ CCL_NAMESPACE_BEGIN
 
 #ifdef __BAKING__
 
-ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, RNG rng,
-                                   int pass_filter, int sample)
+ccl_device_inline void compute_light_pass(KernelGlobals *kg,
+                                          ShaderData *sd,
+                                          PathRadiance *L,
+                                          RNG rng,
+                                          int pass_filter,
+                                          int sample)
 {
        /* initialize master radiance accumulator */
        kernel_assert(kernel_data.film.use_light_pass);
index f6c103d59dd3fc09349eefc7e1c1baddb0e50931..88514de514c9372882170173f2705c46a3a47c57 100644 (file)
@@ -211,7 +211,10 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, fl
 
 /* Panorama Camera */
 
-ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray)
+ccl_device_inline void camera_sample_panorama(KernelGlobals *kg,
+                                              float raster_x, float raster_y,
+                                              float lens_u, float lens_v,
+                                              ccl_addr_space Ray *ray)
 {
        Transform rastertocamera = kernel_data.cam.rastertocamera;
        float3 Pcamera = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y, 0.0f));
@@ -303,8 +306,12 @@ ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float
 
 /* Common */
 
-ccl_device void camera_sample(KernelGlobals *kg, int x, int y, float filter_u, float filter_v,
-       float lens_u, float lens_v, float time, ccl_addr_space Ray *ray)
+ccl_device_inline void camera_sample(KernelGlobals *kg,
+                                     int x, int y,
+                                     float filter_u, float filter_v,
+                                     float lens_u, float lens_v,
+                                     float time,
+                                     ccl_addr_space Ray *ray)
 {
        /* pixel filter */
        int filter_table_offset = kernel_data.film.filter_table_offset;
index 08f6f4578051546d0f6ec2f2f10bd17826615a91..fb5812ebcb863e4e3c8a5e18fa6872bde3c142d0 100644 (file)
@@ -36,7 +36,7 @@
 /* Qualifier wrappers for different names on different devices */
 
 #define ccl_device  __device__ __inline__
-#define ccl_device_inline  __device__ __inline__
+#define ccl_device_inline  __device__ __forceinline__
 #define ccl_device_noinline  __device__ __noinline__
 #define ccl_global
 #define ccl_constant
index 93c4bd3f7d5df6e8dff8448a067fcdecf4d6c64e..1e829eaa1fad41632da0452daa0fe14d009e09cd 100644 (file)
@@ -44,11 +44,11 @@ typedef struct LightSample {
  *
  * Note: light_p is modified when sample_coord is true.
  */
-ccl_device float area_light_sample(float3 P,
-                                   float3 *light_p,
-                                   float3 axisu, float3 axisv,
-                                   float randu, float randv,
-                                   bool sample_coord)
+ccl_device_inline float area_light_sample(float3 P,
+                                          float3 *light_p,
+                                          float3 axisu, float3 axisv,
+                                          float randu, float randv,
+                                          bool sample_coord)
 {
        /* In our name system we're using P for the center,
         * which is o in the paper.
@@ -268,11 +268,11 @@ ccl_device_inline bool background_portal_data_fetch_and_check_side(KernelGlobals
        return false;
 }
 
-ccl_device float background_portal_pdf(KernelGlobals *kg,
-                                       float3 P,
-                                       float3 direction,
-                                       int ignore_portal,
-                                       bool *is_possible)
+ccl_device_inline float background_portal_pdf(KernelGlobals *kg,
+                                              float3 P,
+                                              float3 direction,
+                                              int ignore_portal,
+                                              bool *is_possible)
 {
        float portal_pdf = 0.0f;
 
@@ -367,7 +367,10 @@ ccl_device float3 background_portal_sample(KernelGlobals *kg,
        return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-ccl_device float3 background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, float *pdf)
+ccl_device_inline float3 background_light_sample(KernelGlobals *kg,
+                                                 float3 P,
+                                                 float randu, float randv,
+                                                 float *pdf)
 {
        /* Probability of sampling portals instead of the map. */
        float portal_sampling_pdf = kernel_data.integrator.portal_pdf;
@@ -507,8 +510,11 @@ ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3
        return t*t/cos_pi;
 }
 
-ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
-       float randu, float randv, float3 P, LightSample *ls)
+ccl_device_inline void lamp_light_sample(KernelGlobals *kg,
+                                         int lamp,
+                                         float randu, float randv,
+                                         float3 P,
+                                         LightSample *ls)
 {
        float4 data0 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 0);
        float4 data1 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 1);
index d5b310377237c01b05ef18c6a4af0a1ff7b1662c..1f08f3459e69f4b96b76ecbb6956a151abe07f1d 100644 (file)
@@ -436,7 +436,7 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
 
 #ifdef __SUBSURFACE__
 
-ccl_device bool kernel_path_subsurface_scatter(
+ccl_device_inline bool kernel_path_subsurface_scatter(
         KernelGlobals *kg,
         ShaderData *sd,
         ShaderData *emission_sd,
index 74b1ae0ca3241b4629682495331e5b0efc14b271..250b8e92a45040eecd9feb300dcb5bd15ce32c10 100644 (file)
@@ -222,8 +222,13 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_
 #endif
 
 /* path tracing: bounce off or through surface to with new direction stored in ray */
-ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, ccl_addr_space RNG *rng,
-       ShaderData *sd, ccl_addr_space float3 *throughput, ccl_addr_space PathState *state, PathRadiance *L, ccl_addr_space Ray *ray)
+ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
+                                           ccl_addr_space RNG *rng,
+                                           ShaderData *sd,
+                                           ccl_addr_space float3 *throughput,
+                                           ccl_addr_space PathState *state,
+                                           PathRadiance *L,
+                                           ccl_addr_space Ray *ray)
 {
        /* no BSDF? we can stop here */
        if(ccl_fetch(sd, flag) & SD_BSDF) {
index e45522a4641a1a5def1273bab963dd9bb86086df..5fd4f2fad4c19d70d4b658de5c128aa8f4690187 100644 (file)
@@ -18,8 +18,14 @@ CCL_NAMESPACE_BEGIN
 
 #ifdef __VOLUME_SCATTER__
 
-ccl_device void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
-       ShaderData *sd, ShaderData *emission_sd, float3 throughput, PathState *state, PathRadiance *L)
+ccl_device_inline void kernel_path_volume_connect_light(
+        KernelGlobals *kg,
+        RNG *rng,
+        ShaderData *sd,
+        ShaderData *emission_sd,
+        float3 throughput,
+        PathState *state,
+        PathRadiance *L)
 {
 #ifdef __EMISSION__
        if(!kernel_data.integrator.use_direct_light)
index 8be6742699a5d7bca434fce2dad34852146ac83e..3437d83ed7d21ddd5889073aaebab42aa1292422 100644 (file)
@@ -130,7 +130,10 @@ ccl_device float2 direction_to_fisheye_equisolid(float3 dir, float lens, float w
        return make_float2(u, v);
 }
 
-ccl_device float3 fisheye_equisolid_to_direction(float u, float v, float lens, float fov, float width, float height)
+ccl_device_inline float3 fisheye_equisolid_to_direction(float u, float v,
+                                                        float lens,
+                                                        float fov,
+                                                        float width, float height)
 {
        u = (u - 0.5f) * width;
        v = (v - 0.5f) * height;
@@ -189,7 +192,7 @@ ccl_device float2 direction_to_mirrorball(float3 dir)
        return make_float2(u, v);
 }
 
-ccl_device float3 panorama_to_direction(KernelGlobals *kg, float u, float v)
+ccl_device_inline float3 panorama_to_direction(KernelGlobals *kg, float u, float v)
 {
        switch(kernel_data.cam.panorama_type) {
                case PANORAMA_EQUIRECTANGULAR:
@@ -205,7 +208,7 @@ ccl_device float3 panorama_to_direction(KernelGlobals *kg, float u, float v)
        }
 }
 
-ccl_device float2 direction_to_panorama(KernelGlobals *kg, float3 dir)
+ccl_device_inline float2 direction_to_panorama(KernelGlobals *kg, float3 dir)
 {
        switch(kernel_data.cam.panorama_type) {
                case PANORAMA_EQUIRECTANGULAR:
@@ -221,9 +224,9 @@ ccl_device float2 direction_to_panorama(KernelGlobals *kg, float3 dir)
        }
 }
 
-ccl_device float3 spherical_stereo_position(KernelGlobals *kg,
-                                            float3 dir,
-                                            float3 pos)
+ccl_device_inline float3 spherical_stereo_position(KernelGlobals *kg,
+                                                   float3 dir,
+                                                   float3 pos)
 {
        float interocular_offset = kernel_data.cam.interocular_offset;
 
index b7641c37d935d443f25528a4d2dfb430e8fa3d24..bb3fe933b2c11093e363fdc18d992a06cce5c710 100644 (file)
@@ -149,8 +149,11 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
 /* ShaderData setup from BSSRDF scatter */
 
 #ifdef __SUBSURFACE__
-ccl_device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderData *sd,
-       const Intersection *isect, const Ray *ray)
+ccl_device void shader_setup_from_subsurface(
+        KernelGlobals *kg,
+        ShaderData *sd,
+        const Intersection *isect,
+        const Ray *ray)
 {
        bool backfacing = sd->flag & SD_BACKFACING;
 
@@ -226,14 +229,14 @@ ccl_device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderDat
 
 /* ShaderData setup from position sampled on mesh */
 
-ccl_device void shader_setup_from_sample(KernelGlobals *kg,
-                                         ShaderData *sd,
-                                         const float3 P,
-                                         const float3 Ng,
-                                         const float3 I,
-                                         int shader, int object, int prim,
-                                         float u, float v, float t,
-                                         float time)
+ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
+                                                ShaderData *sd,
+                                                const float3 P,
+                                                const float3 Ng,
+                                                const float3 I,
+                                                int shader, int object, int prim,
+                                                float u, float v, float t,
+                                                float time)
 {
        /* vectors */
        ccl_fetch(sd, P) = P;
@@ -445,7 +448,7 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s
 /* Merging */
 
 #if defined(__BRANCHED_PATH__) || defined(__VOLUME__)
-ccl_device void shader_merge_closures(ShaderData *sd)
+ccl_device_inline void shader_merge_closures(ShaderData *sd)
 {
        /* merge identical closures, better when we sample a single closure at a time */
        for(int i = 0; i < sd->num_closure; i++) {
@@ -554,9 +557,13 @@ ccl_device void shader_bsdf_eval(KernelGlobals *kg,
        }
 }
 
-ccl_device int shader_bsdf_sample(KernelGlobals *kg, ShaderData *sd,
-       float randu, float randv, BsdfEval *bsdf_eval,
-       float3 *omega_in, differential3 *domega_in, float *pdf)
+ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
+                                         ShaderData *sd,
+                                         float randu, float randv,
+                                         BsdfEval *bsdf_eval,
+                                         float3 *omega_in,
+                                         differential3 *domega_in,
+                                         float *pdf)
 {
        int sampled = 0;
 
@@ -991,8 +998,12 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg, const ShaderData *
 
 /* Volume Evaluation */
 
-ccl_device void shader_eval_volume(KernelGlobals *kg, ShaderData *sd,
-       PathState *state, VolumeStack *stack, int path_flag, ShaderContext ctx)
+ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
+                                          ShaderData *sd,
+                                          PathState *state,
+                                          VolumeStack *stack,
+                                          int path_flag,
+                                          ShaderContext ctx)
 {
        /* reset closures once at the start, we will be accumulating the closures
         * for all volumes in the stack into a single array of closures */
index 28fa826fde700dd2044063a6608b01e2d9a0d492..61073155e705b10fcfb60ffd7c24bde9ae9ab298 100644 (file)
@@ -85,7 +85,11 @@ ccl_device ShaderClosure *subsurface_scatter_pick_closure(KernelGlobals *kg, Sha
        return NULL;
 }
 
-ccl_device float3 subsurface_scatter_eval(ShaderData *sd, ShaderClosure *sc, float disk_r, float r, bool all)
+ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd,
+                                                 ShaderClosure *sc,
+                                                 float disk_r,
+                                                 float r,
+                                                 bool all)
 {
 #ifdef BSSRDF_MULTI_EVAL
        /* this is the veach one-sample model with balance heuristic, some pdf
@@ -214,7 +218,7 @@ ccl_device void subsurface_color_bump_blur(KernelGlobals *kg,
 /* Subsurface scattering step, from a point on the surface to other
  * nearby points on the same object.
  */
-ccl_device int subsurface_scatter_multi_intersect(
+ccl_device_inline int subsurface_scatter_multi_intersect(
         KernelGlobals *kg,
         SubsurfaceIntersection* ss_isect,
         ShaderData *sd,
index 01c87e6d89dfa51007a10fe9c51076d3e5377363..9dafed9afd196a9cf1eda5ad54f61a2e2d69ca0c 100644 (file)
@@ -36,7 +36,11 @@ typedef struct VolumeShaderCoefficients {
 } VolumeShaderCoefficients;
 
 /* evaluate shader to get extinction coefficient at P */
-ccl_device bool volume_shader_extinction_sample(KernelGlobals *kg, ShaderData *sd, PathState *state, float3 P, float3 *extinction)
+ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg,
+                                                       ShaderData *sd,
+                                                       PathState *state,
+                                                       float3 P,
+                                                       float3 *extinction)
 {
        sd->P = P;
        shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
@@ -58,7 +62,11 @@ ccl_device bool volume_shader_extinction_sample(KernelGlobals *kg, ShaderData *s
 }
 
 /* evaluate shader to get absorption, scattering and emission at P */
-ccl_device bool volume_shader_sample(KernelGlobals *kg, ShaderData *sd, PathState *state, float3 P, VolumeShaderCoefficients *coeff)
+ccl_device_inline bool volume_shader_sample(KernelGlobals *kg,
+                                            ShaderData *sd,
+                                            PathState *state,
+                                            float3 P,
+                                            VolumeShaderCoefficients *coeff)
 {
        sd->P = P;
        shader_eval_volume(kg, sd, state, state->volume_stack, state->flag, SHADER_CONTEXT_VOLUME);
index 6c557684099507c7c48ff50c08ee5e488ef226ba..ff92920c610feb0cf0e930356f0beb64a5b80b7e 100644 (file)
@@ -123,7 +123,10 @@ ccl_device void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *
        }
 }
 
-ccl_device void svm_node_attr_bump_dy(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_attr_bump_dy(KernelGlobals *kg,
+                                      ShaderData *sd,
+                                      float *stack,
+                                      uint4 node)
 {
        NodeAttributeType type, mesh_type;
        AttributeElement elem;
index bb06254c3a9530e751d683d6d8cccb7e9e927e97..7d512f7ff4ddac213514d1a8bb91fb7af2a6df18 100644 (file)
@@ -18,7 +18,11 @@ CCL_NAMESPACE_BEGIN
 
 /* Geometry Node */
 
-ccl_device void svm_node_geometry(KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_inline void svm_node_geometry(KernelGlobals *kg,
+                                         ShaderData *sd,
+                                         float *stack,
+                                         uint type,
+                                         uint out_offset)
 {
        float3 data;
 
@@ -94,7 +98,11 @@ ccl_device void svm_node_object_info(KernelGlobals *kg, ShaderData *sd, float *s
 
 /* Particle Info */
 
-ccl_device void svm_node_particle_info(KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device void svm_node_particle_info(KernelGlobals *kg,
+                                       ShaderData *sd,
+                                       float *stack,
+                                       uint type,
+                                       uint out_offset)
 {
        switch(type) {
                case NODE_INFO_PAR_INDEX: {
@@ -146,7 +154,11 @@ ccl_device void svm_node_particle_info(KernelGlobals *kg, ShaderData *sd, float
 
 /* Hair Info */
 
-ccl_device void svm_node_hair_info(KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device void svm_node_hair_info(KernelGlobals *kg,
+                                   ShaderData *sd,
+                                   float *stack,
+                                   uint type,
+                                   uint out_offset)
 {
        float data;
        float3 data3;
index f959d90f309256b180c2e09c05d90af9e2ee3fc6..368740f64c7dde1b4fbd76900b9962e2904bc06e 100644 (file)
@@ -21,12 +21,12 @@ CCL_NAMESPACE_BEGIN
 
 /* NOTE: svm_ramp.h, svm_ramp_util.h and node_ramp_util.h must stay consistent */
 
-ccl_device float4 rgb_ramp_lookup(KernelGlobals *kg,
-                                  int offset,
-                                  float f,
-                                  bool interpolate,
-                                  bool extrapolate,
-                                  int table_size)
+ccl_device_inline float4 rgb_ramp_lookup(KernelGlobals *kg,
+                                         int offset,
+                                         float f,
+                                         bool interpolate,
+                                         bool extrapolate,
+                                         int table_size)
 {
        if((f < 0.0f || f > 1.0f) && extrapolate) {
                float4 t0, dy;
index 495d98cf250113f212a552b2f17dfee97ba8fee4..9f2ce1276f9f83a5aaddb99b3d3a8cf0d8e757d5 100644 (file)
@@ -21,11 +21,11 @@ CCL_NAMESPACE_BEGIN
 
 /* NOTE: svm_ramp.h, svm_ramp_util.h and node_ramp_util.h must stay consistent */
 
-ccl_device float3 rgb_ramp_lookup(const float3 *ramp,
-                                  float f,
-                                  bool interpolate,
-                                  bool extrapolate,
-                                  int table_size)
+ccl_device_inline float3 rgb_ramp_lookup(const float3 *ramp,
+                                         float f,
+                                         bool interpolate,
+                                         bool extrapolate,
+                                         int table_size)
 {
        if ((f < 0.0f || f > 1.0f) && extrapolate) {
                float3 t0, dy;
index 276b6f26f5e72e80fb69df62de5fca7fba04cb58..b39d6a3e009af9a604ed0e9b09c1d1c6544e9be3 100644 (file)
@@ -99,12 +99,12 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg,
        stack_store_float3(stack, out_offset, data);
 }
 
-ccl_device_inline void svm_node_tex_coord_bump_dx(KernelGlobals *kg,
-                                                  ShaderData *sd,
-                                                  int path_flag,
-                                                  float *stack,
-                                                  uint4 node,
-                                                  int *offset)
+ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg,
+                                           ShaderData *sd,
+                                           int path_flag,
+                                           float *stack,
+                                           uint4 node,
+                                           int *offset)
 {
 #ifdef __RAY_DIFFERENTIALS__
        float3 data;
@@ -184,12 +184,12 @@ ccl_device_inline void svm_node_tex_coord_bump_dx(KernelGlobals *kg,
 #endif
 }
 
-ccl_device_inline void svm_node_tex_coord_bump_dy(KernelGlobals *kg,
-                                                  ShaderData *sd,
-                                                  int path_flag,
-                                                  float *stack,
-                                                  uint4 node,
-                                                  int *offset)
+ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg,
+                                           ShaderData *sd,
+                                           int path_flag,
+                                           float *stack,
+                                           uint4 node,
+                                           int *offset)
 {
 #ifdef __RAY_DIFFERENTIALS__
        float3 data;
index 016f4a6a794ccb1c022d4f49e72a53319282ff70..13aba0646d27630849b8655f1226785a0fb86c2d 100644 (file)
@@ -1477,10 +1477,10 @@ ccl_device bool ray_triangle_intersect(
        return true;
 }
 
-ccl_device bool ray_triangle_intersect_uv(
-       float3 ray_P, float3 ray_D, float ray_t,
-       float3 v0, float3 v1, float3 v2,
-       float *isect_u, float *isect_v, float *isect_t)
+ccl_device_inline bool ray_triangle_intersect_uv(
+        float3 ray_P, float3 ray_D, float ray_t,
+        float3 v0, float3 v1, float3 v2,
+        float *isect_u, float *isect_v, float *isect_t)
 {
        /* Calculate intersection */
        float3 e1 = v1 - v0;