Cycles: Replace __MAX_CLOSURE__ build option with runtime integrator variable
authorMai Lavelle <mai.lavelle@gmail.com>
Thu, 9 Nov 2017 05:49:15 +0000 (00:49 -0500)
committerMai Lavelle <mai.lavelle@gmail.com>
Thu, 9 Nov 2017 06:04:06 +0000 (01:04 -0500)
Goal is to reduce OpenCL kernel recompilations.

Currently viewport renders are still set to use 64 closures as this seems to
be faster and we don't want to cause a performance regression there. Needs
to be investigated.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D2775

28 files changed:
intern/cycles/device/device.cpp
intern/cycles/device/device.h
intern/cycles/device/device_cpu.cpp
intern/cycles/device/device_cuda.cpp
intern/cycles/device/device_split_kernel.cpp
intern/cycles/device/device_split_kernel.h
intern/cycles/kernel/kernel_bake.h
intern/cycles/kernel/kernel_path.h
intern/cycles/kernel/kernel_path_branched.h
intern/cycles/kernel/kernel_subsurface.h
intern/cycles/kernel/kernel_types.h
intern/cycles/kernel/kernel_volume.h
intern/cycles/kernel/split/kernel_branched.h
intern/cycles/kernel/split/kernel_direct_lighting.h
intern/cycles/kernel/split/kernel_do_volume.h
intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
intern/cycles/kernel/split/kernel_indirect_background.h
intern/cycles/kernel/split/kernel_lamp_emission.h
intern/cycles/kernel/split/kernel_next_iteration_setup.h
intern/cycles/kernel/split/kernel_shader_eval.h
intern/cycles/kernel/split/kernel_shader_setup.h
intern/cycles/kernel/split/kernel_shader_sort.h
intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
intern/cycles/kernel/split/kernel_split_data.h
intern/cycles/kernel/split/kernel_split_data_types.h
intern/cycles/kernel/split/kernel_subsurface_scatter.h
intern/cycles/render/session.cpp

index 641e3fde1405e5847fd157db838db91f4da8e1ee..a3ff5481cefb39253b878f622601e5fe20dfb9b5 100644 (file)
@@ -47,7 +47,6 @@ std::ostream& operator <<(std::ostream &os,
 {
        os << "Experimental features: "
           << (requested_features.experimental ? "On" : "Off") << std::endl;
-       os << "Max closure count: " << requested_features.max_closure << std::endl;
        os << "Max nodes group: " << requested_features.max_nodes_group << std::endl;
        /* TODO(sergey): Decode bitflag into list of names. */
        os << "Nodes features: " << requested_features.nodes_features << std::endl;
index 6bf3bbe6c257d17f08cad9e9632ae1e5d2c93cb6..35b545388f2b6fdfa7d3a698d3d577d2d05d1eda 100644 (file)
@@ -91,9 +91,6 @@ public:
        /* Use experimental feature set. */
        bool experimental;
 
-       /* Maximum number of closures in shader trees. */
-       int max_closure;
-
        /* Selective nodes compilation. */
 
        /* Identifier of a node group up to which all the nodes needs to be
@@ -146,7 +143,6 @@ public:
        {
                /* TODO(sergey): Find more meaningful defaults. */
                experimental = false;
-               max_closure = 0;
                max_nodes_group = 0;
                nodes_features = 0;
                use_hair = false;
@@ -167,7 +163,6 @@ public:
        bool modified(const DeviceRequestedFeatures& requested_features)
        {
                return !(experimental == requested_features.experimental &&
-                        max_closure == requested_features.max_closure &&
                         max_nodes_group == requested_features.max_nodes_group &&
                         nodes_features == requested_features.nodes_features &&
                         use_hair == requested_features.use_hair &&
@@ -198,7 +193,6 @@ public:
                        string_printf("%d", max_nodes_group);
                build_options += " -D__NODES_FEATURES__=" +
                        string_printf("%d", nodes_features);
-               build_options += string_printf(" -D__MAX_CLOSURE__=%d", max_closure);
                if(!use_hair) {
                        build_options += " -D__NO_HAIR__";
                }
index 1a54c3380ee5aabb346aecde1262242a5e86912b..0f4001ab1a67b984dcdebe9d188e2ce245ab74d4 100644 (file)
@@ -760,7 +760,6 @@ public:
                CPUSplitKernel *split_kernel = NULL;
                if(use_split_kernel) {
                        split_kernel = new CPUSplitKernel(this);
-                       requested_features.max_closure = MAX_CLOSURE;
                        if(!split_kernel->load_kernels(requested_features)) {
                                thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
                                kgbuffer.free();
index 14e3ddc8c7b7d04df3acce3b21028f067488c71e..d230a0c565d9df1e3c3564f611285e136d9ee383 100644 (file)
@@ -1861,10 +1861,6 @@ public:
 
                        DeviceRequestedFeatures requested_features;
                        if(use_split_kernel()) {
-                               if(!use_adaptive_compilation()) {
-                                       requested_features.max_closure = 64;
-                               }
-
                                if(split_kernel == NULL) {
                                        split_kernel = new CUDASplitKernel(this);
                                        split_kernel->load_kernels(requested_features);
index 115273d9f0a2024094827d19630b11dae2f549d9..566d4020b335ed3472690b7e254efab2f602a5fd 100644 (file)
@@ -34,7 +34,6 @@ DeviceSplitKernel::DeviceSplitKernel(Device *device)
   use_queues_flag(device, "use_queues_flag"),
   work_pool_wgs(device, "work_pool_wgs")
 {
-       current_max_closure = -1;
        first_tile = true;
 
        avg_time_per_sample = 0.0;
@@ -116,8 +115,6 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_fe
 
 #undef LOAD_KERNEL
 
-       current_max_closure = requested_features.max_closure;
-
        return true;
 }
 
index 0647c66444749ea10f12d35ca5ff84d2dc736cb2..2ec0261e84709463c257f39d23c67b9bb83efa71 100644 (file)
@@ -92,9 +92,6 @@ private:
        /* Work pool with respect to each work group. */
        device_only_memory<unsigned int> work_pool_wgs;
 
-       /* clos_max value for which the kernels have been loaded currently. */
-       int current_max_closure;
-
        /* Marked True in constructor and marked false at the end of path_trace(). */
        bool first_tile;
 
index 73cddeb27f75b9f82e48b50ac8e8caabcc100e9a..8788e89c40e14bb753251a50245b421b760b1b89 100644 (file)
@@ -51,7 +51,7 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
        path_state_init(kg, &emission_sd, &state, rng_hash, sample, NULL);
 
        /* evaluate surface shader */
-       shader_eval_surface(kg, sd, &state, state.flag, MAX_CLOSURE);
+       shader_eval_surface(kg, sd, &state, state.flag, kernel_data.integrator.max_closures);
 
        /* TODO, disable more closures we don't need besides transparent */
        shader_bsdf_disable_transparency(kg, sd);
@@ -228,12 +228,12 @@ ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg,
                }
                else {
                        /* surface color of the pass only */
-                       shader_eval_surface(kg, sd, state, 0, MAX_CLOSURE);
+                       shader_eval_surface(kg, sd, state, 0, kernel_data.integrator.max_closures);
                        return kernel_bake_shader_bsdf(kg, sd, type);
                }
        }
        else {
-               shader_eval_surface(kg, sd, state, 0, MAX_CLOSURE);
+               shader_eval_surface(kg, sd, state, 0, kernel_data.integrator.max_closures);
                color = kernel_bake_shader_bsdf(kg, sd, type);
        }
 
@@ -333,7 +333,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
                {
                        float3 N = sd.N;
                        if((sd.flag & SD_HAS_BUMP)) {
-                               shader_eval_surface(kg, &sd, &state, 0, MAX_CLOSURE);
+                               shader_eval_surface(kg, &sd, &state, 0, kernel_data.integrator.max_closures);
                                N = shader_bsdf_average_normal(kg, &sd);
                        }
 
index 8519e0682e176c3857e890da141d48b0b28d75f2..207ba741e6f47ac6ac6853bf50e7eef7f2b868f0 100644 (file)
@@ -443,7 +443,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
                                      sd,
                                      &isect,
                                      ray);
-               shader_eval_surface(kg, sd, state, state->flag, MAX_CLOSURE);
+               shader_eval_surface(kg, sd, state, state->flag, kernel_data.integrator.max_closures);
                shader_prepare_closures(sd, state);
 
                /* Apply shadow catcher, holdout, emission. */
@@ -594,7 +594,7 @@ ccl_device_forceinline void kernel_path_integrate(
 
                /* Setup and evaluate shader. */
                shader_setup_from_ray(kg, &sd, &isect, ray);
-               shader_eval_surface(kg, &sd, state, state->flag, MAX_CLOSURE);
+               shader_eval_surface(kg, &sd, state, state->flag, kernel_data.integrator.max_closures);
                shader_prepare_closures(&sd, state);
 
                /* Apply shadow catcher, holdout, emission. */
index b37bc65f4dfbdbd0018c6b0df598fda43e37a686..9996f52f9a4cea76efecc671c6e9e94ec47a2d1d 100644 (file)
@@ -474,7 +474,7 @@ ccl_device void kernel_branched_path_integrate(KernelGlobals *kg,
 
                /* Setup and evaluate shader. */
                shader_setup_from_ray(kg, &sd, &isect, &ray);
-               shader_eval_surface(kg, &sd, &state, state.flag, MAX_CLOSURE);
+               shader_eval_surface(kg, &sd, &state, state.flag, kernel_data.integrator.max_closures);
                shader_merge_closures(&sd);
 
                /* Apply shadow catcher, holdout, emission. */
index 87e7d7ff3980e193a64e146982388de0daec6c5c..616ad71af3cf3777b461289617fc5a4ed3ccaaa0 100644 (file)
@@ -76,11 +76,11 @@ ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd,
 }
 
 /* replace closures with a single diffuse bsdf closure after scatter step */
-ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, const ShaderClosure *sc, float3 weight, bool hit, float3 N)
+ccl_device void subsurface_scatter_setup_diffuse_bsdf(KernelGlobals *kg, ShaderData *sd, const ShaderClosure *sc, float3 weight, bool hit, float3 N)
 {
        sd->flag &= ~SD_CLOSURE_FLAGS;
        sd->num_closure = 0;
-       sd->num_closure_left = MAX_CLOSURE;
+       sd->num_closure_left = kernel_data.integrator.max_closures;
 
        if(hit) {
                Bssrdf *bssrdf = (Bssrdf *)sc;
@@ -154,7 +154,7 @@ ccl_device void subsurface_color_bump_blur(KernelGlobals *kg,
 
        if(bump || texture_blur > 0.0f) {
                /* average color and normal at incoming point */
-               shader_eval_surface(kg, sd, state, state_flag, MAX_CLOSURE);
+               shader_eval_surface(kg, sd, state, state_flag, kernel_data.integrator.max_closures);
                float3 in_color = shader_bssrdf_sum(sd, (bump)? N: NULL, NULL);
 
                /* we simply divide out the average color and multiply with the average
@@ -342,7 +342,7 @@ ccl_device_noinline void subsurface_scatter_multi_setup(
        subsurface_color_bump_blur(kg, sd, state, state_flag, &weight, &N);
 
        /* Setup diffuse BSDF. */
-       subsurface_scatter_setup_diffuse_bsdf(sd, sc, weight, true, N);
+       subsurface_scatter_setup_diffuse_bsdf(kg, sd, sc, weight, true, N);
 }
 
 /* subsurface scattering step, from a point on the surface to another nearby point on the same object */
@@ -439,7 +439,7 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_a
        subsurface_color_bump_blur(kg, sd, state, state_flag, &eval, &N);
 
        /* setup diffuse bsdf */
-       subsurface_scatter_setup_diffuse_bsdf(sd, sc, eval, (ss_isect.num_hits > 0), N);
+       subsurface_scatter_setup_diffuse_bsdf(kg, sd, sc, eval, (ss_isect.num_hits > 0), N);
 }
 
 CCL_NAMESPACE_END
index fc3e7b3da98ab8dbb051b7e39139785cc227655e..919dafbc780478bd72f554b24d12b5fa5a1cd35c 100644 (file)
@@ -782,10 +782,14 @@ typedef struct AttributeDescriptor {
 /* Closure data */
 
 #ifdef __MULTI_CLOSURE__
-#  ifndef __MAX_CLOSURE__
-#     define MAX_CLOSURE 64
+#  ifdef __SPLIT_KERNEL__
+#    define MAX_CLOSURE 1
 #  else
-#    define MAX_CLOSURE __MAX_CLOSURE__
+#    ifndef __MAX_CLOSURE__
+#       define MAX_CLOSURE 64
+#    else
+#      define MAX_CLOSURE __MAX_CLOSURE__
+#    endif
 #  endif
 #else
 #  define MAX_CLOSURE 1
@@ -1313,7 +1317,8 @@ typedef struct KernelIntegrator {
        int volume_samples;
 
        int start_sample;
-       int pad;
+
+       int max_closures;
 } KernelIntegrator;
 static_assert_align(KernelIntegrator, 16);
 
index fb3c5437275167942729524b34cf3cdaefe53186..5604d8e516370b68a5d785d45a08f39bbe85c2e5 100644 (file)
@@ -62,7 +62,7 @@ ccl_device_inline bool volume_shader_sample(KernelGlobals *kg,
                                             VolumeShaderCoefficients *coeff)
 {
        sd->P = P;
-       shader_eval_volume(kg, sd, state, state->volume_stack, state->flag, MAX_CLOSURE);
+       shader_eval_volume(kg, sd, state, state->volume_stack, state->flag, kernel_data.integrator.max_closures);
 
        if(!(sd->flag & (SD_EXTINCTION|SD_SCATTER|SD_EMISSION)))
                return false;
index 2313feac0894c72ee47b07abf1e6f4c25f754655..6456636caaad28d9c7f6aeea68bd0ae0b3cd1594 100644 (file)
@@ -30,10 +30,14 @@ ccl_device_inline void kernel_split_branched_path_indirect_loop_init(KernelGloba
        BRANCHED_STORE(path_state);
        BRANCHED_STORE(throughput);
        BRANCHED_STORE(ray);
-       BRANCHED_STORE(sd);
        BRANCHED_STORE(isect);
        BRANCHED_STORE(ray_state);
 
+       branched_state->sd = *kernel_split_sd(sd, ray_index);
+       for(int i = 0; i < branched_state->sd.num_closure; i++) {
+               branched_state->sd.closure[i] = kernel_split_sd(sd, ray_index)->closure[i];
+       }
+
 #undef BRANCHED_STORE
 
        /* set loop counters to intial position */
@@ -53,10 +57,14 @@ ccl_device_inline void kernel_split_branched_path_indirect_loop_end(KernelGlobal
        BRANCHED_RESTORE(path_state);
        BRANCHED_RESTORE(throughput);
        BRANCHED_RESTORE(ray);
-       BRANCHED_RESTORE(sd);
        BRANCHED_RESTORE(isect);
        BRANCHED_RESTORE(ray_state);
 
+       *kernel_split_sd(sd, ray_index) = branched_state->sd;
+       for(int i = 0; i < branched_state->sd.num_closure; i++) {
+               kernel_split_sd(sd, ray_index)->closure[i] = branched_state->sd.closure[i];
+       }
+
 #undef BRANCHED_RESTORE
 
        /* leave indirect loop */
index 832b0e5b2652e035f7028c91d45c0e3680a8a898..ca79602c565355cca2559a8d854a79aff0de3d68 100644 (file)
@@ -58,7 +58,7 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
 
        if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
                ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
-               ShaderData *sd = &kernel_split_state.sd[ray_index];
+               ShaderData *sd = kernel_split_sd(sd, ray_index);
 
                /* direct lighting */
 #ifdef __EMISSION__
index 02881da6c04df8f96b3f6a4b010a50c79514fb54..fb5bd3d48dd3c278d3f6dbb254ae9b350931e036 100644 (file)
@@ -29,7 +29,7 @@ ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(K
 {
        SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
 
-       ShaderData *sd = &kernel_split_state.sd[ray_index];
+       ShaderData *sd = kernel_split_sd(sd, ray_index);
        PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
        ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
 
@@ -140,7 +140,7 @@ ccl_device void kernel_do_volume(KernelGlobals *kg)
                ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
                ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
                ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
-               ShaderData *sd = &kernel_split_state.sd[ray_index];
+               ShaderData *sd = kernel_split_sd(sd, ray_index);
                ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
 
                bool hit = ! IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
index bc8ca3aa3cae88d918fe6738fe448c6f9e36384e..88919f47c7a1d1b0d56306d12b1bf8021e1f3f19 100644 (file)
@@ -94,7 +94,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
        float3 throughput;
 
        ccl_global char *ray_state = kernel_split_state.ray_state;
-       ShaderData *sd = &kernel_split_state.sd[ray_index];
+       ShaderData *sd = kernel_split_sd(sd, ray_index);
 
        if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
                uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
index 0c894bd3d71486b889abca996e0198d97cd6e25d..4cf88a02590b207129c3c1b77ec62fb211b3b5fa 100644 (file)
@@ -55,7 +55,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
                PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
                ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
                float3 throughput = kernel_split_state.throughput[ray_index];
-               ShaderData *sd = &kernel_split_state.sd[ray_index];
+               ShaderData *sd = kernel_split_sd(sd, ray_index);
 
                kernel_path_background(kg, state, ray, throughput, sd, L);
                kernel_split_path_end(kg, ray_index);
index d5099ac66e664ffc8c0e0cc7bac47954a68d0e2b..c14f66f664f81583df9fd76f7ad3dc0a5468ed78 100644 (file)
@@ -58,7 +58,7 @@ ccl_device void kernel_lamp_emission(KernelGlobals *kg)
                float3 throughput = kernel_split_state.throughput[ray_index];
                Ray ray = kernel_split_state.ray[ray_index];
                ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
-               ShaderData *sd = &kernel_split_state.sd[ray_index];
+               ShaderData *sd = kernel_split_sd(sd, ray_index);
 
                kernel_path_lamp_emission(kg, state, &ray, throughput, isect, sd, L);
        }
index c3373174582acae1f6deaa1d042c2707d36ec4be..bb6bf1cc7e6c8beb619a8fca7b74574d1c4f2b4c 100644 (file)
@@ -58,7 +58,7 @@ ccl_device void kernel_split_branched_indirect_light_end(KernelGlobals *kg, int
        kernel_split_branched_path_indirect_loop_end(kg, ray_index);
 
        ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
-       ShaderData *sd = &kernel_split_state.sd[ray_index];
+       ShaderData *sd = kernel_split_sd(sd, ray_index);
        ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
        ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
 
@@ -126,7 +126,7 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
        if(active) {
                ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
                ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
-               ShaderData *sd = &kernel_split_state.sd[ray_index];
+               ShaderData *sd = kernel_split_sd(sd, ray_index);
                ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
                PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 
index 22602537524d70ef241294220c56a720c641fd7b..2409d1ba28b966f9a89bba1eabd643939d41af3f 100644 (file)
@@ -50,15 +50,15 @@ ccl_device void kernel_shader_eval(KernelGlobals *kg)
        if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
                ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
 
-               shader_eval_surface(kg, &kernel_split_state.sd[ray_index], state, state->flag, MAX_CLOSURE);
+               shader_eval_surface(kg, kernel_split_sd(sd, ray_index), state, state->flag, kernel_data.integrator.max_closures);
 #ifdef __BRANCHED_PATH__
                if(kernel_data.integrator.branched) {
-                       shader_merge_closures(&kernel_split_state.sd[ray_index]);
+                       shader_merge_closures(kernel_split_sd(sd, ray_index));
                }
                else
 #endif
                {
-                       shader_prepare_closures(&kernel_split_state.sd[ray_index], state);
+                       shader_prepare_closures(kernel_split_sd(sd, ray_index), state);
                }
        }
 }
index 0432689d9fa40fd6391945fa2701ebef2578a616..9d428ee813990a4421e43da6fb07d4015f489cd4 100644 (file)
@@ -61,7 +61,7 @@ ccl_device void kernel_shader_setup(KernelGlobals *kg,
                Ray ray = kernel_split_state.ray[ray_index];
 
                shader_setup_from_ray(kg,
-                                     &kernel_split_state.sd[ray_index],
+                                     kernel_split_sd(sd, ray_index),
                                      &isect,
                                      &ray);
        }
index 5a55b680695463b52c5ddac59fe57d559e1cb8e7..2132c42220fc47b17e80285d488e0c5aee4707e5 100644 (file)
@@ -47,7 +47,7 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg,
                        int ray_index = kernel_split_state.queue_data[add];
                        bool valid = (ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
                        if(valid) {
-                               value = kernel_split_state.sd[ray_index].shader & SHADER_MASK;
+                               value = kernel_split_sd(sd, ray_index)->shader & SHADER_MASK;
                        }
                }
                local_value[i + lid] = value;
index b50de615fc8bbcb026d233f7e738099e86fb978c..a4cffd77eff94d09d57f55b2248fce621f24f874 100644 (file)
@@ -33,7 +33,7 @@ ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg)
                return;
        }
 
-       ShaderData *sd = &kernel_split_state.sd[ray_index];
+       ShaderData *sd = kernel_split_sd(sd, ray_index);
        ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
        PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
        ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
index 9a6bdfbdffef55f2ebe8efd0b9aaabf7f4f24ce2..da072fd5f1a150ee9e3ba4cb41bbd2d312fac91b 100644 (file)
@@ -43,7 +43,7 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
        ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
        Ray ray = kernel_split_state.light_ray[ray_index];
        PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-       ShaderData *sd = &kernel_split_state.sd[ray_index];
+       ShaderData *sd = kernel_split_sd(sd, ray_index);
        float3 throughput = kernel_split_state.throughput[ray_index];
 
        BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
index eac22050a389daf38ea91ae233773d40c9327c0c..fa2f0b20a835db54fb269740c4e7c346fbfe62cc 100644 (file)
@@ -31,6 +31,14 @@ ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_
        size = size SPLIT_DATA_ENTRIES;
 #undef SPLIT_DATA_ENTRY
 
+       uint64_t closure_size = sizeof(ShaderClosure) * (kernel_data.integrator.max_closures-1);
+
+#ifdef __BRANCHED_PATH__
+       size += align_up(closure_size * num_elements, 16);
+#endif
+
+       size += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16);
+
        return size;
 }
 
@@ -49,6 +57,15 @@ ccl_device_inline void split_data_init(KernelGlobals *kg,
        SPLIT_DATA_ENTRIES;
 #undef SPLIT_DATA_ENTRY
 
+       uint64_t closure_size = sizeof(ShaderClosure) * (kernel_data.integrator.max_closures-1);
+
+#ifdef __BRANCHED_PATH__
+       p += align_up(closure_size * num_elements, 16);
+#endif
+
+       split_data->_sd = (ShaderData*)p;
+       p += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16);
+
        split_data->ray_state = ray_state;
 }
 
index 5c2aadcf4ec6ab3182c8099c2dc51923511a3a15..9ac3f904819d2ee3808bbb6046ea1426772c56be 100644 (file)
@@ -51,7 +51,6 @@ typedef ccl_global struct SplitBranchedState {
        float3 throughput;
        Ray ray;
 
-       struct ShaderData sd;
        Intersection isect;
 
        char ray_state;
@@ -77,6 +76,9 @@ typedef ccl_global struct SplitBranchedState {
        int shared_sample_count; /* number of branched samples shared with other threads */
        int original_ray; /* index of original ray when sharing branched samples */
        bool waiting_on_shared_samples;
+
+       /* Must be last in to allow for dynamic size of closures */
+       struct ShaderData sd;
 } SplitBranchedState;
 
 #define SPLIT_DATA_BRANCHED_ENTRIES \
@@ -110,11 +112,11 @@ typedef ccl_global struct SplitBranchedState {
        SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
        SPLIT_DATA_ENTRY(ccl_global int, queue_data, (NUM_QUEUES*2)) /* TODO(mai): this is too large? */ \
        SPLIT_DATA_ENTRY(ccl_global uint, buffer_offset, 1) \
-       SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
        SPLIT_DATA_ENTRY(ShaderDataTinyStorage, sd_DL_shadow, 1) \
        SPLIT_DATA_SUBSURFACE_ENTRIES \
        SPLIT_DATA_VOLUME_ENTRIES \
        SPLIT_DATA_BRANCHED_ENTRIES \
+       SPLIT_DATA_ENTRY(ShaderData, _sd, 0)
 
 /* entries to be copied to inactive rays when sharing branched samples (TODO: which are actually needed?) */
 #define SPLIT_DATA_ENTRIES_BRANCHED_SHARED \
@@ -126,11 +128,11 @@ typedef ccl_global struct SplitBranchedState {
        SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
        SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
        SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
-       SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
        SPLIT_DATA_ENTRY(ShaderDataTinyStorage, sd_DL_shadow, 1) \
        SPLIT_DATA_SUBSURFACE_ENTRIES \
        SPLIT_DATA_VOLUME_ENTRIES \
        SPLIT_DATA_BRANCHED_ENTRIES \
+       SPLIT_DATA_ENTRY(ShaderData, _sd, 0)
 
 /* struct that holds pointers to data in the shared state buffer */
 typedef struct SplitData {
@@ -154,6 +156,12 @@ __device__ SplitParams __split_param_data;
 #  define kernel_split_params (__split_param_data)
 #endif  /* __KERNEL_CUDA__ */
 
+#define kernel_split_sd(sd, ray_index) ((ShaderData*) \
+       ( \
+               ((ccl_global char*)kernel_split_state._##sd) + \
+               (sizeof(ShaderData) + sizeof(ShaderClosure)*(kernel_data.integrator.max_closures-1)) * (ray_index) \
+       ))
+
 /* Local storage for queue_enqueue kernel. */
 typedef struct QueueEnqueueLocals {
        uint queue_atomics[2];
index c5504d0a89ba8dfacec7c8d323521797414a4dd5..887c3e313d1fb6ea0f8ca68e26fa95c7b36752c7 100644 (file)
@@ -98,7 +98,7 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it
 
                        /* compute lighting with the BSDF closure */
                        for(int hit = branched_state->next_hit; hit < branched_state->num_hits; hit++) {
-                               ShaderData *bssrdf_sd = &kernel_split_state.sd[ray_index];
+                               ShaderData *bssrdf_sd = kernel_split_sd(sd, ray_index);
                                *bssrdf_sd = *sd; /* note: copy happens each iteration of inner loop, this is
                                                   * important as the indirect path will write into bssrdf_sd */
 
@@ -228,7 +228,7 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
                ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
                ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
                ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
-               ShaderData *sd = &kernel_split_state.sd[ray_index];
+               ShaderData *sd = kernel_split_sd(sd, ray_index);
                ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
 
                if(sd->flag & SD_BSSRDF) {
index 74cfd02e1a46359000884554e188911f2eb7b2e1..e8d9558c38df25c61b12ab394b1de50161840464 100644 (file)
@@ -642,13 +642,11 @@ DeviceRequestedFeatures Session::get_requested_device_features()
        DeviceRequestedFeatures requested_features;
        requested_features.experimental = params.experimental;
 
-       requested_features.max_closure = get_max_closure_count();
        scene->shader_manager->get_requested_features(
                scene,
                &requested_features);
        if(!params.background) {
                /* Avoid too much re-compilations for viewport render. */
-               requested_features.max_closure = 64;
                requested_features.max_nodes_group = NODE_GROUP_LEVEL_MAX;
                requested_features.nodes_features = NODE_FEATURE_ALL;
        }
@@ -858,6 +856,16 @@ void Session::update_scene()
        if(scene->need_update()) {
                load_kernels(false);
 
+               /* Update max_closures. */
+               KernelIntegrator *kintegrator = &scene->dscene.data.integrator;
+               if(params.background) {
+                       kintegrator->max_closures = get_max_closure_count();
+               }
+               else {
+                       /* Currently viewport render is faster with higher max_closures, needs investigating. */
+                       kintegrator->max_closures = 64;
+               }
+
                progress.set_status("Updating Scene");
                MEM_GUARDED_CALL(&progress, scene->device_update, device, progress);
        }