Cycles CUDA: reduce branched path stack memory by sharing indirect ShaderData.
author Brecht Van Lommel <brechtvanlommel@gmail.com>
Tue, 24 May 2016 20:28:03 +0000 (22:28 +0200)
committer Brecht Van Lommel <brechtvanlommel@gmail.com>
Wed, 25 May 2016 19:13:24 +0000 (21:13 +0200)
Saves about 15% of stack memory for the branched path kernel.

intern/cycles/kernel/kernel_bake.h
intern/cycles/kernel/kernel_path.h
intern/cycles/kernel/kernel_path_branched.h

diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index 77982ee548a0d208afd6447bd5e8dd1b099b3f01..3966a06fe330af447cb20a16c4ffc508eca33541 100644
@@ -30,8 +30,8 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian
        Ray ray;
        float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
 
-       /* emission shader data memory used by various functions */
-       ShaderData emission_sd;
+       /* emission and indirect shader data memory used by various functions */
+       ShaderData emission_sd, indirect_sd;
 
        ray.P = sd->P + sd->Ng;
        ray.D = -sd->Ng;
@@ -94,6 +94,7 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian
                                                                              &L_sample,
                                                                              &throughput);
                                        kernel_path_indirect(kg,
+                                                            &indirect_sd,
                                                             &emission_sd,
                                                             &rng,
                                                             &ray,
@@ -117,7 +118,7 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian
                                state.ray_t = 0.0f;
 #endif
                                /* compute indirect light */
-                               kernel_path_indirect(kg, &emission_sd, &rng, &ray, throughput, 1, &state, &L_sample);
+                               kernel_path_indirect(kg, &indirect_sd, &emission_sd, &rng, &ray, throughput, 1, &state, &L_sample);
 
                                /* sum and reset indirect light pass variables for the next samples */
                                path_radiance_sum_indirect(&L_sample);
@@ -144,7 +145,8 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian
                /* sample subsurface scattering */
                if((pass_filter & BAKE_FILTER_SUBSURFACE) && (sd->flag & SD_BSSRDF)) {
                        /* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
-                       kernel_branched_path_subsurface_scatter(kg, sd, &emission_sd, &L_sample, &state, &rng, &ray, throughput);
+                       kernel_branched_path_subsurface_scatter(kg, sd, &indirect_sd,
+                               &emission_sd, &L_sample, &state, &rng, &ray, throughput);
                }
 #endif
 
@@ -161,7 +163,7 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian
 
                        /* indirect light */
                        kernel_branched_path_surface_indirect_light(kg, &rng,
-                               sd, &emission_sd, throughput, 1.0f, &state, &L_sample);
+                               sd, &indirect_sd, &emission_sd, throughput, 1.0f, &state, &L_sample);
                }
        }
 #endif
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 5527d8aa86141e129fdf5418f57e268d8bcac8e3..0dded397ffa40ed586710b2197c16c93c544288d 100644
@@ -53,6 +53,7 @@
 CCL_NAMESPACE_BEGIN
 
 ccl_device void kernel_path_indirect(KernelGlobals *kg,
+                                     ShaderData *sd,
                                      ShaderData *emission_sd,
                                      RNG *rng,
                                      Ray *ray,
@@ -61,9 +62,6 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
                                      PathState *state,
                                      PathRadiance *L)
 {
-       /* shader data memory used for both volumes and surfaces, saves stack space */
-       ShaderData sd;
-
        /* path iteration */
        for(;;) {
                /* intersect scene */
@@ -121,12 +119,12 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
                                VolumeSegment volume_segment;
 
                                shader_setup_from_volume(kg,
-                                                        &sd,
+                                                        sd,
                                                         &volume_ray);
                                kernel_volume_decoupled_record(kg,
                                                               state,
                                                               &volume_ray,
-                                                              &sd,
+                                                              sd,
                                                               &volume_segment,
                                                               heterogeneous);
 
@@ -149,7 +147,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
                                        /* direct light sampling */
                                        kernel_branched_path_volume_connect_light(kg,
                                                                                  rng,
-                                                                                 &sd,
+                                                                                 sd,
                                                                                  emission_sd,
                                                                                  throughput,
                                                                                  state,
@@ -167,7 +165,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
                                        result = kernel_volume_decoupled_scatter(kg,
                                                                                 state,
                                                                                 &volume_ray,
-                                                                                &sd,
+                                                                                sd,
                                                                                 &throughput,
                                                                                 rphase,
                                                                                 rscatter,
@@ -182,7 +180,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
                                if(result == VOLUME_PATH_SCATTERED) {
                                        if(kernel_path_volume_bounce(kg,
                                                                     rng,
-                                                                    &sd,
+                                                                    sd,
                                                                     &throughput,
                                                                     state,
                                                                     L,
@@ -203,14 +201,14 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
                        {
                                /* integrate along volume segment with distance sampling */
                                VolumeIntegrateResult result = kernel_volume_integrate(
-                                       kg, state, &sd, &volume_ray, L, &throughput, rng, heterogeneous);
+                                       kg, state, sd, &volume_ray, L, &throughput, rng, heterogeneous);
 
 #  ifdef __VOLUME_SCATTER__
                                if(result == VOLUME_PATH_SCATTERED) {
                                        /* direct lighting */
                                        kernel_path_volume_connect_light(kg,
                                                                         rng,
-                                                                        &sd,
+                                                                        sd,
                                                                         emission_sd,
                                                                         throughput,
                                                                         state,
@@ -219,7 +217,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
                                        /* indirect light bounce */
                                        if(kernel_path_volume_bounce(kg,
                                                                     rng,
-                                                                    &sd,
+                                                                    sd,
                                                                     &throughput,
                                                                     state,
                                                                     L,
@@ -251,13 +249,13 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 
                /* setup shading */
                shader_setup_from_ray(kg,
-                                     &sd,
+                                     sd,
                                      &isect,
                                      ray);
                float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF);
-               shader_eval_surface(kg, &sd, state, rbsdf, state->flag, SHADER_CONTEXT_INDIRECT);
+               shader_eval_surface(kg, sd, state, rbsdf, state->flag, SHADER_CONTEXT_INDIRECT);
 #ifdef __BRANCHED_PATH__
-               shader_merge_closures(&sd);
+               shader_merge_closures(sd);
 #endif
 
                /* blurring of bsdf after bounces, for rays that have a small likelihood
@@ -267,15 +265,15 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 
                        if(blur_pdf < 1.0f) {
                                float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
-                               shader_bsdf_blur(kg, &sd, blur_roughness);
+                               shader_bsdf_blur(kg, sd, blur_roughness);
                        }
                }
 
 #ifdef __EMISSION__
                /* emission */
-               if(sd.flag & SD_EMISSION) {
+               if(sd->flag & SD_EMISSION) {
                        float3 emission = indirect_primitive_emission(kg,
-                                                                     &sd,
+                                                                     sd,
                                                                      isect.t,
                                                                      state->flag,
                                                                      state->ray_pdf);
@@ -305,30 +303,30 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 
 #ifdef __AO__
                /* ambient occlusion */
-               if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
+               if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
                        float bsdf_u, bsdf_v;
                        path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 
                        float ao_factor = kernel_data.background.ao_factor;
                        float3 ao_N;
-                       float3 ao_bsdf = shader_bsdf_ao(kg, &sd, ao_factor, &ao_N);
+                       float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
                        float3 ao_D;
                        float ao_pdf;
                        float3 ao_alpha = make_float3(0.0f, 0.0f, 0.0f);
 
                        sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
 
-                       if(dot(sd.Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
+                       if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
                                Ray light_ray;
                                float3 ao_shadow;
 
-                               light_ray.P = ray_offset(sd.P, sd.Ng);
+                               light_ray.P = ray_offset(sd->P, sd->Ng);
                                light_ray.D = ao_D;
                                light_ray.t = kernel_data.background.ao_distance;
 #  ifdef __OBJECT_MOTION__
-                               light_ray.time = sd.time;
+                               light_ray.time = sd->time;
 #  endif
-                               light_ray.dP = sd.dP;
+                               light_ray.dP = sd->dP;
                                light_ray.dD = differential3_zero();
 
                                if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
@@ -346,9 +344,9 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 #ifdef __SUBSURFACE__
                /* bssrdf scatter to a different location on the same object, replacing
                 * the closures with a diffuse BSDF */
-               if(sd.flag & SD_BSSRDF) {
+               if(sd->flag & SD_BSSRDF) {
                        float bssrdf_probability;
-                       ShaderClosure *sc = subsurface_scatter_pick_closure(kg, &sd, &bssrdf_probability);
+                       ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
 
                        /* modify throughput for picking bssrdf or bsdf */
                        throughput *= bssrdf_probability;
@@ -364,7 +362,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
                                                  PRNG_BSDF_U,
                                                  &bssrdf_u, &bssrdf_v);
                                subsurface_scatter_step(kg,
-                                                       &sd,
+                                                       sd,
                                                        state,
                                                        state->flag,
                                                        sc,
@@ -380,7 +378,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
                        int all = kernel_data.integrator.sample_all_lights_indirect;
                        kernel_branched_path_surface_connect_light(kg,
                                                                   rng,
-                                                                  &sd,
+                                                                  sd,
                                                                   emission_sd,
                                                                   state,
                                                                   throughput,
@@ -390,7 +388,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
                }
 #endif
 
-               if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, state, L, ray))
+               if(!kernel_path_surface_bounce(kg, rng, sd, &throughput, state, L, ray))
                        break;
        }
 }
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index b4dee220aa5a6b84a802999649adce52b8c4647d..fdba1a7b02518274ac65409d6accd99af103ab91 100644
@@ -64,8 +64,8 @@ ccl_device void kernel_branched_path_ao(KernelGlobals *kg,
 
 /* bounce off surface and integrate indirect light */
 ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
-       RNG *rng, ShaderData *sd, ShaderData *emission_sd, float3 throughput,
-       float num_samples_adjust, PathState *state, PathRadiance *L)
+       RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
+       float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L)
 {
        for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
                const ShaderClosure *sc = &ccl_fetch(sd, closure)[i];
@@ -112,6 +112,7 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
                        }
 
                        kernel_path_indirect(kg,
+                                                        indirect_sd,
                                             emission_sd,
                                             rng,
                                             &bsdf_ray,
@@ -131,6 +132,7 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
 #ifdef __SUBSURFACE__
 ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
                                                         ShaderData *sd,
+                                                        ShaderData *indirect_sd,
                                                         ShaderData *emission_sd,
                                                         PathRadiance *L,
                                                         PathState *state,
@@ -222,6 +224,7 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
                                        kg,
                                        rng,
                                        &bssrdf_sd,
+                                               indirect_sd,
                                        emission_sd,
                                        throughput,
                                        num_samples_inv,
@@ -244,8 +247,8 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 
        /* shader data memory used for both volumes and surfaces, saves stack space */
        ShaderData sd;
-       /* shader data used by emission, shadows, volume stacks */
-       ShaderData emission_sd;
+       /* shader data used by emission, shadows, volume stacks, indirect path */
+       ShaderData emission_sd, indirect_sd;
 
        PathState state;
        path_state_init(kg, &emission_sd, &state, rng, sample, &ray);
@@ -356,6 +359,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
                                                                     &pray))
                                        {
                                                kernel_path_indirect(kg,
+                                                                    &indirect_sd,
                                                                     &emission_sd,
                                                                     rng,
                                                                     &pray,
@@ -413,6 +417,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
                                                                     &pray))
                                        {
                                                kernel_path_indirect(kg,
+                                                                    &indirect_sd,
                                                                     &emission_sd,
                                                                     rng,
                                                                     &pray,
@@ -522,8 +527,8 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 #ifdef __SUBSURFACE__
                /* bssrdf scatter to a different location on the same object */
                if(sd.flag & SD_BSSRDF) {
-                       kernel_branched_path_subsurface_scatter(kg, &sd, &emission_sd, &L, &state,
-                                                               rng, &ray, throughput);
+                       kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd,
+                                                               &L, &state, rng, &ray, throughput);
                }
 #endif
 
@@ -541,7 +546,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 
                        /* indirect light */
                        kernel_branched_path_surface_indirect_light(kg, rng,
-                               &sd, &emission_sd, throughput, 1.0f, &hit_state, &L);
+                               &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, &L);
 
                        /* continue in case of transparency */
                        throughput *= shader_bsdf_transparency(kg, &sd);