Cycles: Delay shooting SSS indirect rays
authorSergey Sharybin <sergey.vfx@gmail.com>
Sun, 22 Nov 2015 10:48:33 +0000 (15:48 +0500)
committerSergey Sharybin <sergey.vfx@gmail.com>
Wed, 25 Nov 2015 08:01:22 +0000 (13:01 +0500)
The idea is to delay shooting indirect rays for the SSS sampling and
trace them after the main integration loop was finished.

This reduces GPU stack usage even further and brings it down to around
652MB (comparing to 722MB before the change and 946MB with previous
stable release).

This also solves the speed regression happened in the previous commit
and now simple SSS scene (SSS suzanne on the floor) renders in 0:50
(comparing to 1:16 with previous commit and 1:03 with official release).

intern/cycles/kernel/kernel_bake.h
intern/cycles/kernel/kernel_path.h
intern/cycles/kernel/kernel_path_branched.h
intern/cycles/kernel/kernel_types.h

index 0cb56461f299def7abfa8cbab506191230d58caf..0f572b34ae016be7fa66a3553dfff4bb9356c4af 100644 (file)
@@ -64,8 +64,19 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian
                /* sample subsurface scattering */
                if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) {
                        /* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
-                       if(kernel_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, &throughput))
+                       SubsurfaceIndirectRays ss_indirect;
+                       if(kernel_path_subsurface_scatter(kg,
+                                                         sd,
+                                                         &L_sample,
+                                                         &state,
+                                                         &rng,
+                                                         &ray,
+                                                         &throughput,
+                                                         &ss_indirect))
+                       {
+                               kernel_path_subsurface_scatter_indirect(kg, &L_sample, &state, &rng, &ray, &ss_indirect);
                                is_sss_sample = true;
+                       }
                }
 #endif
 
@@ -84,7 +95,7 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian
                                state.ray_t = 0.0f;
 #endif
                                /* compute indirect light */
-                               kernel_path_indirect(kg, &rng, ray, throughput, 1, state, &L_sample);
+                               kernel_path_indirect(kg, &rng, &ray, throughput, 1, &state, &L_sample);
 
                                /* sum and reset indirect light pass variables for the next samples */
                                path_radiance_sum_indirect(&L_sample);
index 87d36efa4d4eee527c43e3a585f8c6fd3839dae5..1f385b8d304533be2b54e4a507c583c73f8e5c34 100644 (file)
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
-       float3 throughput, int num_samples, PathState state, PathRadiance *L)
+ccl_device void kernel_path_indirect(KernelGlobals *kg,
+                                     RNG *rng,
+                                     Ray *ray,
+                                     float3 throughput,
+                                     int num_samples,
+                                     PathState *state,
+                                     PathRadiance *L)
 {
        /* path iteration */
        for(;;) {
                /* intersect scene */
                Intersection isect;
-               uint visibility = path_state_ray_visibility(kg, &state);
-               bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f);
+               uint visibility = path_state_ray_visibility(kg, state);
+               bool hit = scene_intersect(kg,
+                                          ray,
+                                          visibility,
+                                          &isect,
+                                          NULL,
+                                          0.0f, 0.0f);
 
 #ifdef __LAMP_MIS__
-               if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) {
+               if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
                        /* ray starting from previous non-transparent bounce */
                        Ray light_ray;
 
-                       light_ray.P = ray.P - state.ray_t*ray.D;
-                       state.ray_t += isect.t;
-                       light_ray.D = ray.D;
-                       light_ray.t = state.ray_t;
-                       light_ray.time = ray.time;
-                       light_ray.dD = ray.dD;
-                       light_ray.dP = ray.dP;
+                       light_ray.P = ray->P - state->ray_t*ray->D;
+                       state->ray_t += isect.t;
+                       light_ray.D = ray->D;
+                       light_ray.t = state->ray_t;
+                       light_ray.time = ray->time;
+                       light_ray.dD = ray->dD;
+                       light_ray.dP = ray->dP;
 
                        /* intersect with lamp */
                        float3 emission;
-
-                       if(indirect_lamp_emission(kg, &state, &light_ray, &emission))
-                               path_radiance_accum_emission(L, throughput, emission, state.bounce);
+                       if(indirect_lamp_emission(kg, state, &light_ray, &emission)) {
+                               path_radiance_accum_emission(L,
+                                                            throughput,
+                                                            emission,
+                                                            state->bounce);
+                       }
                }
 #endif
 
 #ifdef __VOLUME__
                /* volume attenuation, emission, scatter */
-               if(state.volume_stack[0].shader != SHADER_NONE) {
-                       Ray volume_ray = ray;
+               if(state->volume_stack[0].shader != SHADER_NONE) {
+                       Ray volume_ray = *ray;
                        volume_ray.t = (hit)? isect.t: FLT_MAX;
 
-                       bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
+                       bool heterogeneous =
+                               volume_stack_is_heterogeneous(kg,
+                                                             state->volume_stack);
 
 #ifdef __VOLUME_DECOUPLED__
-                       int sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
+                       int sampling_method =
+                               volume_stack_sampling_method(kg,
+                                                            state->volume_stack);
                        bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, false, sampling_method);
 
                        if(decoupled) {
@@ -100,15 +117,27 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
                                VolumeSegment volume_segment;
                                ShaderData volume_sd;
 
-                               shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce);
-                               kernel_volume_decoupled_record(kg, &state,
-                                       &volume_ray, &volume_sd, &volume_segment, heterogeneous);
-                               
+                               shader_setup_from_volume(kg,
+                                                        &volume_sd,
+                                                        &volume_ray,
+                                                        state->bounce,
+                                                        state->transparent_bounce);
+                               kernel_volume_decoupled_record(kg,
+                                                              state,
+                                                              &volume_ray,
+                                                              &volume_sd,
+                                                              &volume_segment,
+                                                              heterogeneous);
+
                                volume_segment.sampling_method = sampling_method;
 
                                /* emission */
-                               if(volume_segment.closure_flag & SD_EMISSION)
-                                       path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
+                               if(volume_segment.closure_flag & SD_EMISSION) {
+                                       path_radiance_accum_emission(L,
+                                                                    throughput,
+                                                                    volume_segment.accum_emission,
+                                                                    state->bounce);
+                               }
 
                                /* scattering */
                                VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
@@ -117,28 +146,51 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
                                        bool all = kernel_data.integrator.sample_all_lights_indirect;
 
                                        /* direct light sampling */
-                                       kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
-                                               throughput, &state, L, all, &volume_ray, &volume_segment);
+                                       kernel_branched_path_volume_connect_light(kg,
+                                                                                 rng,
+                                                                                 &volume_sd,
+                                                                                 throughput,
+                                                                                 state,
+                                                                                 L,
+                                                                                 all,
+                                                                                 &volume_ray,
+                                                                                 &volume_segment);
 
                                        /* indirect sample. if we use distance sampling and take just
                                         * one sample for direct and indirect light, we could share
                                         * this computation, but makes code a bit complex */
-                                       float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE);
-                                       float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE);
+                                       float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
+                                       float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
 
                                        result = kernel_volume_decoupled_scatter(kg,
-                                               &state, &volume_ray, &volume_sd, &throughput,
-                                               rphase, rscatter, &volume_segment, NULL, true);
+                                                                                state,
+                                                                                &volume_ray,
+                                                                                &volume_sd,
+                                                                                &throughput,
+                                                                                rphase,
+                                                                                rscatter,
+                                                                                &volume_segment,
+                                                                                NULL,
+                                                                                true);
                                }
 
                                /* free cached steps */
                                kernel_volume_decoupled_free(kg, &volume_segment);
 
                                if(result == VOLUME_PATH_SCATTERED) {
-                                       if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray))
+                                       if(kernel_path_volume_bounce(kg,
+                                                                    rng,
+                                                                    &volume_sd,
+                                                                    &throughput,
+                                                                    state,
+                                                                    L,
+                                                                    ray))
+                                       {
                                                continue;
-                                       else
+                                       }
+                                       else {
                                                break;
+                                       }
                                }
                                else {
                                        throughput *= volume_segment.accum_transmittance;
@@ -150,18 +202,32 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
                                /* integrate along volume segment with distance sampling */
                                ShaderData volume_sd;
                                VolumeIntegrateResult result = kernel_volume_integrate(
-                                       kg, &state, &volume_sd, &volume_ray, L, &throughput, rng, heterogeneous);
+                                       kg, state, &volume_sd, &volume_ray, L, &throughput, rng, heterogeneous);
 
 #ifdef __VOLUME_SCATTER__
                                if(result == VOLUME_PATH_SCATTERED) {
                                        /* direct lighting */
-                                       kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, L);
+                                       kernel_path_volume_connect_light(kg,
+                                                                        rng,
+                                                                        &volume_sd,
+                                                                        throughput,
+                                                                        state,
+                                                                        L);
 
                                        /* indirect light bounce */
-                                       if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray))
+                                       if(kernel_path_volume_bounce(kg,
+                                                                    rng,
+                                                                    &volume_sd,
+                                                                    &throughput,
+                                                                    state,
+                                                                    L,
+                                                                    ray))
+                                       {
                                                continue;
-                                       else
+                                       }
+                                       else {
                                                break;
+                                       }
                                }
 #endif
                        }
@@ -171,8 +237,11 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
                if(!hit) {
 #ifdef __BACKGROUND__
                        /* sample background shader */
-                       float3 L_background = indirect_background(kg, &state, &ray);
-                       path_radiance_accum_background(L, throughput, L_background, state.bounce);
+                       float3 L_background = indirect_background(kg, state, ray);
+                       path_radiance_accum_background(L,
+                                                      throughput,
+                                                      L_background,
+                                                      state->bounce);
 #endif
 
                        break;
@@ -180,9 +249,14 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 
                /* setup shading */
                ShaderData sd;
-               shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce, state.transparent_bounce);
-               float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF);
-               shader_eval_surface(kg, &sd, rbsdf, state.flag, SHADER_CONTEXT_INDIRECT);
+               shader_setup_from_ray(kg,
+                                     &sd,
+                                     &isect,
+                                     ray,
+                                     state->bounce,
+                                     state->transparent_bounce);
+               float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF);
+               shader_eval_surface(kg, &sd, rbsdf, state->flag, SHADER_CONTEXT_INDIRECT);
 #ifdef __BRANCHED_PATH__
                shader_merge_closures(&sd);
 #endif
@@ -190,7 +264,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
                /* blurring of bsdf after bounces, for rays that have a small likelihood
                 * of following this particular path (diffuse, rough glossy) */
                if(kernel_data.integrator.filter_glossy != FLT_MAX) {
-                       float blur_pdf = kernel_data.integrator.filter_glossy*state.min_ray_pdf;
+                       float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
 
                        if(blur_pdf < 1.0f) {
                                float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
@@ -201,21 +275,28 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 #ifdef __EMISSION__
                /* emission */
                if(sd.flag & SD_EMISSION) {
-                       float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
-                       path_radiance_accum_emission(L, throughput, emission, state.bounce);
+                       float3 emission = indirect_primitive_emission(kg,
+                                                                     &sd,
+                                                                     isect.t,
+                                                                     state->flag,
+                                                                     state->ray_pdf);
+                       path_radiance_accum_emission(L, throughput, emission, state->bounce);
                }
 #endif
 
                /* path termination. this is a strange place to put the termination, it's
                 * mainly due to the mixed in MIS that we use. gives too many unneeded
                 * shader evaluations, only need emission if we are going to terminate */
-               float probability = path_state_terminate_probability(kg, &state, throughput*num_samples);
+               float probability =
+                       path_state_terminate_probability(kg,
+                                                        state,
+                                                        throughput*num_samples);
 
                if(probability == 0.0f) {
                        break;
                }
                else if(probability != 1.0f) {
-                       float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE);
+                       float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE);
 
                        if(terminate >= probability)
                                break;
@@ -227,7 +308,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
                /* ambient occlusion */
                if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
                        float bsdf_u, bsdf_v;
-                       path_state_rng_2D(kg, rng, &state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+                       path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 
                        float ao_factor = kernel_data.background.ao_factor;
                        float3 ao_N;
@@ -251,8 +332,14 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
                                light_ray.dP = sd.dP;
                                light_ray.dD = differential3_zero();
 
-                               if(!shadow_blocked(kg, &state, &light_ray, &ao_shadow))
-                                       path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state.bounce);
+                               if(!shadow_blocked(kg, state, &light_ray, &ao_shadow)) {
+                                       path_radiance_accum_ao(L,
+                                                              throughput,
+                                                              ao_alpha,
+                                                              ao_bsdf,
+                                                              ao_shadow,
+                                                              state->bounce);
+                               }
                        }
                }
 #endif
@@ -269,11 +356,21 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 
                        /* do bssrdf scatter step if we picked a bssrdf closure */
                        if(sc) {
-                               uint lcg_state = lcg_state_init(rng, &state, 0x68bc21eb);
+                               uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
 
                                float bssrdf_u, bssrdf_v;
-                               path_state_rng_2D(kg, rng, &state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
-                               subsurface_scatter_step(kg, &sd, state.flag, sc, &lcg_state, bssrdf_u, bssrdf_v, false);
+                               path_state_rng_2D(kg,
+                                                 rng,
+                                                 state,
+                                                 PRNG_BSDF_U,
+                                                 &bssrdf_u, &bssrdf_v);
+                               subsurface_scatter_step(kg,
+                                                       &sd,
+                                                       state->flag,
+                                                       sc,
+                                                       &lcg_state,
+                                                       bssrdf_u, bssrdf_v,
+                                                       false);
                        }
                }
 #endif
@@ -281,11 +378,18 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 #if defined(__EMISSION__) && defined(__BRANCHED_PATH__)
                if(kernel_data.integrator.use_direct_light) {
                        bool all = kernel_data.integrator.sample_all_lights_indirect;
-                       kernel_branched_path_surface_connect_light(kg, rng, &sd, &state, throughput, 1.0f, L, all);
+                       kernel_branched_path_surface_connect_light(kg,
+                                                                  rng,
+                                                                  &sd,
+                                                                  state,
+                                                                  throughput,
+                                                                  1.0f,
+                                                                  L,
+                                                                  all);
                }
 #endif
 
-               if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
+               if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, state, L, ray))
                        break;
        }
 }
@@ -326,7 +430,15 @@ ccl_device void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *
 
 #ifdef __SUBSURFACE__
 
-ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, Ray *ray, float3 *throughput)
+ccl_device bool kernel_path_subsurface_scatter(
+        KernelGlobals *kg,
+        ShaderData *sd,
+        PathRadiance *L,
+        PathState *state,
+        RNG *rng,
+        Ray *ray,
+        float3 *throughput,
+        SubsurfaceIndirectRays *ss_indirect)
 {
        float bssrdf_probability;
        ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
@@ -349,9 +461,9 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
                                                                  bssrdf_u, bssrdf_v,
                                                                  false);
 #ifdef __VOLUME__
-               Ray volume_ray = *ray;
-               bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
-                                               ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME;
+               ss_indirect->need_update_volume_stack =
+                       kernel_data.integrator.use_volumes &&
+                       ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME;
 #endif
 
                /* compute lighting with the BSDF closure */
@@ -367,46 +479,81 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
                                                       sc,
                                                       false);
 
-                       float3 tp = *throughput;
-                       PathState hit_state = *state;
-                       Ray hit_ray = *ray;
+                       PathState *hit_state = &ss_indirect->state;
+                       Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays];
+                       float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays];
+
+                       *hit_state = *state;
+                       *hit_ray = *ray;
+                       *hit_tp = *throughput;
 
-                       hit_state.rng_offset += PRNG_BOUNCE_NUM;
+                       hit_state->rng_offset += PRNG_BOUNCE_NUM;
 
-                       kernel_path_surface_connect_light(kg, rng, sd, tp, state, L);
+                       kernel_path_surface_connect_light(kg, rng, sd, *hit_tp, state, L);
 
-                       if(kernel_path_surface_bounce(kg, rng, sd, &tp, &hit_state, L, &hit_ray)) {
+                       if(kernel_path_surface_bounce(kg,
+                                                     rng,
+                                                     sd,
+                                                     hit_tp,
+                                                     hit_state,
+                                                     L,
+                                                     hit_ray))
+                       {
 #ifdef __LAMP_MIS__
-                               hit_state.ray_t = 0.0f;
+                               hit_state->ray_t = 0.0f;
 #endif
+                               ss_indirect->num_rays++;
+                       }
+               }
+               return true;
+       }
+       return false;
+}
+
+/* Trace subsurface indirect rays separately after the path loop, to reduce
+ * GPU stack memory usage. this way ShaderData and other data structures
+ * used during the loop are not needed during kernel_path_indirect.
+ */
+ccl_device void kernel_path_subsurface_scatter_indirect(
+        KernelGlobals *kg,
+        PathRadiance *L,
+        PathState *state,
+        RNG *rng,
+        Ray *ray,
+        SubsurfaceIndirectRays *ss_indirect)
+{
+       for (int i = 0; i < ss_indirect->num_rays; i++) {
+               Ray *indirect_ray = &ss_indirect->rays[i];
+               float3 indirect_throughput = ss_indirect->throughputs[i];
+
+               *state = ss_indirect->state;
 
 #ifdef __VOLUME__
-                               if(need_update_volume_stack) {
-                                       /* Setup ray from previous surface point to the new one. */
-                                       volume_ray.D = normalize_len(hit_ray.P - volume_ray.P,
-                                                                    &volume_ray.t);
-
-                                       kernel_volume_stack_update_for_subsurface(
-                                           kg,
-                                           &volume_ray,
-                                           hit_state.volume_stack);
-
-                                       /* Move volume ray forward. */
-                                       volume_ray.P = hit_ray.P;
-                               }
-#endif
+               if(ss_indirect->need_update_volume_stack) {
+                       /* TODO(sergey): Single assignment per scatter. */
+                       Ray volume_ray = *ray;
 
-                               kernel_path_indirect(kg, rng, hit_ray, tp, state->num_samples, hit_state, L);
+                       /* Setup ray from previous surface point to the new one. */
+                       volume_ray.D = normalize_len(indirect_ray->P - volume_ray.P,
+                                                    &volume_ray.t);
+
+                       kernel_volume_stack_update_for_subsurface(
+                               kg,
+                               &volume_ray,
+                               state->volume_stack);
 
-                               /* for render passes, sum and reset indirect light pass variables
-                                * for the next samples */
-                               path_radiance_sum_indirect(L);
-                               path_radiance_reset_indirect(L);
-                       }
                }
-               return true;
+#endif
+
+               /* Note that this modifies state. */
+               kernel_path_indirect(kg, rng, indirect_ray, indirect_throughput, state->num_samples, state, L);
+
+               /* For render passes, sum and reset indirect light pass variables
+                * for the next samples.
+                */
+               path_radiance_sum_indirect(L);
+               path_radiance_reset_indirect(L);
        }
-       return false;
 }
 #endif
 
@@ -427,6 +574,11 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
        debug_data_init(&debug_data);
 #endif
 
+#ifdef __SUBSURFACE__
+       SubsurfaceIndirectRays ss_indirect;
+       ss_indirect.num_rays = 0;
+#endif
+
        /* path iteration */
        for(;;) {
                /* intersect scene */
@@ -664,8 +816,17 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
                /* bssrdf scatter to a different location on the same object, replacing
                 * the closures with a diffuse BSDF */
                if(sd.flag & SD_BSSRDF) {
-                       if(kernel_path_subsurface_scatter(kg, &sd, &L, &state, rng, &ray, &throughput))
+                       if(kernel_path_subsurface_scatter(kg,
+                                                         &sd,
+                                                         &L,
+                                                         &state,
+                                                         rng,
+                                                         &ray,
+                                                         &throughput,
+                                                         &ss_indirect))
+                       {
                                break;
+                       }
                }
 #endif
 
@@ -677,6 +838,20 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
                        break;
        }
 
+#ifdef __SUBSURFACE__
+       /* Trace indirect subsurface afterwards to reduce GPU stack size.
+        * note that this modifies state.
+        */
+       if (ss_indirect.num_rays) {
+               kernel_path_subsurface_scatter_indirect(kg,
+                                                       &L,
+                                                       &state,
+                                                       rng,
+                                                       &ray,
+                                                       &ss_indirect);
+       }
+#endif
+
        float3 L_sum = path_radiance_clamp_and_sum(kg, &L);
 
        kernel_write_light_passes(kg, buffer, &L, sample);
index b6f95d6b0d26f7b97d4e6722c3531415c731cf32..ee507364b3088226369444c66860ac806c24e47a 100644 (file)
@@ -91,10 +91,27 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
                        float3 tp = throughput;
                        Ray bsdf_ray;
 
-                       if(!kernel_branched_path_surface_bounce(kg, &bsdf_rng, sd, sc, j, num_samples, &tp, &ps, L, &bsdf_ray))
+                       if(!kernel_branched_path_surface_bounce(kg,
+                                                               &bsdf_rng,
+                                                               sd,
+                                                               sc,
+                                                               j,
+                                                               num_samples,
+                                                               &tp,
+                                                               &ps,
+                                                               L,
+                                                               &bsdf_ray))
+                       {
                                continue;
+                       }
 
-                       kernel_path_indirect(kg, rng, bsdf_ray, tp*num_samples_inv, num_samples, ps, L);
+                       kernel_path_indirect(kg,
+                                            rng,
+                                            &bsdf_ray,
+                                            tp*num_samples_inv,
+                                            num_samples,
+                                            &ps,
+                                            L);
 
                        /* for render passes, sum and reset indirect light pass variables
                         * for the next samples */
@@ -312,12 +329,25 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 
                                        VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
                                                &ps, &pray, &volume_sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
-                                               
+
                                        (void)result;
                                        kernel_assert(result == VOLUME_PATH_SCATTERED);
 
-                                       if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) {
-                                               kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L);
+                                       if(kernel_path_volume_bounce(kg,
+                                                                    rng,
+                                                                    &volume_sd,
+                                                                    &tp,
+                                                                    &ps,
+                                                                    &L,
+                                                                    &pray))
+                                       {
+                                               kernel_path_indirect(kg,
+                                                                    rng,
+                                                                    &pray,
+                                                                    tp*num_samples_inv,
+                                                                    num_samples,
+                                                                    &ps,
+                                                                    &L);
 
                                                /* for render passes, sum and reset indirect light pass variables
                                                 * for the next samples */
@@ -353,15 +383,28 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 
                                VolumeIntegrateResult result = kernel_volume_integrate(
                                        kg, &ps, &volume_sd, &volume_ray, &L, &tp, rng, heterogeneous);
-                               
+
 #ifdef __VOLUME_SCATTER__
                                if(result == VOLUME_PATH_SCATTERED) {
                                        /* todo: support equiangular, MIS and all light sampling.
                                         * alternatively get decoupled ray marching working on the GPU */
                                        kernel_path_volume_connect_light(kg, rng, &volume_sd, tp, &state, &L);
 
-                                       if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) {
-                                               kernel_path_indirect(kg, rng, pray, tp, num_samples, ps, &L);
+                                       if(kernel_path_volume_bounce(kg,
+                                                                    rng,
+                                                                    &volume_sd,
+                                                                    &tp,
+                                                                    &ps,
+                                                                    &L,
+                                                                    &pray))
+                                       {
+                                               kernel_path_indirect(kg,
+                                                                    rng,
+                                                                    &pray,
+                                                                    tp,
+                                                                    num_samples,
+                                                                    &ps,
+                                                                    &L);
 
                                                /* for render passes, sum and reset indirect light pass variables
                                                 * for the next samples */
index e04f500e41069c7fdf1424804dbd09e15fb67ac2..bef00355c5e1b8cc0e8bee0bdd143bbdcf700d94 100644 (file)
@@ -520,18 +520,6 @@ typedef ccl_addr_space struct Intersection {
 #endif
 } Intersection;
 
-/* Subsurface Intersection result */
-
-struct SubsurfaceIntersection
-{
-       Ray ray;
-       float3 weight[BSSRDF_MAX_HITS];
-
-       int num_hits;
-       struct Intersection hits[BSSRDF_MAX_HITS];
-       float3 Ng[BSSRDF_MAX_HITS];
-};
-
 /* Primitives */
 
 typedef enum PrimitiveType {
@@ -764,6 +752,30 @@ typedef struct PathState {
 #endif
 } PathState;
 
+/* Subsurface */
+
+/* Struct to gather multiple SSS hits. */
+struct SubsurfaceIntersection
+{
+       Ray ray;
+       float3 weight[BSSRDF_MAX_HITS];
+
+       int num_hits;
+       Intersection hits[BSSRDF_MAX_HITS];
+       float3 Ng[BSSRDF_MAX_HITS];
+};
+
+/* Struct to gather SSS indirect rays and delay tracing them. */
+struct SubsurfaceIndirectRays
+{
+       bool need_update_volume_stack;
+       PathState state;
+
+       int num_rays;
+       Ray rays[BSSRDF_MAX_HITS];
+       float3 throughputs[BSSRDF_MAX_HITS];
+};
+
 /* Constant Kernel Data
  *
  * These structs are passed from CPU to various devices, and the struct layout