Merge branch 'master' into blender2.8
[blender.git] / intern / cycles / kernel / split / kernel_subsurface_scatter.h
index 0b4d50c70eea713521ed0c2f5e1c1fb0047027b5..887c3e313d1fb6ea0f8ca68e26fa95c7b36752c7 100644 (file)
 
 CCL_NAMESPACE_BEGIN
 
+#if defined(__BRANCHED_PATH__) && defined(__SUBSURFACE__)
 
-ccl_device void kernel_subsurface_scatter(KernelGlobals *kg,
-                                          ccl_local_param unsigned int* local_queue_atomics)
+ccl_device_inline void kernel_split_branched_path_subsurface_indirect_light_init(KernelGlobals *kg, int ray_index)  /*
+ * Prepare the branched state of ray_index for the subsurface indirect light
+ * loop: calls kernel_split_branched_path_indirect_loop_init(), zeroes the
+ * saved subsurface loop positions (ss_next_closure, ss_next_sample) and the
+ * hit bookkeeping (num_hits, next_hit), then calls ADD_RAY_FLAG with
+ * RAY_BRANCHED_SUBSURFACE_INDIRECT for this ray.
+ */
 {
-#ifdef __SUBSURFACE__
-       if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
-               *local_queue_atomics = 0;
+       kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+
+       SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+       /* the closure and sample loops of the iter function start from these positions */
+       branched_state->ss_next_closure = 0;
+       branched_state->ss_next_sample = 0;
+       /* no hits recorded yet, and the hit loop starts at index zero */
+       branched_state->num_hits = 0;
+       branched_state->next_hit = 0;
+
+       ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT);
+}
+/* Run, or continue, the subsurface indirect light loops for one ray.
+ *
+ * Three nested loops walk the BSSRDF closures of the saved shader data (i),
+ * the subsurface samples (j) and the hits found for a sample (hit). Each loop
+ * starts from the position stored in branched_state (ss_next_closure,
+ * ss_next_sample, next_hit).
+ *
+ * Returns true when
+ * kernel_split_branched_path_surface_indirect_light_iter() returned true for
+ * a hit (the current i, j and hit are stored first, so the next call starts
+ * from them), or when shared_sample_count is positive after all closures
+ * have been processed. Otherwise calls
+ * kernel_split_branched_path_indirect_loop_end() and returns false.
+ */
+ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_iter(KernelGlobals *kg, int ray_index)
+{
+       SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+       ShaderData *sd = &branched_state->sd;
+       PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+       ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
+
+       for(int i = branched_state->ss_next_closure; i < sd->num_closure; i++) {
+               ShaderClosure *sc = &sd->closure[i];
+
+               if(!CLOSURE_IS_BSSRDF(sc->type))
+                       continue;
+
+               /* set up random number generator; done only while the saved
+                * ss_next_sample, next_hit, next_closure and next_sample are all zero */
+               if(branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 &&
+                  branched_state->next_closure == 0 && branched_state->next_sample == 0)
+               {
+                       branched_state->lcg_state = lcg_state_init_addrspace(&branched_state->path_state,
+                                                                            0x68bc21eb);
+               }
+               int num_samples = kernel_data.integrator.subsurface_samples;
+               float num_samples_inv = 1.0f/num_samples;
+               uint bssrdf_rng_hash = cmj_hash(branched_state->path_state.rng_hash, i);
+
+               /* do subsurface scatter step with copy of shader data, this will
+                * replace the BSSRDF with a diffuse BSDF closure */
+               for(int j = branched_state->ss_next_sample; j < num_samples; j++) {
+                       ccl_global LocalIntersection *ss_isect = &branched_state->ss_isect;
+                       float bssrdf_u, bssrdf_v;
+                       path_branched_rng_2D(kg,
+                                            bssrdf_rng_hash,
+                                            &branched_state->path_state,
+                                            j,
+                                            num_samples,
+                                            PRNG_BSDF_U,
+                                            &bssrdf_u,
+                                            &bssrdf_v);
+
+                       /* intersection is expensive so avoid doing multiple times for the same input;
+                        * the hit count, LCG state and intersection data are stored in branched_state */
+                       if(branched_state->next_hit == 0 && branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+                               uint lcg_state = branched_state->lcg_state;
+                               LocalIntersection ss_isect_private;
+
+                               branched_state->num_hits = subsurface_scatter_multi_intersect(kg,
+                                                                                             &ss_isect_private,
+                                                                                             sd,
+                                                                                             sc,
+                                                                                             &lcg_state,
+                                                                                             bssrdf_u, bssrdf_v,
+                                                                                             true);
+
+                               branched_state->lcg_state = lcg_state;
+                               *ss_isect = ss_isect_private;
+                       }
+
+#ifdef __VOLUME__
+                       Ray volume_ray = branched_state->ray;
+                       bool need_update_volume_stack =
+                               kernel_data.integrator.use_volumes &&
+                               sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
+#endif  /* __VOLUME__ */
+
+                       /* compute lighting with the BSDF closure */
+                       for(int hit = branched_state->next_hit; hit < branched_state->num_hits; hit++) {
+                               ShaderData *bssrdf_sd = kernel_split_sd(sd, ray_index);
+                               *bssrdf_sd = *sd; /* note: copy happens each iteration of inner loop, this is
+                                                  * important as the indirect path will write into bssrdf_sd */
+                               /* subsurface_scatter_multi_setup() is given a private copy of the stored
+                                * intersection data, which is written back afterwards */
+                               LocalIntersection ss_isect_private = *ss_isect;
+                               subsurface_scatter_multi_setup(kg,
+                                                              &ss_isect_private,
+                                                              hit,
+                                                              bssrdf_sd,
+                                                              &branched_state->path_state,
+                                                              branched_state->path_state.flag,
+                                                              sc,
+                                                              true);
+                               *ss_isect = ss_isect_private;
+
+                               ccl_global PathState *hit_state = &kernel_split_state.path_state[ray_index];
+                               *hit_state = branched_state->path_state;
+
+                               path_state_branch(hit_state, j, num_samples);
+
+#ifdef __VOLUME__
+                               if(need_update_volume_stack) {
+                                       /* Setup ray from previous surface point to the new one. */
+                                       float3 P = ray_offset(bssrdf_sd->P, -bssrdf_sd->Ng);
+                                       volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t);
+
+                                       /* this next part is expensive as it does scene intersection so only do once */
+                                       if(branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+                                               for(int k = 0; k < VOLUME_STACK_SIZE; k++) {
+                                                       branched_state->volume_stack[k] = hit_state->volume_stack[k];
+                                               }
+
+                                               kernel_volume_stack_update_for_subsurface(kg,
+                                                                                         emission_sd,
+                                                                                         &volume_ray,
+                                                                                         branched_state->volume_stack);
+                                       }
+
+                                       for(int k = 0; k < VOLUME_STACK_SIZE; k++) {
+                                               hit_state->volume_stack[k] = branched_state->volume_stack[k];
+                                       }
+                               }
+#endif  /* __VOLUME__ */
+
+#ifdef __EMISSION__
+                               if(branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+                                       /* direct light */
+                                       if(kernel_data.integrator.use_direct_light) {
+                                               int all = (kernel_data.integrator.sample_all_lights_direct) ||
+                                                             (branched_state->path_state.flag & PATH_RAY_SHADOW_CATCHER);
+                                               kernel_branched_path_surface_connect_light(kg,
+                                                                                          bssrdf_sd,
+                                                                                          emission_sd,
+                                                                                          hit_state,
+                                                                                          branched_state->throughput,
+                                                                                          num_samples_inv,
+                                                                                          L,
+                                                                                          all);
+                                       }
+                               }
+#endif  /* __EMISSION__ */
+
+                               /* indirect light; on a true result the current loop position
+                                * (i, j, hit) is stored and true is returned to the caller */
+                               if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+                                                                                         ray_index,
+                                                                                         num_samples_inv,
+                                                                                         bssrdf_sd,
+                                                                                         false,
+                                                                                         false))
+                               {
+                                       branched_state->ss_next_closure = i;
+                                       branched_state->ss_next_sample = j;
+                                       branched_state->next_hit = hit;
+
+                                       return true;
+                               }
+
+                               branched_state->next_closure = 0;
+                       }
+
+                       branched_state->next_hit = 0;
+               }
+
+               branched_state->ss_next_sample = 0;
+       }
+       /* all closures processed: a later call finds the closure loop already finished */
+       branched_state->ss_next_closure = sd->num_closure;
+       /* keep returning true while shared_sample_count is positive */
+       branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
+       if(branched_state->waiting_on_shared_samples) {
+               return true;
+       }
+
+       kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+
+       return false;
+}
+
+#endif  /* __BRANCHED_PATH__ && __SUBSURFACE__ */
+
+ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
+{
+       int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+       if(thread_index == 0) {
+               /* We will empty both queues in this kernel. */
+               kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+               kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
        }
-       ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
        int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
        ray_index = get_ray_index(kg, ray_index,
                                  QUEUE_ACTIVE_AND_REGENERATED_RAYS,
                                  kernel_split_state.queue_data,
                                  kernel_split_params.queue_size,
-                                 0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
-       /* If we are executing on a GPU device, we exit all threads that are not
-        * required.
-        *
-        * If we are executing on a CPU device, then we need to keep all threads
-        * active since we have barrier() calls later in the kernel. CPU devices,
-        * expect all threads to execute barrier statement.
-        */
-       if(ray_index == QUEUE_EMPTY_SLOT) {
-               return;
-       }
-#endif
-
-       char enqueue_flag = 0;
-
-#ifndef __COMPUTE_DEVICE_GPU__
-       if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
+                                 1);
+       get_ray_index(kg, thread_index,
+                     QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+                     kernel_split_state.queue_data,
+                     kernel_split_params.queue_size,
+                     1);
 
+#ifdef __SUBSURFACE__
        ccl_global char *ray_state = kernel_split_state.ray_state;
-       ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
-       PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-       RNG rng = kernel_split_state.rng[ray_index];
-       ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
-       ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
-       ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
-       ShaderData *sd = &kernel_split_state.sd[ray_index];
-       ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
 
        if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+               ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+               PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+               ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+               ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+               ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
+               ShaderData *sd = kernel_split_sd(sd, ray_index);
+               ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
+
                if(sd->flag & SD_BSSRDF) {
-                       if(kernel_path_subsurface_scatter(kg,
-                                                         sd,
-                                                         emission_sd,
-                                                         L,
-                                                         state,
-                                                         &rng,
-                                                         ray,
-                                                         throughput,
-                                                         ss_indirect)) {
-                               ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-                               enqueue_flag = 1;
+
+#ifdef __BRANCHED_PATH__
+                       if(!kernel_data.integrator.branched) {
+#endif
+                               if(kernel_path_subsurface_scatter(kg,
+                                                                 sd,
+                                                                 emission_sd,
+                                                                 L,
+                                                                 state,
+                                                                 ray,
+                                                                 throughput,
+                                                                 ss_indirect))
+                               {
+                                       kernel_split_path_end(kg, ray_index);
+                               }
+#ifdef __BRANCHED_PATH__
+                       }
+                       else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+                               float bssrdf_u, bssrdf_v;
+                               path_state_rng_2D(kg,
+                                                 state,
+                                                 PRNG_BSDF_U,
+                                                 &bssrdf_u, &bssrdf_v);
+
+                               const ShaderClosure *sc = shader_bssrdf_pick(sd, throughput, &bssrdf_u);
+
+                               /* do bssrdf scatter step if we picked a bssrdf closure */
+                               if(sc) {
+                                       uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb);
+                                       subsurface_scatter_step(kg,
+                                                               sd,
+                                                               state,
+                                                               state->flag,
+                                                               sc,
+                                                               &lcg_state,
+                                                               bssrdf_u, bssrdf_v,
+                                                               false);
+                               }
+                       }
+                       else {
+                               kernel_split_branched_path_subsurface_indirect_light_init(kg, ray_index);
+
+                               if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
+                                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+                               }
                        }
+#endif
                }
-               kernel_split_state.rng[ray_index] = rng;
        }
 
-#ifndef __COMPUTE_DEVICE_GPU__
+#  ifdef __BRANCHED_PATH__
+       if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+               kernel_split_params.queue_index[QUEUE_SUBSURFACE_INDIRECT_ITER] = 0;
        }
-#endif
 
-       /* Enqueue RAY_UPDATE_BUFFER rays. */
-       enqueue_ray_index_local(ray_index,
-                               QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-                               enqueue_flag,
-                               kernel_split_params.queue_size,
-                               local_queue_atomics,
-                               kernel_split_state.queue_data,
-                               kernel_split_params.queue_index);
+       /* iter loop */
+       ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+                                 QUEUE_SUBSURFACE_INDIRECT_ITER,
+                                 kernel_split_state.queue_data,
+                                 kernel_split_params.queue_size,
+                                 1);
+
+       if(IS_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER)) {
+               /* for render passes, sum and reset indirect light pass variables
+                * for the next samples */
+               path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
+               path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
+
+               if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
+                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+               }
+       }
+#  endif  /* __BRANCHED_PATH__ */
 
 #endif  /* __SUBSURFACE__ */