Cycles: Branched path tracing for the split kernel
authorMai Lavelle <mai.lavelle@gmail.com>
Tue, 21 Mar 2017 02:31:54 +0000 (22:31 -0400)
committerMai Lavelle <mai.lavelle@gmail.com>
Tue, 2 May 2017 18:26:46 +0000 (14:26 -0400)
This implements branched path tracing for the split kernel.

General approach is to store the ray state at a branch point, trace the
branched ray as normal, then restore the state as necessary before iterating
to the next part of the path. A state machine is used to advance the indirect
loop state, which avoids the need to add any new kernels. Each iteration the
state machine recreates as much state as possible from the stored ray to keep
overall storage down.

Its kind of hard to keep all the different integration loops in sync, so this
needs lots of testing to make sure everything is working correctly. We should
probably start trying to deduplicate the integration loops more now.

Nonbranched BMW is ~2% slower, while classroom is ~2% faster, other scenes
could use more testing still.

Reviewers: sergey, nirved

Reviewed By: nirved

Subscribers: Blendify, bliblubli

Differential Revision: https://developer.blender.org/D2611

25 files changed:
intern/cycles/blender/addon/ui.py
intern/cycles/device/device_split_kernel.cpp
intern/cycles/kernel/CMakeLists.txt
intern/cycles/kernel/kernel_path.h
intern/cycles/kernel/kernel_path_branched.h
intern/cycles/kernel/kernel_path_surface.h
intern/cycles/kernel/kernel_subsurface.h
intern/cycles/kernel/kernel_types.h
intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
intern/cycles/kernel/kernels/cuda/kernel_split.cu
intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
intern/cycles/kernel/split/kernel_branched.h [new file with mode: 0644]
intern/cycles/kernel/split/kernel_data_init.h
intern/cycles/kernel/split/kernel_direct_lighting.h
intern/cycles/kernel/split/kernel_do_volume.h
intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
intern/cycles/kernel/split/kernel_indirect_background.h
intern/cycles/kernel/split/kernel_indirect_subsurface.h
intern/cycles/kernel/split/kernel_next_iteration_setup.h
intern/cycles/kernel/split/kernel_shader_eval.h
intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
intern/cycles/kernel/split/kernel_split_common.h
intern/cycles/kernel/split/kernel_split_data_types.h
intern/cycles/kernel/split/kernel_subsurface_scatter.h

index 19ab905..ab1fe89 100644 (file)
@@ -78,7 +78,7 @@ def use_cuda(context):
 def use_branched_path(context):
     cscene = context.scene.cycles
 
-    return (cscene.progressive == 'BRANCHED_PATH' and not use_opencl(context))
+    return (cscene.progressive == 'BRANCHED_PATH')
 
 
 def use_sample_all_lights(context):
@@ -156,7 +156,6 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
 
         row = layout.row()
         sub = row.row()
-        sub.active = get_device_type(context) != 'OPENCL' or use_cpu(context)
         sub.prop(cscene, "progressive", text="")
         row.prop(cscene, "use_square_samples")
 
index 71d52bb..abd26f5 100644 (file)
@@ -240,6 +240,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
                                ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
                                ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
                                ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size);
+                               ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
                                ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
                                ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size);
                                ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size);
index 3750225..a92e8bc 100644 (file)
@@ -235,6 +235,7 @@ set(SRC_UTIL_HEADERS
 )
 
 set(SRC_SPLIT_HEADERS
+       split/kernel_branched.h
        split/kernel_buffer_update.h
        split/kernel_data_init.h
        split/kernel_direct_lighting.h
index e795704..58da141 100644 (file)
@@ -58,7 +58,7 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
                                         ShaderData *sd,
                                         ShaderData *emission_sd,
                                         PathRadiance *L,
-                                        PathState *state,
+                                        ccl_addr_space PathState *state,
                                         RNG *rng,
                                         float3 throughput,
                                         float3 ao_alpha)
@@ -98,6 +98,8 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
        }
 }
 
+#ifndef __SPLIT_KERNEL__
+
 ccl_device void kernel_path_indirect(KernelGlobals *kg,
                                      ShaderData *sd,
                                      ShaderData *emission_sd,
@@ -818,5 +820,7 @@ ccl_device void kernel_path_trace(KernelGlobals *kg,
        path_rng_end(kg, rng_state, rng);
 }
 
+#endif  /* __SPLIT_KERNEL__ */
+
 CCL_NAMESPACE_END
 
index 085eb42..ddcb571 100644 (file)
@@ -22,7 +22,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
                                                ShaderData *sd,
                                                ShaderData *emission_sd,
                                                PathRadiance *L,
-                                               PathState *state,
+                                               ccl_addr_space PathState *state,
                                                RNG *rng,
                                                float3 throughput)
 {
@@ -65,6 +65,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
        }
 }
 
+#ifndef __SPLIT_KERNEL__
 
 /* bounce off surface and integrate indirect light */
 ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
@@ -648,6 +649,8 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
        path_rng_end(kg, rng_state, rng);
 }
 
+#endif  /* __SPLIT_KERNEL__ */
+
 #endif  /* __BRANCHED_PATH__ */
 
 CCL_NAMESPACE_END
index 076c82f..bd4ba77 100644 (file)
@@ -155,7 +155,7 @@ ccl_device bool kernel_branched_path_surface_bounce(
         ccl_addr_space float3 *throughput,
         ccl_addr_space PathState *state,
         PathRadiance *L,
-        Ray *ray)
+        ccl_addr_space Ray *ray)
 {
        /* sample BSDF */
        float bsdf_pdf;
index baf6293..274713a 100644 (file)
@@ -417,9 +417,8 @@ ccl_device_noinline void subsurface_scatter_multi_setup(
        subsurface_scatter_setup_diffuse_bsdf(sd, sc, weight, true, N);
 }
 
-#ifndef __SPLIT_KERNEL__
 /* subsurface scattering step, from a point on the surface to another nearby point on the same object */
-ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathState *state,
+ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_global PathState *state,
        int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all)
 {
        float3 eval = make_float3(0.0f, 0.0f, 0.0f);
@@ -507,7 +506,6 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS
        /* setup diffuse bsdf */
        subsurface_scatter_setup_diffuse_bsdf(sd, sc, eval, (ss_isect.num_hits > 0), N);
 }
-#endif /* ! __SPLIT_KERNEL__ */
 
 CCL_NAMESPACE_END
 
index b1269cd..6417f62 100644 (file)
@@ -71,22 +71,18 @@ CCL_NAMESPACE_BEGIN
 #  endif
 #  define __KERNEL_SHADING__
 #  define __KERNEL_ADV_SHADING__
-#  ifndef __SPLIT_KERNEL__
-#    define __BRANCHED_PATH__
-#  endif
+#  define __BRANCHED_PATH__
 #  ifdef WITH_OSL
 #    define __OSL__
 #  endif
-#  define __SUBSURFACE__
 #  define __PRINCIPLED__
+#  define __SUBSURFACE__
 #  define __CMJ__
 #  define __VOLUME__
 #  define __VOLUME_SCATTER__
 #  define __SHADOW_RECORD_ALL__
-#  ifndef __SPLIT_KERNEL__
-#    define __VOLUME_DECOUPLED__
-#    define __VOLUME_RECORD_ALL__
-#  endif
+#  define __VOLUME_DECOUPLED__
+#  define __VOLUME_RECORD_ALL__
 #endif  /* __KERNEL_CPU__ */
 
 #ifdef __KERNEL_CUDA__
@@ -138,6 +134,7 @@ CCL_NAMESPACE_BEGIN
 #    define __VOLUME_SCATTER__
 #    define __SHADOW_RECORD_ALL__
 #    define __CMJ__
+#    define __BRANCHED_PATH__
 #  endif  /* __KERNEL_OPENCL_AMD__ */
 
 #  ifdef __KERNEL_OPENCL_INTEL_CPU__
@@ -1300,7 +1297,6 @@ typedef ccl_addr_space struct DebugData {
  * Queue 3 - Shadow ray cast kernel - AO
  * Queeu 4 - Shadow ray cast kernel - direct lighting
  */
-#define NUM_QUEUES 4
 
 /* Queue names */
 enum QueueNumber {
@@ -1313,22 +1309,37 @@ enum QueueNumber {
         * 3. Rays to be regenerated
         * are enqueued here.
         */
-       QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS = 1,
+       QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
 
        /* All rays for which a shadow ray should be cast to determine radiance
         * contribution for AO are enqueued here.
         */
-       QUEUE_SHADOW_RAY_CAST_AO_RAYS = 2,
+       QUEUE_SHADOW_RAY_CAST_AO_RAYS,
 
        /* All rays for which a shadow ray should be cast to determine radiance
         * contributing for direct lighting are enqueued here.
         */
-       QUEUE_SHADOW_RAY_CAST_DL_RAYS = 3,
+       QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+
+#ifdef __BRANCHED_PATH__
+       /* All rays moving to next iteration of the indirect loop for light */
+       QUEUE_LIGHT_INDIRECT_ITER,
+#  ifdef __VOLUME__
+       /* All rays moving to next iteration of the indirect loop for volumes */
+       QUEUE_VOLUME_INDIRECT_ITER,
+#  endif
+#  ifdef __SUBSURFACE__
+       /* All rays moving to next iteration of the indirect loop for subsurface */
+       QUEUE_SUBSURFACE_INDIRECT_ITER,
+#  endif
+#endif  /* __BRANCHED_PATH__ */
+
+       NUM_QUEUES
 };
 
-/* We use RAY_STATE_MASK to get ray_state (enums 0 to 5) */
-#define RAY_STATE_MASK 0x007
-#define RAY_FLAG_MASK 0x0F8
+/* We use RAY_STATE_MASK to get ray_state */
+#define RAY_STATE_MASK 0x0F
+#define RAY_FLAG_MASK 0xF0
 enum RayState {
        RAY_INVALID = 0,
        /* Denotes ray is actively involved in path-iteration. */
@@ -1343,14 +1354,22 @@ enum RayState {
        RAY_TO_REGENERATE,
        /* Denotes ray has been regenerated */
        RAY_REGENERATED,
-       /* Flag's ray has to execute shadow blocked function in AO part */
-       RAY_SHADOW_RAY_CAST_AO = 16,
-       /* Flag's ray has to execute shadow blocked function in direct lighting part. */
-       RAY_SHADOW_RAY_CAST_DL = 32,
+       /* Denotes ray is moving to next iteration of the branched indirect loop */
+       RAY_LIGHT_INDIRECT_NEXT_ITER,
+       RAY_VOLUME_INDIRECT_NEXT_ITER,
+       RAY_SUBSURFACE_INDIRECT_NEXT_ITER,
+
+       /* Ray flags */
+
+       /* Flags to denote that the ray is currently evaluating the branched indirect loop */
+       RAY_BRANCHED_LIGHT_INDIRECT = (1 << 4),
+       RAY_BRANCHED_VOLUME_INDIRECT = (1 << 5),
+       RAY_BRANCHED_SUBSURFACE_INDIRECT = (1 << 6),
+       RAY_BRANCHED_INDIRECT = (RAY_BRANCHED_LIGHT_INDIRECT | RAY_BRANCHED_VOLUME_INDIRECT | RAY_BRANCHED_SUBSURFACE_INDIRECT),
 };
 
 #define ASSIGN_RAY_STATE(ray_state, ray_index, state) (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state))
-#define IS_STATE(ray_state, ray_index, state) ((ray_state[ray_index] & RAY_STATE_MASK) == state)
+#define IS_STATE(ray_state, ray_index, state) ((ray_index) != QUEUE_EMPTY_SLOT && ((ray_state)[(ray_index)] & RAY_STATE_MASK) == (state))
 #define ADD_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] | flag))
 #define REMOVE_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] & (~flag)))
 #define IS_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] & flag)
index 148b2ee..96f54bb 100644 (file)
@@ -183,7 +183,7 @@ DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
 DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
 DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
 DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
index a679eff..585b918 100644 (file)
@@ -110,7 +110,7 @@ DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
 DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
 DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
 DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
index 7a1838e..99b74a1 100644 (file)
@@ -22,6 +22,5 @@ __kernel void kernel_ocl_path_trace_subsurface_scatter(
         ccl_global char *kg,
         ccl_constant KernelData *data)
 {
-       ccl_local unsigned int local_queue_atomics;
-       kernel_subsurface_scatter((KernelGlobals*)kg, &local_queue_atomics);
+       kernel_subsurface_scatter((KernelGlobals*)kg);
 }
diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h
new file mode 100644 (file)
index 0000000..c7bc1b4
--- /dev/null
@@ -0,0 +1,150 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __BRANCHED_PATH__
+
+/* sets up the various state needed to do an indirect loop */
+ccl_device_inline void kernel_split_branched_path_indirect_loop_init(KernelGlobals *kg, int ray_index)
+{
+       SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+       /* save a copy of the state to restore later */
+#define BRANCHED_STORE(name) \
+               branched_state->name = kernel_split_state.name[ray_index];
+
+       BRANCHED_STORE(path_state);
+       BRANCHED_STORE(throughput);
+       BRANCHED_STORE(ray);
+       BRANCHED_STORE(sd);
+       BRANCHED_STORE(isect);
+       BRANCHED_STORE(ray_state);
+
+#undef BRANCHED_STORE
+
+       /* set loop counters to intial position */
+       branched_state->next_closure = 0;
+       branched_state->next_sample = 0;
+}
+
+/* ends an indirect loop and restores the previous state */
+ccl_device_inline void kernel_split_branched_path_indirect_loop_end(KernelGlobals *kg, int ray_index)
+{
+       SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+       /* restore state */
+#define BRANCHED_RESTORE(name) \
+               kernel_split_state.name[ray_index] = branched_state->name;
+
+       BRANCHED_RESTORE(path_state);
+       BRANCHED_RESTORE(throughput);
+       BRANCHED_RESTORE(ray);
+       BRANCHED_RESTORE(sd);
+       BRANCHED_RESTORE(isect);
+       BRANCHED_RESTORE(ray_state);
+
+#undef BRANCHED_RESTORE
+
+       /* leave indirect loop */
+       REMOVE_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT);
+}
+
+/* bounce off surface and integrate indirect light */
+ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(KernelGlobals *kg,
+                                                                                int ray_index,
+                                                                                float num_samples_adjust,
+                                                                                ShaderData *saved_sd,
+                                                                                bool reset_path_state)
+{
+       SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+       ShaderData *sd = saved_sd;
+       RNG rng = kernel_split_state.rng[ray_index];
+       PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+       float3 throughput = branched_state->throughput;
+
+       for(int i = branched_state->next_closure; i < sd->num_closure; i++) {
+               const ShaderClosure *sc = &sd->closure[i];
+
+               if(!CLOSURE_IS_BSDF(sc->type))
+                       continue;
+               /* transparency is not handled here, but in outer loop */
+               if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
+                       continue;
+
+               int num_samples;
+
+               if(CLOSURE_IS_BSDF_DIFFUSE(sc->type))
+                       num_samples = kernel_data.integrator.diffuse_samples;
+               else if(CLOSURE_IS_BSDF_BSSRDF(sc->type))
+                       num_samples = 1;
+               else if(CLOSURE_IS_BSDF_GLOSSY(sc->type))
+                       num_samples = kernel_data.integrator.glossy_samples;
+               else
+                       num_samples = kernel_data.integrator.transmission_samples;
+
+               num_samples = ceil_to_int(num_samples_adjust*num_samples);
+
+               float num_samples_inv = num_samples_adjust/num_samples;
+               RNG bsdf_rng = cmj_hash(rng, i);
+
+               for(int j = branched_state->next_sample; j < num_samples; j++) {
+                       ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
+                       if(reset_path_state) {
+                               *ps = branched_state->path_state;
+                       }
+
+                       ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
+                       *tp = throughput;
+
+                       ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index];
+
+                       if(!kernel_branched_path_surface_bounce(kg,
+                                                               &bsdf_rng,
+                                                               sd,
+                                                               sc,
+                                                               j,
+                                                               num_samples,
+                                                               tp,
+                                                               ps,
+                                                               L,
+                                                               bsdf_ray))
+                       {
+                               continue;
+                       }
+
+                       /* update state for next iteration */
+                       branched_state->next_closure = i;
+                       branched_state->next_sample = j+1;
+                       branched_state->num_samples = num_samples;
+
+                       /* start the indirect path */
+                       *tp *= num_samples_inv;
+
+                       return true;
+               }
+
+               branched_state->next_sample = 0;
+       }
+
+       return false;
+}
+
+#endif  /* __BRANCHED_PATH__ */
+
+CCL_NAMESPACE_END
+
index 9d3d01f..642ccac 100644 (file)
@@ -105,21 +105,16 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
 
        /* Initialize queue data and queue index. */
        if(thread_index < queuesize) {
-               /* Initialize active ray queue. */
-               kernel_split_state.queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
-               /* Initialize background and buffer update queue. */
-               kernel_split_state.queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
-               /* Initialize shadow ray cast of AO queue. */
-               kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
-               /* Initialize shadow ray cast of direct lighting queue. */
-               kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+               for(int i = 0; i < NUM_QUEUES; i++) {
+                       kernel_split_state.queue_data[i * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+               }
        }
 
        if(thread_index == 0) {
-               Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
-               Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
-               Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
-               Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+               for(int i = 0; i < NUM_QUEUES; i++) {
+                       Queue_index[i] = 0;
+               }
+
                /* The scene-intersect kernel should not use the queues very first time.
                 * since the queue would be empty.
                 */
index bdbf738..3336c96 100644 (file)
@@ -56,23 +56,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
                                  kernel_split_params.queue_size,
                                  0);
 
-#ifdef __COMPUTE_DEVICE_GPU__
-       /* If we are executing on a GPU device, we exit all threads that are not
-        * required.
-        *
-        * If we are executing on a CPU device, then we need to keep all threads
-        * active since we have barrier() calls later in the kernel. CPU devices,
-        * expect all threads to execute barrier statement.
-        */
-       if(ray_index == QUEUE_EMPTY_SLOT) {
-               return;
-       }
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
-       if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-
        if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
                ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
                ShaderData *sd = &kernel_split_state.sd[ray_index];
@@ -80,25 +63,24 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
                /* direct lighting */
 #ifdef __EMISSION__
                RNG rng = kernel_split_state.rng[ray_index];
+
                bool flag = (kernel_data.integrator.use_direct_light &&
                             (sd->flag & SD_BSDF_HAS_EVAL));
+
+#  ifdef __BRANCHED_PATH__
+               if(flag && kernel_data.integrator.branched) {
+                       flag = false;
+                       enqueue_flag = 1;
+               }
+#  endif  /* __BRANCHED_PATH__ */
+
 #  ifdef __SHADOW_TRICKS__
                if(flag && state->flag & PATH_RAY_SHADOW_CATCHER) {
                        flag = false;
-                       ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
-                       float3 throughput = kernel_split_state.throughput[ray_index];
-                       PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-                       kernel_branched_path_surface_connect_light(kg,
-                                                                  &rng,
-                                                                  sd,
-                                                                  emission_sd,
-                                                                  state,
-                                                                  throughput,
-                                                                  1.0f,
-                                                                  L,
-                                                                  1);
+                       enqueue_flag = 1;
                }
 #  endif  /* __SHADOW_TRICKS__ */
+
                if(flag) {
                        /* Sample illumination from lights to find path contribution. */
                        float light_t = path_state_rng_1D(kg, &rng, state, PRNG_LIGHT);
@@ -129,7 +111,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
                                        kernel_split_state.bsdf_eval[ray_index] = L_light;
                                        kernel_split_state.is_lamp[ray_index] = is_lamp;
                                        /* Mark ray state for next shadow kernel. */
-                                       ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
                                        enqueue_flag = 1;
                                }
                        }
@@ -138,10 +119,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
 #endif  /* __EMISSION__ */
        }
 
-#ifndef __COMPUTE_DEVICE_GPU__
-       }
-#endif
-
 #ifdef __EMISSION__
        /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
        enqueue_ray_index_local(ray_index,
@@ -152,6 +129,27 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
                                kernel_split_state.queue_data,
                                kernel_split_params.queue_index);
 #endif
+
+#ifdef __BRANCHED_PATH__
+       /* Enqueue RAY_LIGHT_INDIRECT_NEXT_ITER rays
+        * this is the last kernel before next_iteration_setup that uses local atomics so we do this here
+        */
+       ccl_barrier(CCL_LOCAL_MEM_FENCE);
+       if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+               *local_queue_atomics = 0;
+       }
+       ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+       ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+       enqueue_ray_index_local(ray_index,
+                               QUEUE_LIGHT_INDIRECT_ITER,
+                               IS_STATE(kernel_split_state.ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER),
+                               kernel_split_params.queue_size,
+                               local_queue_atomics,
+                               kernel_split_state.queue_data,
+                               kernel_split_params.queue_index);
+
+#endif  /* __BRANCHED_PATH__ */
 }
 
 CCL_NAMESPACE_END
index 47d3c28..182e6c6 100644 (file)
 
 CCL_NAMESPACE_BEGIN
 
+#if defined(__BRANCHED_PATH__) && defined(__VOLUME__)
+
+ccl_device_inline void kernel_split_branched_path_volume_indirect_light_init(KernelGlobals *kg, int ray_index)
+{
+       kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+
+       ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT);
+}
+
+ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(KernelGlobals *kg, int ray_index)
+{
+       SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+       ShaderData *sd = &kernel_split_state.sd[ray_index];
+       RNG rng = kernel_split_state.rng[ray_index];
+       PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+       ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
+       /* GPU: no decoupled ray marching, scatter probalistically */
+       int num_samples = kernel_data.integrator.volume_samples;
+       float num_samples_inv = 1.0f/num_samples;
+
+       Ray volume_ray = branched_state->ray;
+       volume_ray.t = (!IS_STATE(&branched_state->ray_state, 0, RAY_HIT_BACKGROUND)) ? branched_state->isect.t : FLT_MAX;
+
+       bool heterogeneous = volume_stack_is_heterogeneous(kg, branched_state->path_state.volume_stack);
+
+       for(int j = branched_state->next_sample; j < num_samples; j++) {
+               ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
+               *ps = branched_state->path_state;
+
+               ccl_global Ray *pray = &kernel_split_state.ray[ray_index];
+               *pray = branched_state->ray;
+
+               ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
+               *tp = branched_state->throughput * num_samples_inv;
+
+               /* branch RNG state */
+               path_state_branch(ps, j, num_samples);
+
+               /* integrate along volume segment with distance sampling */
+               VolumeIntegrateResult result = kernel_volume_integrate(
+                       kg, ps, sd, &volume_ray, L, tp, &rng, heterogeneous);
+
+#  ifdef __VOLUME_SCATTER__
+               if(result == VOLUME_PATH_SCATTERED) {
+                       /* direct lighting */
+                       kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *tp, &branched_state->path_state, L);
+
+                       /* indirect light bounce */
+                       if(!kernel_path_volume_bounce(kg, &rng, sd, tp, ps, L, pray)) {
+                               continue;
+                       }
+
+                       /* start the indirect path */
+                       branched_state->next_closure = 0;
+                       branched_state->next_sample = j+1;
+                       branched_state->num_samples = num_samples;
+
+                       return true;
+               }
+#  endif
+       }
+
+       kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+
+       /* todo: avoid this calculation using decoupled ray marching */
+       float3 throughput = kernel_split_state.throughput[ray_index];
+       kernel_volume_shadow(kg, emission_sd, &kernel_split_state.path_state[ray_index], &volume_ray, &throughput);
+       kernel_split_state.throughput[ray_index] = throughput;
+
+       return false;
+}
+
+#endif  /* __BRANCHED_PATH__ && __VOLUME__ */
 
 ccl_device void kernel_do_volume(KernelGlobals *kg)
 {
@@ -23,37 +98,37 @@ ccl_device void kernel_do_volume(KernelGlobals *kg)
        /* We will empty this queue in this kernel. */
        if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
                kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+#  ifdef __BRANCHED_PATH__
+               kernel_split_params.queue_index[QUEUE_VOLUME_INDIRECT_ITER] = 0;
+#  endif  /* __BRANCHED_PATH__ */
        }
-       /* Fetch use_queues_flag. */
-       char local_use_queues_flag = *kernel_split_params.use_queues_flag;
-       ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
        int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
-       if(local_use_queues_flag) {
+
+       if(*kernel_split_params.use_queues_flag) {
                ray_index = get_ray_index(kg, ray_index,
                                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
                                          kernel_split_state.queue_data,
                                          kernel_split_params.queue_size,
                                          1);
-               if(ray_index == QUEUE_EMPTY_SLOT) {
-                       return;
-               }
        }
 
-       if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
-          IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+       ccl_global char *ray_state = kernel_split_state.ray_state;
 
-               bool hit = ! IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND);
+       PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+       ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
 
-               PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-               ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+       ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+       ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+       RNG rng = kernel_split_state.rng[ray_index];
+       ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
+       ShaderData *sd = &kernel_split_state.sd[ray_index];
+       ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
 
-               ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
-               ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
-               RNG rng = kernel_split_state.rng[ray_index];
-               ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
-               ShaderData *sd = &kernel_split_state.sd[ray_index];
-               ShaderData *sd_input = &kernel_split_state.sd_DL_shadow[ray_index];
+       if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) ||
+          IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+
+               bool hit = ! IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
 
                /* Sanitize volume stack. */
                if(!hit) {
@@ -64,31 +139,68 @@ ccl_device void kernel_do_volume(KernelGlobals *kg)
                        Ray volume_ray = *ray;
                        volume_ray.t = (hit)? isect->t: FLT_MAX;
 
-                       bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+#  ifdef __BRANCHED_PATH__
+                       if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+#  endif  /* __BRANCHED_PATH__ */
+                               bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
 
-                       {
-                               /* integrate along volume segment with distance sampling */
-                               VolumeIntegrateResult result = kernel_volume_integrate(
-                                       kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous);
+                               {
+                                       /* integrate along volume segment with distance sampling */
+                                       VolumeIntegrateResult result = kernel_volume_integrate(
+                                               kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous);
 
 #  ifdef __VOLUME_SCATTER__
-                               if(result == VOLUME_PATH_SCATTERED) {
-                                       /* direct lighting */
-                                       kernel_path_volume_connect_light(kg, &rng, sd, sd_input, *throughput, state, L);
-
-                                       /* indirect light bounce */
-                                       if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray))
-                                               ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED);
-                                       else
-                                               ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER);
+                                       if(result == VOLUME_PATH_SCATTERED) {
+                                               /* direct lighting */
+                                               kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *throughput, state, L);
+
+                                               /* indirect light bounce */
+                                               if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray)) {
+                                                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+                                               }
+                                               else {
+                                                       kernel_split_path_end(kg, ray_index);
+                                               }
+                                       }
+#  endif  /* __VOLUME_SCATTER__ */
+                               }
+
+#  ifdef __BRANCHED_PATH__
+                       }
+                       else {
+                               kernel_split_branched_path_volume_indirect_light_init(kg, ray_index);
+
+                               if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
+                                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
                                }
-#  endif
                        }
+#  endif  /* __BRANCHED_PATH__ */
                }
+
                kernel_split_state.rng[ray_index] = rng;
        }
 
-#endif
+#  ifdef __BRANCHED_PATH__
+       /* iter loop */
+       ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+                                 QUEUE_VOLUME_INDIRECT_ITER,
+                                 kernel_split_state.queue_data,
+                                 kernel_split_params.queue_size,
+                                 1);
+
+       if(IS_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER)) {
+               /* for render passes, sum and reset indirect light pass variables
+                * for the next samples */
+               path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
+               path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
+
+               if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
+                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+               }
+       }
+#  endif  /* __BRANCHED_PATH__ */
+
+#endif  /* __VOLUME__ */
 }
 
 
index 89adeb6..8749891 100644 (file)
@@ -52,6 +52,7 @@ CCL_NAMESPACE_BEGIN
  *   - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with
  *     flag RAY_SHADOW_RAY_CAST_AO
  */
+
 ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
         KernelGlobals *kg,
         ccl_local_param BackgroundAOLocals *locals)
@@ -62,8 +63,9 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
        }
        ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
+#ifdef __AO__
        char enqueue_flag = 0;
-       char enqueue_flag_AO_SHADOW_RAY_CAST = 0;
+#endif
        int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
        ray_index = get_ray_index(kg, ray_index,
                                  QUEUE_ACTIVE_AND_REGENERATED_RAYS,
@@ -155,8 +157,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
                                kernel_split_state.L_transparent[ray_index] += average(holdout_weight*throughput);
                        }
                        if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
-                               ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-                               enqueue_flag = 1;
+                               kernel_split_path_end(kg, ray_index);
                        }
                }
 #endif  /* __HOLDOUT__ */
@@ -164,18 +165,31 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
 
        if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
                PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-               /* Holdout mask objects do not write data passes. */
-               kernel_write_data_passes(kg,
-                                        buffer,
-                                        L,
-                                        sd,
-                                        sample,
-                                        state,
-                                        throughput);
+
+#ifdef __BRANCHED_PATH__
+               if(!IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT))
+#endif  /* __BRANCHED_PATH__ */
+               {
+                       /* Holdout mask objects do not write data passes. */
+                       kernel_write_data_passes(kg,
+                                                    buffer,
+                                                    L,
+                                                    sd,
+                                                    sample,
+                                                    state,
+                                                    throughput);
+               }
+
                /* Blurring of bsdf after bounces, for rays that have a small likelihood
                 * of following this particular path (diffuse, rough glossy.
                 */
-               if(kernel_data.integrator.filter_glossy != FLT_MAX) {
+#ifndef __BRANCHED_PATH__
+               if(kernel_data.integrator.filter_glossy != FLT_MAX)
+#else
+               if(kernel_data.integrator.filter_glossy != FLT_MAX &&
+                  (!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)))
+#endif  /* __BRANCHED_PATH__ */
+               {
                        float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
                        if(blur_pdf < 1.0f) {
                                float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
@@ -201,19 +215,32 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
                 * mainly due to the mixed in MIS that we use. gives too many unneeded
                 * shader evaluations, only need emission if we are going to terminate.
                 */
+#ifndef __BRANCHED_PATH__
                float probability = path_state_terminate_probability(kg, state, throughput);
+#else
+               float probability = 1.0f;
+
+               if(!kernel_data.integrator.branched) {
+                       probability = path_state_terminate_probability(kg, state, throughput);
+               }
+               else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+                       int num_samples = kernel_split_state.branched_state[ray_index].num_samples;
+                       probability = path_state_terminate_probability(kg, state, throughput*num_samples);
+               }
+               else if(state->flag & PATH_RAY_TRANSPARENT) {
+                       probability = path_state_terminate_probability(kg, state, throughput);
+               }
+#endif
 
                if(probability == 0.0f) {
-                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-                       enqueue_flag = 1;
+                       kernel_split_path_end(kg, ray_index);
                }
 
                if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
                        if(probability != 1.0f) {
                                float terminate = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_TERMINATE);
                                if(terminate >= probability) {
-                                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-                                       enqueue_flag = 1;
+                                       kernel_split_path_end(kg, ray_index);
                                }
                                else {
                                        kernel_split_state.throughput[ray_index] = throughput/probability;
@@ -225,61 +252,23 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
 #ifdef __AO__
        if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
                /* ambient occlusion */
-               if(kernel_data.integrator.use_ambient_occlusion ||
-                  (sd->flag & SD_AO))
-               {
-                       /* todo: solve correlation */
-                       float bsdf_u, bsdf_v;
-                       path_state_rng_2D(kg, &rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
-                       float ao_factor = kernel_data.background.ao_factor;
-                       float3 ao_N;
-                       kernel_split_state.ao_bsdf[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
-                       kernel_split_state.ao_alpha[ray_index] = shader_bsdf_alpha(kg, sd);
-
-                       float3 ao_D;
-                       float ao_pdf;
-                       sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
-                       if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
-                               Ray _ray;
-                               _ray.P = ray_offset(sd->P, sd->Ng);
-                               _ray.D = ao_D;
-                               _ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
-                               _ray.time = sd->time;
-#endif
-                               _ray.dP = sd->dP;
-                               _ray.dD = differential3_zero();
-                               kernel_split_state.ao_light_ray[ray_index] = _ray;
-
-                               ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
-                               enqueue_flag_AO_SHADOW_RAY_CAST = 1;
-                       }
+               if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
+                       enqueue_flag = 1;
                }
        }
 #endif  /* __AO__ */
-       kernel_split_state.rng[ray_index] = rng;
 
+       kernel_split_state.rng[ray_index] = rng;
 
 #ifndef __COMPUTE_DEVICE_GPU__
        }
 #endif
 
-       /* Enqueue RAY_UPDATE_BUFFER rays. */
-       enqueue_ray_index_local(ray_index,
-                               QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-                               enqueue_flag,
-                               kernel_split_params.queue_size,
-                               &locals->queue_atomics_bg,
-                               kernel_split_state.queue_data,
-                               kernel_split_params.queue_index);
-
 #ifdef __AO__
        /* Enqueue to-shadow-ray-cast rays. */
        enqueue_ray_index_local(ray_index,
                                QUEUE_SHADOW_RAY_CAST_AO_RAYS,
-                               enqueue_flag_AO_SHADOW_RAY_CAST,
+                               enqueue_flag,
                                kernel_split_params.queue_size,
                                &locals->queue_atomics_ao,
                                kernel_split_state.queue_data,
index 8192528..6fbc888 100644 (file)
@@ -34,7 +34,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
                        if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
                                ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
                                if(state->bounce > kernel_data.integrator.ao_bounces) {
-                                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+                                       kernel_split_path_end(kg, ray_index);
                                }
                        }
                }
@@ -63,7 +63,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
 #ifdef __PASSES__
                        if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
 #endif
-                               ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+                               kernel_split_path_end(kg, ray_index);
                }
 
                if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
@@ -72,7 +72,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
                        float3 L_background = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray);
                        path_radiance_accum_background(L, state, (*throughput), L_background);
 #endif
-                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+                       kernel_split_path_end(kg, ray_index);
                }
        }
 
index a56e85a..82bc2f0 100644 (file)
@@ -49,26 +49,29 @@ ccl_device void kernel_indirect_subsurface(KernelGlobals *kg)
        ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
        ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
 
-       if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
-               ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
-               kernel_path_subsurface_accum_indirect(ss_indirect, L);
+#ifdef __BRANCHED_PATH__
+       if(!kernel_data.integrator.branched) {
+#endif
+               if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+                       ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
+                       kernel_path_subsurface_accum_indirect(ss_indirect, L);
 
-               /* Trace indirect subsurface rays by restarting the loop. this uses less
-                * stack memory than invoking kernel_path_indirect.
-                */
-               if(ss_indirect->num_rays) {
-                       kernel_path_subsurface_setup_indirect(kg,
-                                                             ss_indirect,
-                                                             state,
-                                                             ray,
-                                                             L,
-                                                             throughput);
-                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
-               }
-               else {
-                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+                       /* Trace indirect subsurface rays by restarting the loop. this uses less
+                        * stack memory than invoking kernel_path_indirect.
+                        */
+                       if(ss_indirect->num_rays) {
+                               kernel_path_subsurface_setup_indirect(kg,
+                                                                         ss_indirect,
+                                                                         state,
+                                                                         ray,
+                                                                         L,
+                                                                         throughput);
+                               ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+                       }
                }
+#ifdef __BRANCHED_PATH__
        }
+#endif
 
 #endif  /* __SUBSURFACE__ */
 
index 1bebc16..71017fe 100644 (file)
@@ -44,6 +44,52 @@ CCL_NAMESPACE_BEGIN
  *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
  *     RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays.
  */
+
+#ifdef __BRANCHED_PATH__
+ccl_device_inline void kernel_split_branched_indirect_light_init(KernelGlobals *kg, int ray_index)
+{
+       kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+
+       ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT);
+}
+
+ccl_device void kernel_split_branched_indirect_light_end(KernelGlobals *kg, int ray_index)
+{
+       kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+
+       ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+       ShaderData *sd = &kernel_split_state.sd[ray_index];
+       ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+       ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+
+       /* continue in case of transparency */
+       *throughput *= shader_bsdf_transparency(kg, sd);
+
+       if(is_zero(*throughput)) {
+               kernel_split_path_end(kg, ray_index);
+       }
+       else {
+               /* Update Path State */
+               state->flag |= PATH_RAY_TRANSPARENT;
+               state->transparent_bounce++;
+
+               ray->P = ray_offset(sd->P, -sd->Ng);
+               ray->t -= sd->ray_length; /* clipping works through transparent */
+
+#  ifdef __RAY_DIFFERENTIALS__
+               ray->dP = sd->dP;
+               ray->dD.dx = -sd->dI.dx;
+               ray->dD.dy = -sd->dI.dy;
+#  endif  /* __RAY_DIFFERENTIALS__ */
+
+#  ifdef __VOLUME__
+               /* enter/exit volume */
+               kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
+#  endif  /* __VOLUME__ */
+       }
+}
+#endif  /* __BRANCHED_PATH__ */
+
 ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
                                             ccl_local_param unsigned int *local_queue_atomics)
 {
@@ -67,7 +113,6 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
                kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
        }
 
-       char enqueue_flag = 0;
        int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
        ray_index = get_ray_index(kg, ray_index,
                                  QUEUE_ACTIVE_AND_REGENERATED_RAYS,
@@ -75,102 +120,125 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
                                  kernel_split_params.queue_size,
                                  0);
 
-#ifdef __COMPUTE_DEVICE_GPU__
-       /* If we are executing on a GPU device, we exit all threads that are not
-        * required.
-        *
-        * If we are executing on a CPU device, then we need to keep all threads
-        * active since we have barrier() calls later in the kernel. CPU devices,
-        * expect all threads to execute barrier statement.
-        */
-       if(ray_index == QUEUE_EMPTY_SLOT) {
-               return;
-       }
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
-       if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-
-       /* Load ShaderData structure. */
-       PathRadiance *L = NULL;
-       ccl_global PathState *state = NULL;
        ccl_global char *ray_state = kernel_split_state.ray_state;
 
-       /* Path radiance update for AO/Direct_lighting's shadow blocked. */
-       if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) ||
-          IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO))
-       {
-               state = &kernel_split_state.path_state[ray_index];
-               L = &kernel_split_state.path_radiance[ray_index];
-               float3 _throughput = kernel_split_state.throughput[ray_index];
-
-               if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
-                       float3 shadow = kernel_split_state.ao_light_ray[ray_index].P;
-                       // TODO(mai): investigate correctness here
-                       char update_path_radiance = (char)kernel_split_state.ao_light_ray[ray_index].t;
-                       if(update_path_radiance) {
-                               path_radiance_accum_ao(L,
-                                                      _throughput,
-                                                      kernel_split_state.ao_alpha[ray_index],
-                                                      kernel_split_state.ao_bsdf[ray_index],
-                                                      shadow,
-                                                      state->bounce);
-                       }
-                       else {
-                               path_radiance_accum_total_ao(L, _throughput, kernel_split_state.ao_bsdf[ray_index]);
+       bool active = IS_STATE(ray_state, ray_index, RAY_ACTIVE);
+       if(active) {
+               ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+               ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+               RNG rng = kernel_split_state.rng[ray_index];
+               ShaderData *sd = &kernel_split_state.sd[ray_index];
+               ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+               PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+
+#ifdef __BRANCHED_PATH__
+               if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+#endif
+                       /* Compute direct lighting and next bounce. */
+                       if(!kernel_path_surface_bounce(kg, &rng, sd, throughput, state, L, ray)) {
+                               kernel_split_path_end(kg, ray_index);
                        }
-                       REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
+#ifdef __BRANCHED_PATH__
                }
-
-               if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
-                       float3 shadow = kernel_split_state.light_ray[ray_index].P;
-                       // TODO(mai): investigate correctness here
-                       char update_path_radiance = (char)kernel_split_state.light_ray[ray_index].t;
-                       BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
-                       if(update_path_radiance) {
-                               path_radiance_accum_light(L,
-                                                         _throughput,
-                                                         &L_light,
-                                                         shadow,
-                                                         1.0f,
-                                                         state->bounce,
-                                                         kernel_split_state.is_lamp[ray_index]);
+               else {
+                       kernel_split_branched_indirect_light_init(kg, ray_index);
+
+                       if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+                                                                                 ray_index,
+                                                                                 1.0f,
+                                                                                 &kernel_split_state.branched_state[ray_index].sd,
+                                                                                 true))
+                       {
+                               ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
                        }
                        else {
-                               path_radiance_accum_total_light(L, _throughput, &L_light);
+                               kernel_split_branched_indirect_light_end(kg, ray_index);
                        }
-                       REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
                }
+#endif  /* __BRANCHED_PATH__ */
+
+               kernel_split_state.rng[ray_index] = rng;
        }
 
-       if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-               ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
-               ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
-               RNG rng = kernel_split_state.rng[ray_index];
-               state = &kernel_split_state.path_state[ray_index];
-               L = &kernel_split_state.path_radiance[ray_index];
+       /* Enqueue RAY_UPDATE_BUFFER rays. */
+       enqueue_ray_index_local(ray_index,
+                               QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+                               IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER) && active,
+                               kernel_split_params.queue_size,
+                               local_queue_atomics,
+                               kernel_split_state.queue_data,
+                               kernel_split_params.queue_index);
+
+#ifdef __BRANCHED_PATH__
+       /* iter loop */
+       if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+               kernel_split_params.queue_index[QUEUE_LIGHT_INDIRECT_ITER] = 0;
+       }
 
-               /* Compute direct lighting and next bounce. */
-               if(!kernel_path_surface_bounce(kg, &rng, &kernel_split_state.sd[ray_index], throughput, state, L, ray)) {
-                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-                       enqueue_flag = 1;
+       ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+                                 QUEUE_LIGHT_INDIRECT_ITER,
+                                 kernel_split_state.queue_data,
+                                 kernel_split_params.queue_size,
+                                 1);
+
+       if(IS_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER)) {
+               /* for render passes, sum and reset indirect light pass variables
+                * for the next samples */
+               PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+
+               path_radiance_sum_indirect(L);
+               path_radiance_reset_indirect(L);
+
+               if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+                                                                         ray_index,
+                                                                         1.0f,
+                                                                         &kernel_split_state.branched_state[ray_index].sd,
+                                                                         true))
+               {
+                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+               }
+               else {
+                       kernel_split_branched_indirect_light_end(kg, ray_index);
                }
-               kernel_split_state.rng[ray_index] = rng;
        }
 
-#ifndef __COMPUTE_DEVICE_GPU__
+#  ifdef __VOLUME__
+       /* Enqueue RAY_VOLUME_INDIRECT_NEXT_ITER rays */
+       ccl_barrier(CCL_LOCAL_MEM_FENCE);
+       if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+               *local_queue_atomics = 0;
        }
-#endif
+       ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
-       /* Enqueue RAY_UPDATE_BUFFER rays. */
+       ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
        enqueue_ray_index_local(ray_index,
-                               QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-                               enqueue_flag,
+                               QUEUE_VOLUME_INDIRECT_ITER,
+                               IS_STATE(kernel_split_state.ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER),
+                               kernel_split_params.queue_size,
+                               local_queue_atomics,
+                               kernel_split_state.queue_data,
+                               kernel_split_params.queue_index);
+
+#  endif  /* __VOLUME__ */
+
+#  ifdef __SUBSURFACE__
+       /* Enqueue RAY_SUBSURFACE_INDIRECT_NEXT_ITER rays */
+       ccl_barrier(CCL_LOCAL_MEM_FENCE);
+       if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+               *local_queue_atomics = 0;
+       }
+       ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+       ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+       enqueue_ray_index_local(ray_index,
+                               QUEUE_SUBSURFACE_INDIRECT_ITER,
+                               IS_STATE(kernel_split_state.ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER),
                                kernel_split_params.queue_size,
                                local_queue_atomics,
                                kernel_split_state.queue_data,
                                kernel_split_params.queue_index);
+#  endif  /* __SUBSURFACE__ */
+#endif  /* __BRANCHED_PATH__ */
 }
 
 CCL_NAMESPACE_END
index 0f1696e..957d709 100644 (file)
@@ -38,8 +38,10 @@ ccl_device void kernel_shader_eval(KernelGlobals *kg,
                                  kernel_split_params.queue_size,
                                  0);
 
+       ccl_global char *ray_state = kernel_split_state.ray_state;
+
        char enqueue_flag = 0;
-       if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) {
+       if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
                enqueue_flag = 1;
        }
 
@@ -52,7 +54,7 @@ ccl_device void kernel_shader_eval(KernelGlobals *kg,
                                kernel_split_params.queue_index);
 
        /* Continue on with shader evaluation. */
-       if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
+       if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
                Intersection isect = kernel_split_state.isect[ray_index];
                RNG rng = kernel_split_state.rng[ray_index];
                ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
@@ -62,8 +64,27 @@ ccl_device void kernel_shader_eval(KernelGlobals *kg,
                                      &kernel_split_state.sd[ray_index],
                                      &isect,
                                      &ray);
+
+#ifndef __BRANCHED_PATH__
                float rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF);
                shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN);
+#else
+               ShaderContext ctx = SHADER_CONTEXT_MAIN;
+               float rbsdf = 0.0f;
+
+               if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+                       rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF);
+
+               }
+
+               if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+                       ctx = SHADER_CONTEXT_INDIRECT;
+               }
+
+               shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, ctx);
+               shader_merge_closures(&kernel_split_state.sd[ray_index]);
+#endif  /* __BRANCHED_PATH__ */
+
                kernel_split_state.rng[ray_index] = rng;
        }
 }
index 4243e18..4742862 100644 (file)
@@ -29,31 +29,29 @@ ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg)
                                          kernel_split_state.queue_data, kernel_split_params.queue_size, 1);
        }
 
-       if(ray_index == QUEUE_EMPTY_SLOT)
+       if(ray_index == QUEUE_EMPTY_SLOT) {
                return;
+       }
 
-       /* Flag determining if we need to update L. */
-       char update_path_radiance = 0;
-
-       if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
-               ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
-               ccl_global Ray *light_ray_global = &kernel_split_state.ao_light_ray[ray_index];
-
-               float3 shadow;
-               Ray ray = *light_ray_global;
-               update_path_radiance = !(shadow_blocked(kg,
-                                                       &kernel_split_state.sd_DL_shadow[ray_index],
-                                                       state,
-                                                       &ray,
-                                                       &shadow));
-
-               *light_ray_global = ray;
-               /* We use light_ray_global's P and t to store shadow and
-                * update_path_radiance.
-                */
-               light_ray_global->P = shadow;
-               light_ray_global->t = update_path_radiance;
+       ShaderData *sd = &kernel_split_state.sd[ray_index];
+       ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+       PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+       ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+       RNG rng = kernel_split_state.rng[ray_index];
+       float3 throughput = kernel_split_state.throughput[ray_index];
+
+#ifdef __BRANCHED_PATH__
+       if(!kernel_data.integrator.branched || IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+#endif
+               kernel_path_ao(kg, sd, emission_sd, L, state, &rng, throughput, shader_bsdf_alpha(kg, sd));
+#ifdef __BRANCHED_PATH__
+       }
+       else {
+               kernel_branched_path_ao(kg, sd, emission_sd, L, state, &rng, throughput);
        }
+#endif
+
+       kernel_split_state.rng[ray_index] = rng;
 }
 
 CCL_NAMESPACE_END
index bb8f015..452b6e4 100644 (file)
@@ -32,28 +32,71 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
        if(ray_index == QUEUE_EMPTY_SLOT)
                return;
 
-       /* Flag determining if we need to update L. */
-       char update_path_radiance = 0;
+       ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+       Ray ray = kernel_split_state.light_ray[ray_index];
+       PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+       ShaderData *sd = &kernel_split_state.sd[ray_index];
+       float3 throughput = kernel_split_state.throughput[ray_index];
+       RNG rng = kernel_split_state.rng[ray_index];
 
-       if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
-               ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
-               ccl_global Ray *light_ray_global = &kernel_split_state.light_ray[ray_index];
+       BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
+       ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+       bool is_lamp = kernel_split_state.is_lamp[ray_index];
 
+#  if defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)
+       bool use_branched = false;
+       int all = 0;
+
+       if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+               use_branched = true;
+               all = 1;
+       }
+#    if defined(__BRANCHED_PATH__)
+       else if(kernel_data.integrator.branched) {
+               use_branched = true;
+
+               if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+                       all = (kernel_data.integrator.sample_all_lights_indirect);
+               }
+               else
+               {
+                       all = (kernel_data.integrator.sample_all_lights_direct);
+               }
+       }
+#    endif  /* __BRANCHED_PATH__ */
+
+       if(use_branched) {
+               kernel_branched_path_surface_connect_light(kg,
+                                                          &rng,
+                                                          sd,
+                                                          emission_sd,
+                                                          state,
+                                                          throughput,
+                                                          1.0f,
+                                                          L,
+                                                          all);
+       }
+       else
+#  endif  /* defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)*/
+       {
+               /* trace shadow ray */
                float3 shadow;
-               Ray ray = *light_ray_global;
-               update_path_radiance = !(shadow_blocked(kg,
-                                                       &kernel_split_state.sd_DL_shadow[ray_index],
-                                                       state,
-                                                       &ray,
-                                                       &shadow));
-
-               *light_ray_global = ray;
-               /* We use light_ray_global's P and t to store shadow and
-                * update_path_radiance.
-                */
-               light_ray_global->P = shadow;
-               light_ray_global->t = update_path_radiance;
+
+               if(!shadow_blocked(kg,
+                                      emission_sd,
+                                      state,
+                                      &ray,
+                                      &shadow))
+               {
+                       /* accumulate */
+                       path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+               }
+               else {
+                       path_radiance_accum_total_light(L, throughput, &L_light);
+               }
        }
+
+       kernel_split_state.rng[ray_index] = rng;
 }
 
 CCL_NAMESPACE_END
index 4303ba0..57f070d 100644 (file)
 
 #include "util/util_atomic.h"
 
-#include "kernel/kernel_random.h"
-#include "kernel/kernel_projection.h"
-#include "kernel/kernel_montecarlo.h"
-#include "kernel/kernel_differential.h"
-#include "kernel/kernel_camera.h"
-
-#include "kernel/geom/geom.h"
-#include "kernel/bvh/bvh.h"
-
-#include "kernel/kernel_accumulate.h"
-#include "kernel/kernel_shader.h"
-#include "kernel/kernel_light.h"
-#include "kernel/kernel_passes.h"
-
-#ifdef __SUBSURFACE__
-#  include "kernel/kernel_subsurface.h"
+#include "kernel/kernel_path.h"
+#ifdef __BRANCHED_PATH__
+#  include "kernel/kernel_path_branched.h"
 #endif
 
-#ifdef __VOLUME__
-#  include "kernel/kernel_volume.h"
-#endif
+#include "kernel/kernel_queues.h"
+#include "kernel/kernel_work_stealing.h"
 
-#include "kernel/kernel_path_state.h"
-#include "kernel/kernel_shadow.h"
-#include "kernel/kernel_emission.h"
-#include "kernel/kernel_path_common.h"
-#include "kernel/kernel_path_surface.h"
-#include "kernel/kernel_path_volume.h"
-#include "kernel/kernel_path_subsurface.h"
+#ifdef __BRANCHED_PATH__
+#  include "kernel/split/kernel_branched.h"
+#endif
 
-#ifdef __KERNEL_DEBUG__
-#  include "kernel/kernel_debug.h"
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index)
+{
+       ccl_global char *ray_state = kernel_split_state.ray_state;
+
+#ifdef __BRANCHED_PATH__
+       if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) {
+               ASSIGN_RAY_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER);
+       }
+       else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT)) {
+               ASSIGN_RAY_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER);
+       }
+       else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT)) {
+               ASSIGN_RAY_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER);
+       }
+       else {
+               ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+       }
+#else
+       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
 #endif
+}
 
-#include "kernel/kernel_queues.h"
-#include "kernel/kernel_work_stealing.h"
+CCL_NAMESPACE_END
 
 #endif  /* __KERNEL_SPLIT_H__ */
index 0af8bfc..a2cb4f6 100644 (file)
@@ -62,7 +62,46 @@ typedef struct SplitParams {
        SPLIT_DATA_ENTRY(DebugData, debug_data, 1)
 #else
 #  define SPLIT_DATA_DEBUG_ENTRIES
-#endif
+#endif  /* DEBUG */
+
+#ifdef __BRANCHED_PATH__
+
+typedef ccl_global struct SplitBranchedState {
+       /* various state that must be kept and restored after an indirect loop */
+       PathState path_state;
+       float3 throughput;
+       Ray ray;
+
+       struct ShaderData sd;
+       Intersection isect;
+
+       char ray_state;
+
+       /* indirect loop state */
+       int next_closure;
+       int next_sample;
+       int num_samples;
+
+#ifdef __SUBSURFACE__
+       int ss_next_closure;
+       int ss_next_sample;
+       int next_hit;
+       int num_hits;
+
+       uint lcg_state;
+       SubsurfaceIntersection ss_isect;
+
+#  ifdef __VOLUME__
+       VolumeStack volume_stack[VOLUME_STACK_SIZE];
+#  endif  /* __VOLUME__ */
+#endif  /*__SUBSURFACE__ */
+} SplitBranchedState;
+
+#define SPLIT_DATA_BRANCHED_ENTRIES \
+       SPLIT_DATA_ENTRY( SplitBranchedState, branched_state, 1)
+#else
+#define SPLIT_DATA_BRANCHED_ENTRIES
+#endif  /* __BRANCHED_PATH__ */
 
 #define SPLIT_DATA_ENTRIES \
        SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \
@@ -72,9 +111,6 @@ typedef struct SplitParams {
        SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
        SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
        SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
-       SPLIT_DATA_ENTRY(ccl_global float3, ao_alpha, 1) \
-       SPLIT_DATA_ENTRY(ccl_global float3, ao_bsdf, 1) \
-       SPLIT_DATA_ENTRY(ccl_global Ray, ao_light_ray, 1) \
        SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
        SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
        SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
@@ -82,6 +118,7 @@ typedef struct SplitParams {
        SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \
        SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
        SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \
+       SPLIT_DATA_BRANCHED_ENTRIES \
        SPLIT_DATA_DEBUG_ENTRIES \
 
 /* struct that holds pointers to data in the shared state buffer */
index 0b4d50c..8364f18 100644 (file)
 
 CCL_NAMESPACE_BEGIN
 
+#if defined(__BRANCHED_PATH__) && defined(__SUBSURFACE__)
 
-ccl_device void kernel_subsurface_scatter(KernelGlobals *kg,
-                                          ccl_local_param unsigned int* local_queue_atomics)
+ccl_device_inline void kernel_split_branched_path_subsurface_indirect_light_init(KernelGlobals *kg, int ray_index)
 {
-#ifdef __SUBSURFACE__
-       if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
-               *local_queue_atomics = 0;
+       kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+
+       SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+       branched_state->ss_next_closure = 0;
+       branched_state->ss_next_sample = 0;
+
+       branched_state->num_hits = 0;
+       branched_state->next_hit = 0;
+
+       ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT);
+}
+
+ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_iter(KernelGlobals *kg, int ray_index)
+{
+       SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+       ShaderData *sd = &branched_state->sd;
+       RNG rng = kernel_split_state.rng[ray_index];
+       PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+       ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
+       for(int i = branched_state->ss_next_closure; i < sd->num_closure; i++) {
+               ShaderClosure *sc = &sd->closure[i];
+
+               if(!CLOSURE_IS_BSSRDF(sc->type))
+                       continue;
+
+               /* set up random number generator */
+               if(branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 &&
+                  branched_state->next_closure == 0 && branched_state->next_sample == 0)
+               {
+                       branched_state->lcg_state = lcg_state_init(&rng,
+                                                                  branched_state->path_state.rng_offset,
+                                                                  branched_state->path_state.sample,
+                                                                  0x68bc21eb);
+               }
+               int num_samples = kernel_data.integrator.subsurface_samples;
+               float num_samples_inv = 1.0f/num_samples;
+               RNG bssrdf_rng = cmj_hash(rng, i);
+
+               /* do subsurface scatter step with copy of shader data, this will
+                * replace the BSSRDF with a diffuse BSDF closure */
+               for(int j = branched_state->ss_next_sample; j < num_samples; j++) {
+                       ccl_global SubsurfaceIntersection *ss_isect = &branched_state->ss_isect;
+                       float bssrdf_u, bssrdf_v;
+                       path_branched_rng_2D(kg,
+                                            &bssrdf_rng,
+                                            &branched_state->path_state,
+                                            j,
+                                            num_samples,
+                                            PRNG_BSDF_U,
+                                            &bssrdf_u,
+                                            &bssrdf_v);
+
+                       /* intersection is expensive so avoid doing multiple times for the same input */
+                       if(branched_state->next_hit == 0 && branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+                               RNG lcg_state = branched_state->lcg_state;
+                               SubsurfaceIntersection ss_isect_private;
+
+                               branched_state->num_hits = subsurface_scatter_multi_intersect(kg,
+                                                                                             &ss_isect_private,
+                                                                                             sd,
+                                                                                             sc,
+                                                                                             &lcg_state,
+                                                                                             bssrdf_u, bssrdf_v,
+                                                                                             true);
+
+                               branched_state->lcg_state = lcg_state;
+                               *ss_isect = ss_isect_private;
+                       }
+
+#ifdef __VOLUME__
+                       Ray volume_ray = branched_state->ray;
+                       bool need_update_volume_stack =
+                               kernel_data.integrator.use_volumes &&
+                               sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
+#endif  /* __VOLUME__ */
+
+                       /* compute lighting with the BSDF closure */
+                       for(int hit = branched_state->next_hit; hit < branched_state->num_hits; hit++) {
+                               ShaderData *bssrdf_sd = &kernel_split_state.sd[ray_index];
+                               *bssrdf_sd = *sd; /* note: copy happens each iteration of inner loop, this is
+                                                  * important as the indirect path will write into bssrdf_sd */
+
+                               SubsurfaceIntersection ss_isect_private = *ss_isect;
+                               subsurface_scatter_multi_setup(kg,
+                                                              &ss_isect_private,
+                                                              hit,
+                                                              bssrdf_sd,
+                                                              &branched_state->path_state,
+                                                              branched_state->path_state.flag,
+                                                              sc,
+                                                              true);
+                               *ss_isect = ss_isect_private;
+
+                               ccl_global PathState *hit_state = &kernel_split_state.path_state[ray_index];
+                               *hit_state = branched_state->path_state;
+
+                               path_state_branch(hit_state, j, num_samples);
+
+#ifdef __VOLUME__
+                               if(need_update_volume_stack) {
+                                       /* Setup ray from previous surface point to the new one. */
+                                       float3 P = ray_offset(bssrdf_sd->P, -bssrdf_sd->Ng);
+                                       volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t);
+
+                                       /* this next part is expensive as it does scene intersection so only do once */
+                                       if(branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+                                               for(int k = 0; k < VOLUME_STACK_SIZE; k++) {
+                                                       branched_state->volume_stack[k] = hit_state->volume_stack[k];
+                                               }
+
+                                               kernel_volume_stack_update_for_subsurface(kg,
+                                                                                         emission_sd,
+                                                                                         &volume_ray,
+                                                                                         branched_state->volume_stack);
+                                       }
+
+                                       for(int k = 0; k < VOLUME_STACK_SIZE; k++) {
+                                               hit_state->volume_stack[k] = branched_state->volume_stack[k];
+                                       }
+                               }
+#endif  /* __VOLUME__ */
+
+#ifdef __EMISSION__
+                               if(branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+                                       /* direct light */
+                                       if(kernel_data.integrator.use_direct_light) {
+                                               int all = (kernel_data.integrator.sample_all_lights_direct) ||
+                                                             (branched_state->path_state.flag & PATH_RAY_SHADOW_CATCHER);
+                                               kernel_branched_path_surface_connect_light(kg,
+                                                                                          &rng,
+                                                                                          bssrdf_sd,
+                                                                                          emission_sd,
+                                                                                          hit_state,
+                                                                                          branched_state->throughput,
+                                                                                          num_samples_inv,
+                                                                                          L,
+                                                                                          all);
+                                       }
+                               }
+#endif  /* __EMISSION__ */
+
+                               /* indirect light */
+                               if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+                                                                                         ray_index,
+                                                                                         num_samples_inv,
+                                                                                         bssrdf_sd,
+                                                                                         false))
+                               {
+                                       branched_state->ss_next_closure = i;
+                                       branched_state->ss_next_sample = j;
+                                       branched_state->next_hit = hit;
+
+                                       return true;
+                               }
+
+                               branched_state->next_closure = 0;
+                       }
+
+                       branched_state->next_hit = 0;
+               }
+
+               branched_state->ss_next_sample = 0;
+       }
+
+       kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+
+       return false;
+}
+
+#endif  /* __BRANCHED_PATH__ && __SUBSURFACE__ */
+
+ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
+{
+       int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+       if(thread_index == 0) {
+               /* We will empty both queues in this kernel. */
+               kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+               kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
        }
-       ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
        int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
        ray_index = get_ray_index(kg, ray_index,
                                  QUEUE_ACTIVE_AND_REGENERATED_RAYS,
                                  kernel_split_state.queue_data,
                                  kernel_split_params.queue_size,
-                                 0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
-       /* If we are executing on a GPU device, we exit all threads that are not
-        * required.
-        *
-        * If we are executing on a CPU device, then we need to keep all threads
-        * active since we have barrier() calls later in the kernel. CPU devices,
-        * expect all threads to execute barrier statement.
-        */
-       if(ray_index == QUEUE_EMPTY_SLOT) {
-               return;
-       }
-#endif
-
-       char enqueue_flag = 0;
-
-#ifndef __COMPUTE_DEVICE_GPU__
-       if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
+                                 1);
+       get_ray_index(kg, thread_index,
+                     QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+                     kernel_split_state.queue_data,
+                     kernel_split_params.queue_size,
+                     1);
 
+#ifdef __SUBSURFACE__
        ccl_global char *ray_state = kernel_split_state.ray_state;
        ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
        PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
@@ -64,34 +228,85 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg,
 
        if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
                if(sd->flag & SD_BSSRDF) {
-                       if(kernel_path_subsurface_scatter(kg,
-                                                         sd,
-                                                         emission_sd,
-                                                         L,
-                                                         state,
-                                                         &rng,
-                                                         ray,
-                                                         throughput,
-                                                         ss_indirect)) {
-                               ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-                               enqueue_flag = 1;
+
+#ifdef __BRANCHED_PATH__
+                       if(!kernel_data.integrator.branched) {
+#endif
+                               if(kernel_path_subsurface_scatter(kg,
+                                                                     sd,
+                                                                     emission_sd,
+                                                                     L,
+                                                                     state,
+                                                                     &rng,
+                                                                     ray,
+                                                                     throughput,
+                                                                     ss_indirect)) {
+                                       kernel_split_path_end(kg, ray_index);
+                               }
+#ifdef __BRANCHED_PATH__
+                       }
+                       else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+                               float bssrdf_probability;
+                               ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
+
+                               /* modify throughput for picking bssrdf or bsdf */
+                               *throughput *= bssrdf_probability;
+
+                               /* do bssrdf scatter step if we picked a bssrdf closure */
+                               if(sc) {
+                                       uint lcg_state = lcg_state_init(&rng, state->rng_offset, state->sample, 0x68bc21eb);
+
+                                       float bssrdf_u, bssrdf_v;
+                                       path_state_rng_2D(kg,
+                                                             &rng,
+                                                             state,
+                                                             PRNG_BSDF_U,
+                                                             &bssrdf_u, &bssrdf_v);
+                                       subsurface_scatter_step(kg,
+                                                                   sd,
+                                                                   state,
+                                                                   state->flag,
+                                                                   sc,
+                                                                   &lcg_state,
+                                                                   bssrdf_u, bssrdf_v,
+                                                                   false);
+                               }
+                       }
+                       else {
+                               kernel_split_branched_path_subsurface_indirect_light_init(kg, ray_index);
+
+                               if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
+                                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+                               }
                        }
+#endif
                }
                kernel_split_state.rng[ray_index] = rng;
        }
 
-#ifndef __COMPUTE_DEVICE_GPU__
+#  ifdef __BRANCHED_PATH__
+       if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+               kernel_split_params.queue_index[QUEUE_SUBSURFACE_INDIRECT_ITER] = 0;
        }
-#endif
 
-       /* Enqueue RAY_UPDATE_BUFFER rays. */
-       enqueue_ray_index_local(ray_index,
-                               QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-                               enqueue_flag,
-                               kernel_split_params.queue_size,
-                               local_queue_atomics,
-                               kernel_split_state.queue_data,
-                               kernel_split_params.queue_index);
+       /* iter loop */
+       ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+                                 QUEUE_SUBSURFACE_INDIRECT_ITER,
+                                 kernel_split_state.queue_data,
+                                 kernel_split_params.queue_size,
+                                 1);
+
+       if(IS_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER)) {
+               /* for render passes, sum and reset indirect light pass variables
+                * for the next samples */
+               path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
+               path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
+
+               if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
+                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+               }
+       }
+#  endif  /* __BRANCHED_PATH__ */
 
 #endif  /* __SUBSURFACE__ */