Cycles: Branched path tracing for the split kernel
authorMai Lavelle <mai.lavelle@gmail.com>
Tue, 21 Mar 2017 02:31:54 +0000 (22:31 -0400)
committerMai Lavelle <mai.lavelle@gmail.com>
Tue, 2 May 2017 18:26:46 +0000 (14:26 -0400)
This implements branched path tracing for the split kernel.

General approach is to store the ray state at a branch point, trace the
branched ray as normal, then restore the state as necessary before iterating
to the next part of the path. A state machine is used to advance the indirect
loop state, which avoids the need to add any new kernels. Each iteration the
state machine recreates as much state as possible from the stored ray to keep
overall storage down.

It's kind of hard to keep all the different integration loops in sync, so this
needs lots of testing to make sure everything is working correctly. We should
probably start trying to deduplicate the integration loops more now.

Nonbranched BMW is ~2% slower, while classroom is ~2% faster, other scenes
could use more testing still.

Reviewers: sergey, nirved

Reviewed By: nirved

Subscribers: Blendify, bliblubli

Differential Revision: https://developer.blender.org/D2611

25 files changed:
intern/cycles/blender/addon/ui.py
intern/cycles/device/device_split_kernel.cpp
intern/cycles/kernel/CMakeLists.txt
intern/cycles/kernel/kernel_path.h
intern/cycles/kernel/kernel_path_branched.h
intern/cycles/kernel/kernel_path_surface.h
intern/cycles/kernel/kernel_subsurface.h
intern/cycles/kernel/kernel_types.h
intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
intern/cycles/kernel/kernels/cuda/kernel_split.cu
intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
intern/cycles/kernel/split/kernel_branched.h [new file with mode: 0644]
intern/cycles/kernel/split/kernel_data_init.h
intern/cycles/kernel/split/kernel_direct_lighting.h
intern/cycles/kernel/split/kernel_do_volume.h
intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
intern/cycles/kernel/split/kernel_indirect_background.h
intern/cycles/kernel/split/kernel_indirect_subsurface.h
intern/cycles/kernel/split/kernel_next_iteration_setup.h
intern/cycles/kernel/split/kernel_shader_eval.h
intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
intern/cycles/kernel/split/kernel_split_common.h
intern/cycles/kernel/split/kernel_split_data_types.h
intern/cycles/kernel/split/kernel_subsurface_scatter.h

index 19ab9053e9aface74a978b746bda8ddbe213de14..ab1fe8924bc1d234f830fe20c81e19b5b6175d29 100644 (file)
@@ -78,7 +78,7 @@ def use_cuda(context):
 def use_branched_path(context):
     cscene = context.scene.cycles
 
-    return (cscene.progressive == 'BRANCHED_PATH' and not use_opencl(context))
+    return (cscene.progressive == 'BRANCHED_PATH')
 
 
 def use_sample_all_lights(context):
@@ -156,7 +156,6 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
 
         row = layout.row()
         sub = row.row()
-        sub.active = get_device_type(context) != 'OPENCL' or use_cpu(context)
         sub.prop(cscene, "progressive", text="")
         row.prop(cscene, "use_square_samples")
 
index 71d52bb8097cb0007004636ab9915e5ee80fbfbc..abd26f5be4999808d4d2960563e304a4b99413ef 100644 (file)
@@ -240,6 +240,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
                                ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
                                ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
                                ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size);
+                               ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
                                ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
                                ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size);
                                ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size);
index 3750225571da8d0bfacd37bd21ab4111d311fdb4..a92e8bc4aee59dfc8e16faadb88c3b5a75bc4bde 100644 (file)
@@ -235,6 +235,7 @@ set(SRC_UTIL_HEADERS
 )
 
 set(SRC_SPLIT_HEADERS
+       split/kernel_branched.h
        split/kernel_buffer_update.h
        split/kernel_data_init.h
        split/kernel_direct_lighting.h
index e79570421826b12c989faed586b5bf8d1a4f67b9..58da141aed34ff60523623d5257a097941fe46c5 100644 (file)
@@ -58,7 +58,7 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
                                         ShaderData *sd,
                                         ShaderData *emission_sd,
                                         PathRadiance *L,
-                                        PathState *state,
+                                        ccl_addr_space PathState *state,
                                         RNG *rng,
                                         float3 throughput,
                                         float3 ao_alpha)
@@ -98,6 +98,8 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
        }
 }
 
+#ifndef __SPLIT_KERNEL__
+
 ccl_device void kernel_path_indirect(KernelGlobals *kg,
                                      ShaderData *sd,
                                      ShaderData *emission_sd,
@@ -818,5 +820,7 @@ ccl_device void kernel_path_trace(KernelGlobals *kg,
        path_rng_end(kg, rng_state, rng);
 }
 
+#endif  /* __SPLIT_KERNEL__ */
+
 CCL_NAMESPACE_END
 
index 085eb42325d5f6cda6d3e24be42fefce91544ba3..ddcb57161ea0a9cef99eb787fb1f8b1d0b06b1ed 100644 (file)
@@ -22,7 +22,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
                                                ShaderData *sd,
                                                ShaderData *emission_sd,
                                                PathRadiance *L,
-                                               PathState *state,
+                                               ccl_addr_space PathState *state,
                                                RNG *rng,
                                                float3 throughput)
 {
@@ -65,6 +65,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
        }
 }
 
+#ifndef __SPLIT_KERNEL__
 
 /* bounce off surface and integrate indirect light */
 ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
@@ -648,6 +649,8 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
        path_rng_end(kg, rng_state, rng);
 }
 
+#endif  /* __SPLIT_KERNEL__ */
+
 #endif  /* __BRANCHED_PATH__ */
 
 CCL_NAMESPACE_END
index 076c82f3853a7b53a1a9fa330f1feb347f200da3..bd4ba775b4d2ee35989e6daaed608c5df851be19 100644 (file)
@@ -155,7 +155,7 @@ ccl_device bool kernel_branched_path_surface_bounce(
         ccl_addr_space float3 *throughput,
         ccl_addr_space PathState *state,
         PathRadiance *L,
-        Ray *ray)
+        ccl_addr_space Ray *ray)
 {
        /* sample BSDF */
        float bsdf_pdf;
index baf629342b99366baa3a3184b93e2acb2d7407af..274713addc23170940930b2415d3d2d954889ab1 100644 (file)
@@ -417,9 +417,8 @@ ccl_device_noinline void subsurface_scatter_multi_setup(
        subsurface_scatter_setup_diffuse_bsdf(sd, sc, weight, true, N);
 }
 
-#ifndef __SPLIT_KERNEL__
 /* subsurface scattering step, from a point on the surface to another nearby point on the same object */
-ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathState *state,
+ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_global PathState *state,
        int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all)
 {
        float3 eval = make_float3(0.0f, 0.0f, 0.0f);
@@ -507,7 +506,6 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS
        /* setup diffuse bsdf */
        subsurface_scatter_setup_diffuse_bsdf(sd, sc, eval, (ss_isect.num_hits > 0), N);
 }
-#endif /* ! __SPLIT_KERNEL__ */
 
 CCL_NAMESPACE_END
 
index b1269cdb6b414ea8ca5c44ed066dd2e41d7f3c63..6417f621c8f6956b03c8b25ef861dcf74cf5ce08 100644 (file)
@@ -71,22 +71,18 @@ CCL_NAMESPACE_BEGIN
 #  endif
 #  define __KERNEL_SHADING__
 #  define __KERNEL_ADV_SHADING__
-#  ifndef __SPLIT_KERNEL__
-#    define __BRANCHED_PATH__
-#  endif
+#  define __BRANCHED_PATH__
 #  ifdef WITH_OSL
 #    define __OSL__
 #  endif
-#  define __SUBSURFACE__
 #  define __PRINCIPLED__
+#  define __SUBSURFACE__
 #  define __CMJ__
 #  define __VOLUME__
 #  define __VOLUME_SCATTER__
 #  define __SHADOW_RECORD_ALL__
-#  ifndef __SPLIT_KERNEL__
-#    define __VOLUME_DECOUPLED__
-#    define __VOLUME_RECORD_ALL__
-#  endif
+#  define __VOLUME_DECOUPLED__
+#  define __VOLUME_RECORD_ALL__
 #endif  /* __KERNEL_CPU__ */
 
 #ifdef __KERNEL_CUDA__
@@ -138,6 +134,7 @@ CCL_NAMESPACE_BEGIN
 #    define __VOLUME_SCATTER__
 #    define __SHADOW_RECORD_ALL__
 #    define __CMJ__
+#    define __BRANCHED_PATH__
 #  endif  /* __KERNEL_OPENCL_AMD__ */
 
 #  ifdef __KERNEL_OPENCL_INTEL_CPU__
@@ -1300,7 +1297,6 @@ typedef ccl_addr_space struct DebugData {
  * Queue 3 - Shadow ray cast kernel - AO
  * Queeu 4 - Shadow ray cast kernel - direct lighting
  */
-#define NUM_QUEUES 4
 
 /* Queue names */
 enum QueueNumber {
@@ -1313,22 +1309,37 @@ enum QueueNumber {
         * 3. Rays to be regenerated
         * are enqueued here.
         */
-       QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS = 1,
+       QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
 
        /* All rays for which a shadow ray should be cast to determine radiance
         * contribution for AO are enqueued here.
         */
-       QUEUE_SHADOW_RAY_CAST_AO_RAYS = 2,
+       QUEUE_SHADOW_RAY_CAST_AO_RAYS,
 
        /* All rays for which a shadow ray should be cast to determine radiance
         * contributing for direct lighting are enqueued here.
         */
-       QUEUE_SHADOW_RAY_CAST_DL_RAYS = 3,
+       QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+
+#ifdef __BRANCHED_PATH__
+       /* All rays moving to next iteration of the indirect loop for light */
+       QUEUE_LIGHT_INDIRECT_ITER,
+#  ifdef __VOLUME__
+       /* All rays moving to next iteration of the indirect loop for volumes */
+       QUEUE_VOLUME_INDIRECT_ITER,
+#  endif
+#  ifdef __SUBSURFACE__
+       /* All rays moving to next iteration of the indirect loop for subsurface */
+       QUEUE_SUBSURFACE_INDIRECT_ITER,
+#  endif
+#endif  /* __BRANCHED_PATH__ */
+
+       NUM_QUEUES
 };
 
-/* We use RAY_STATE_MASK to get ray_state (enums 0 to 5) */
-#define RAY_STATE_MASK 0x007
-#define RAY_FLAG_MASK 0x0F8
+/* We use RAY_STATE_MASK to get ray_state */
+#define RAY_STATE_MASK 0x0F
+#define RAY_FLAG_MASK 0xF0
 enum RayState {
        RAY_INVALID = 0,
        /* Denotes ray is actively involved in path-iteration. */
@@ -1343,14 +1354,22 @@ enum RayState {
        RAY_TO_REGENERATE,
        /* Denotes ray has been regenerated */
        RAY_REGENERATED,
-       /* Flag's ray has to execute shadow blocked function in AO part */
-       RAY_SHADOW_RAY_CAST_AO = 16,
-       /* Flag's ray has to execute shadow blocked function in direct lighting part. */
-       RAY_SHADOW_RAY_CAST_DL = 32,
+       /* Denotes ray is moving to next iteration of the branched indirect loop */
+       RAY_LIGHT_INDIRECT_NEXT_ITER,
+       RAY_VOLUME_INDIRECT_NEXT_ITER,
+       RAY_SUBSURFACE_INDIRECT_NEXT_ITER,
+
+       /* Ray flags */
+
+       /* Flags to denote that the ray is currently evaluating the branched indirect loop */
+       RAY_BRANCHED_LIGHT_INDIRECT = (1 << 4),
+       RAY_BRANCHED_VOLUME_INDIRECT = (1 << 5),
+       RAY_BRANCHED_SUBSURFACE_INDIRECT = (1 << 6),
+       RAY_BRANCHED_INDIRECT = (RAY_BRANCHED_LIGHT_INDIRECT | RAY_BRANCHED_VOLUME_INDIRECT | RAY_BRANCHED_SUBSURFACE_INDIRECT),
 };
 
 #define ASSIGN_RAY_STATE(ray_state, ray_index, state) (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state))
-#define IS_STATE(ray_state, ray_index, state) ((ray_state[ray_index] & RAY_STATE_MASK) == state)
+#define IS_STATE(ray_state, ray_index, state) ((ray_index) != QUEUE_EMPTY_SLOT && ((ray_state)[(ray_index)] & RAY_STATE_MASK) == (state))
 #define ADD_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] | flag))
 #define REMOVE_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] & (~flag)))
 #define IS_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] & flag)
index 148b2eef56893ac652bb40012744668b933b8707..96f54bb427ef919aad6a77bb036655e66aadff8a 100644 (file)
@@ -183,7 +183,7 @@ DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
 DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
 DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
 DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
index a679eff840962bd0f691a320932c593d41e0174d..585b91876a9cfb186c7fb533ee1dfbc68b9090d5 100644 (file)
@@ -110,7 +110,7 @@ DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
 DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
 DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
 DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
index 7a1838e485f4ca4743a802cb69aac67f3df72735..99b74a1802b5dec45cb27c5f4d9bbbff59077589 100644 (file)
@@ -22,6 +22,5 @@ __kernel void kernel_ocl_path_trace_subsurface_scatter(
         ccl_global char *kg,
         ccl_constant KernelData *data)
 {
-       ccl_local unsigned int local_queue_atomics;
-       kernel_subsurface_scatter((KernelGlobals*)kg, &local_queue_atomics);
+       kernel_subsurface_scatter((KernelGlobals*)kg);
 }
diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h
new file mode 100644 (file)
index 0000000..c7bc1b4
--- /dev/null
@@ -0,0 +1,150 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __BRANCHED_PATH__
+
+/* sets up the various state needed to do an indirect loop */
+ccl_device_inline void kernel_split_branched_path_indirect_loop_init(KernelGlobals *kg, int ray_index)
+{
+       SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+       /* save a copy of the state to restore later */
+#define BRANCHED_STORE(name) \
+               branched_state->name = kernel_split_state.name[ray_index];
+
+       BRANCHED_STORE(path_state);
+       BRANCHED_STORE(throughput);
+       BRANCHED_STORE(ray);
+       BRANCHED_STORE(sd);
+       BRANCHED_STORE(isect);
+       BRANCHED_STORE(ray_state);
+
+#undef BRANCHED_STORE
+
+       /* set loop counters to initial position */
+       branched_state->next_closure = 0;
+       branched_state->next_sample = 0;
+}
+
+/* ends an indirect loop and restores the previous state */
+ccl_device_inline void kernel_split_branched_path_indirect_loop_end(KernelGlobals *kg, int ray_index)
+{
+       SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+       /* restore state */
+#define BRANCHED_RESTORE(name) \
+               kernel_split_state.name[ray_index] = branched_state->name;
+
+       BRANCHED_RESTORE(path_state);
+       BRANCHED_RESTORE(throughput);
+       BRANCHED_RESTORE(ray);
+       BRANCHED_RESTORE(sd);
+       BRANCHED_RESTORE(isect);
+       BRANCHED_RESTORE(ray_state);
+
+#undef BRANCHED_RESTORE
+
+       /* leave indirect loop */
+       REMOVE_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT);
+}
+
+/* bounce off surface and integrate indirect light */
+ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(KernelGlobals *kg,
+                                                                                int ray_index,
+                                                                                float num_samples_adjust,
+                                                                                ShaderData *saved_sd,
+                                                                                bool reset_path_state)
+{
+       SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+       ShaderData *sd = saved_sd;
+       RNG rng = kernel_split_state.rng[ray_index];
+       PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+       float3 throughput = branched_state->throughput;
+
+       for(int i = branched_state->next_closure; i < sd->num_closure; i++) {
+               const ShaderClosure *sc = &sd->closure[i];
+
+               if(!CLOSURE_IS_BSDF(sc->type))
+                       continue;
+               /* transparency is not handled here, but in outer loop */
+               if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
+                       continue;
+
+               int num_samples;
+
+               if(CLOSURE_IS_BSDF_DIFFUSE(sc->type))
+                       num_samples = kernel_data.integrator.diffuse_samples;
+               else if(CLOSURE_IS_BSDF_BSSRDF(sc->type))
+                       num_samples = 1;
+               else if(CLOSURE_IS_BSDF_GLOSSY(sc->type))
+                       num_samples = kernel_data.integrator.glossy_samples;
+               else
+                       num_samples = kernel_data.integrator.transmission_samples;
+
+               num_samples = ceil_to_int(num_samples_adjust*num_samples);
+
+               float num_samples_inv = num_samples_adjust/num_samples;
+               RNG bsdf_rng = cmj_hash(rng, i);
+
+               for(int j = branched_state->next_sample; j < num_samples; j++) {
+                       ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
+                       if(reset_path_state) {
+                               *ps = branched_state->path_state;
+                       }
+
+                       ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
+                       *tp = throughput;
+
+                       ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index];
+
+                       if(!kernel_branched_path_surface_bounce(kg,
+                                                               &bsdf_rng,
+                                                               sd,
+                                                               sc,
+                                                               j,
+                                                               num_samples,
+                                                               tp,
+                                                               ps,
+                                                               L,
+                                                               bsdf_ray))
+                       {
+                               continue;
+                       }
+
+                       /* update state for next iteration */
+                       branched_state->next_closure = i;
+                       branched_state->next_sample = j+1;
+                       branched_state->num_samples = num_samples;
+
+                       /* start the indirect path */
+                       *tp *= num_samples_inv;
+
+                       return true;
+               }
+
+               branched_state->next_sample = 0;
+       }
+
+       return false;
+}
+
+#endif  /* __BRANCHED_PATH__ */
+
+CCL_NAMESPACE_END
+
index 9d3d01fff75c9e4636985199555c2b9b1feb2155..642ccac823905cee2c5a2b9ba06adb77f8e5561a 100644 (file)
@@ -105,21 +105,16 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
 
        /* Initialize queue data and queue index. */
        if(thread_index < queuesize) {
-               /* Initialize active ray queue. */
-               kernel_split_state.queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
-               /* Initialize background and buffer update queue. */
-               kernel_split_state.queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
-               /* Initialize shadow ray cast of AO queue. */
-               kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
-               /* Initialize shadow ray cast of direct lighting queue. */
-               kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+               for(int i = 0; i < NUM_QUEUES; i++) {
+                       kernel_split_state.queue_data[i * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+               }
        }
 
        if(thread_index == 0) {
-               Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
-               Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
-               Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
-               Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+               for(int i = 0; i < NUM_QUEUES; i++) {
+                       Queue_index[i] = 0;
+               }
+
                /* The scene-intersect kernel should not use the queues very first time.
                 * since the queue would be empty.
                 */
index bdbf7387b954adac7cfd99a1249c2c496aab7442..3336c968a44b472dd5daecfcc0ee4a0cec6e1f27 100644 (file)
@@ -56,23 +56,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
                                  kernel_split_params.queue_size,
                                  0);
 
-#ifdef __COMPUTE_DEVICE_GPU__
-       /* If we are executing on a GPU device, we exit all threads that are not
-        * required.
-        *
-        * If we are executing on a CPU device, then we need to keep all threads
-        * active since we have barrier() calls later in the kernel. CPU devices,
-        * expect all threads to execute barrier statement.
-        */
-       if(ray_index == QUEUE_EMPTY_SLOT) {
-               return;
-       }
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
-       if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-
        if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
                ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
                ShaderData *sd = &kernel_split_state.sd[ray_index];
@@ -80,25 +63,24 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
                /* direct lighting */
 #ifdef __EMISSION__
                RNG rng = kernel_split_state.rng[ray_index];
+
                bool flag = (kernel_data.integrator.use_direct_light &&
                             (sd->flag & SD_BSDF_HAS_EVAL));
+
+#  ifdef __BRANCHED_PATH__
+               if(flag && kernel_data.integrator.branched) {
+                       flag = false;
+                       enqueue_flag = 1;
+               }
+#  endif  /* __BRANCHED_PATH__ */
+
 #  ifdef __SHADOW_TRICKS__
                if(flag && state->flag & PATH_RAY_SHADOW_CATCHER) {
                        flag = false;
-                       ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
-                       float3 throughput = kernel_split_state.throughput[ray_index];
-                       PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-                       kernel_branched_path_surface_connect_light(kg,
-                                                                  &rng,
-                                                                  sd,
-                                                                  emission_sd,
-                                                                  state,
-                                                                  throughput,
-                                                                  1.0f,
-                                                                  L,
-                                                                  1);
+                       enqueue_flag = 1;
                }
 #  endif  /* __SHADOW_TRICKS__ */
+
                if(flag) {
                        /* Sample illumination from lights to find path contribution. */
                        float light_t = path_state_rng_1D(kg, &rng, state, PRNG_LIGHT);
@@ -129,7 +111,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
                                        kernel_split_state.bsdf_eval[ray_index] = L_light;
                                        kernel_split_state.is_lamp[ray_index] = is_lamp;
                                        /* Mark ray state for next shadow kernel. */
-                                       ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
                                        enqueue_flag = 1;
                                }
                        }
@@ -138,10 +119,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
 #endif  /* __EMISSION__ */
        }
 
-#ifndef __COMPUTE_DEVICE_GPU__
-       }
-#endif
-
 #ifdef __EMISSION__
        /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
        enqueue_ray_index_local(ray_index,
@@ -152,6 +129,27 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
                                kernel_split_state.queue_data,
                                kernel_split_params.queue_index);
 #endif
+
+#ifdef __BRANCHED_PATH__
+       /* Enqueue RAY_LIGHT_INDIRECT_NEXT_ITER rays
+        * this is the last kernel before next_iteration_setup that uses local atomics so we do this here
+        */
+       ccl_barrier(CCL_LOCAL_MEM_FENCE);
+       if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+               *local_queue_atomics = 0;
+       }
+       ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+       ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+       enqueue_ray_index_local(ray_index,
+                               QUEUE_LIGHT_INDIRECT_ITER,
+                               IS_STATE(kernel_split_state.ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER),
+                               kernel_split_params.queue_size,
+                               local_queue_atomics,
+                               kernel_split_state.queue_data,
+                               kernel_split_params.queue_index);
+
+#endif  /* __BRANCHED_PATH__ */
 }
 
 CCL_NAMESPACE_END
index 47d3c2808317bd8807b30fa7d45dd95f2cf214ec..182e6c6e4fac21be00f1d545d08e5705c0fae327 100644 (file)
 
 CCL_NAMESPACE_BEGIN
 
+#if defined(__BRANCHED_PATH__) && defined(__VOLUME__)
+
+ccl_device_inline void kernel_split_branched_path_volume_indirect_light_init(KernelGlobals *kg, int ray_index)
+{
+       kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+
+       ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT);
+}
+
+ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(KernelGlobals *kg, int ray_index)
+{
+       SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+       ShaderData *sd = &kernel_split_state.sd[ray_index];
+       RNG rng = kernel_split_state.rng[ray_index];
+       PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+       ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
+       /* GPU: no decoupled ray marching, scatter probabilistically */
+       int num_samples = kernel_data.integrator.volume_samples;
+       float num_samples_inv = 1.0f/num_samples;
+
+       Ray volume_ray = branched_state->ray;
+       volume_ray.t = (!IS_STATE(&branched_state->ray_state, 0, RAY_HIT_BACKGROUND)) ? branched_state->isect.t : FLT_MAX;
+
+       bool heterogeneous = volume_stack_is_heterogeneous(kg, branched_state->path_state.volume_stack);
+
+       for(int j = branched_state->next_sample; j < num_samples; j++) {
+               ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
+               *ps = branched_state->path_state;
+
+               ccl_global Ray *pray = &kernel_split_state.ray[ray_index];
+               *pray = branched_state->ray;
+
+               ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
+               *tp = branched_state->throughput * num_samples_inv;
+
+               /* branch RNG state */
+               path_state_branch(ps, j, num_samples);
+
+               /* integrate along volume segment with distance sampling */
+               VolumeIntegrateResult result = kernel_volume_integrate(
+                       kg, ps, sd, &volume_ray, L, tp, &rng, heterogeneous);
+
+#  ifdef __VOLUME_SCATTER__
+               if(result == VOLUME_PATH_SCATTERED) {
+                       /* direct lighting */
+                       kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *tp, &branched_state->path_state, L);
+
+                       /* indirect light bounce */
+                       if(!kernel_path_volume_bounce(kg, &rng, sd, tp, ps, L, pray)) {
+                               continue;
+                       }
+
+                       /* start the indirect path */
+                       branched_state->next_closure = 0;
+                       branched_state->next_sample = j+1;
+                       branched_state->num_samples = num_samples;
+
+                       return true;
+               }
+#  endif
+       }
+
+       kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+
+       /* todo: avoid this calculation using decoupled ray marching */
+       float3 throughput = kernel_split_state.throughput[ray_index];
+       kernel_volume_shadow(kg, emission_sd, &kernel_split_state.path_state[ray_index], &volume_ray, &throughput);
+       kernel_split_state.throughput[ray_index] = throughput;
+
+       return false;
+}
+
+#endif  /* __BRANCHED_PATH__ && __VOLUME__ */
 
 ccl_device void kernel_do_volume(KernelGlobals *kg)
 {
@@ -23,37 +98,37 @@ ccl_device void kernel_do_volume(KernelGlobals *kg)
        /* We will empty this queue in this kernel. */
        if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
                kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+#  ifdef __BRANCHED_PATH__
+               kernel_split_params.queue_index[QUEUE_VOLUME_INDIRECT_ITER] = 0;
+#  endif  /* __BRANCHED_PATH__ */
        }
-       /* Fetch use_queues_flag. */
-       char local_use_queues_flag = *kernel_split_params.use_queues_flag;
-       ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
        int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
-       if(local_use_queues_flag) {
+
+       if(*kernel_split_params.use_queues_flag) {
                ray_index = get_ray_index(kg, ray_index,
                                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
                                          kernel_split_state.queue_data,
                                          kernel_split_params.queue_size,
                                          1);
-               if(ray_index == QUEUE_EMPTY_SLOT) {
-                       return;
-               }
        }
 
-       if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
-          IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+       ccl_global char *ray_state = kernel_split_state.ray_state;
 
-               bool hit = ! IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND);
+       PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+       ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
 
-               PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-               ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+       ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+       ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+       RNG rng = kernel_split_state.rng[ray_index];
+       ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
+       ShaderData *sd = &kernel_split_state.sd[ray_index];
+       ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
 
-               ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
-               ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
-               RNG rng = kernel_split_state.rng[ray_index];
-               ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
-               ShaderData *sd = &kernel_split_state.sd[ray_index];
-               ShaderData *sd_input = &kernel_split_state.sd_DL_shadow[ray_index];
+       if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) ||
+          IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+
+               bool hit = ! IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
 
                /* Sanitize volume stack. */
                if(!hit) {
@@ -64,31 +139,68 @@ ccl_device void kernel_do_volume(KernelGlobals *kg)
                        Ray volume_ray = *ray;
                        volume_ray.t = (hit)? isect->t: FLT_MAX;
 
-                       bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+#  ifdef __BRANCHED_PATH__
+                       if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+#  endif  /* __BRANCHED_PATH__ */
+                               bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
 
-                       {
-                               /* integrate along volume segment with distance sampling */
-                               VolumeIntegrateResult result = kernel_volume_integrate(
-                                       kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous);
+                               {
+                                       /* integrate along volume segment with distance sampling */
+                                       VolumeIntegrateResult result = kernel_volume_integrate(
+                                               kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous);
 
 #  ifdef __VOLUME_SCATTER__
-                               if(result == VOLUME_PATH_SCATTERED) {
-                                       /* direct lighting */
-                                       kernel_path_volume_connect_light(kg, &rng, sd, sd_input, *throughput, state, L);
-
-                                       /* indirect light bounce */
-                                       if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray))
-                                               ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED);
-                                       else
-                                               ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER);
+                                       if(result == VOLUME_PATH_SCATTERED) {
+                                               /* direct lighting */
+                                               kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *throughput, state, L);
+
+                                               /* indirect light bounce */
+                                               if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray)) {
+                                                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+                                               }
+                                               else {
+                                                       kernel_split_path_end(kg, ray_index);
+                                               }
+                                       }
+#  endif  /* __VOLUME_SCATTER__ */
+                               }
+
+#  ifdef __BRANCHED_PATH__
+                       }
+                       else {
+                               kernel_split_branched_path_volume_indirect_light_init(kg, ray_index);
+
+                               if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
+                                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
                                }
-#  endif
                        }
+#  endif  /* __BRANCHED_PATH__ */
                }
+
                kernel_split_state.rng[ray_index] = rng;
        }
 
-#endif
+#  ifdef __BRANCHED_PATH__
+       /* iter loop */
+       ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+                                 QUEUE_VOLUME_INDIRECT_ITER,
+                                 kernel_split_state.queue_data,
+                                 kernel_split_params.queue_size,
+                                 1);
+
+       if(IS_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER)) {
+               /* for render passes, sum and reset indirect light pass variables
+                * for the next samples */
+               path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
+               path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
+
+               if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
+                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+               }
+       }
+#  endif  /* __BRANCHED_PATH__ */
+
+#endif  /* __VOLUME__ */
 }
 
 
index 89adeb64c8a8de65d2243f155488c5486c0c767a..87498910d38177738ca0a713a0967bf67e063924 100644 (file)
@@ -52,6 +52,7 @@ CCL_NAMESPACE_BEGIN
  *   - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with
  *     flag RAY_SHADOW_RAY_CAST_AO
  */
+
 ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
         KernelGlobals *kg,
         ccl_local_param BackgroundAOLocals *locals)
@@ -62,8 +63,9 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
        }
        ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
+#ifdef __AO__
        char enqueue_flag = 0;
-       char enqueue_flag_AO_SHADOW_RAY_CAST = 0;
+#endif
        int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
        ray_index = get_ray_index(kg, ray_index,
                                  QUEUE_ACTIVE_AND_REGENERATED_RAYS,
@@ -155,8 +157,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
                                kernel_split_state.L_transparent[ray_index] += average(holdout_weight*throughput);
                        }
                        if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
-                               ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-                               enqueue_flag = 1;
+                               kernel_split_path_end(kg, ray_index);
                        }
                }
 #endif  /* __HOLDOUT__ */
@@ -164,18 +165,31 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
 
        if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
                PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-               /* Holdout mask objects do not write data passes. */
-               kernel_write_data_passes(kg,
-                                        buffer,
-                                        L,
-                                        sd,
-                                        sample,
-                                        state,
-                                        throughput);
+
+#ifdef __BRANCHED_PATH__
+               if(!IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT))
+#endif  /* __BRANCHED_PATH__ */
+               {
+                       /* Holdout mask objects do not write data passes. */
+                       kernel_write_data_passes(kg,
+                                                    buffer,
+                                                    L,
+                                                    sd,
+                                                    sample,
+                                                    state,
+                                                    throughput);
+               }
+
                /* Blurring of bsdf after bounces, for rays that have a small likelihood
                 * of following this particular path (diffuse, rough glossy.
                 */
-               if(kernel_data.integrator.filter_glossy != FLT_MAX) {
+#ifndef __BRANCHED_PATH__
+               if(kernel_data.integrator.filter_glossy != FLT_MAX)
+#else
+               if(kernel_data.integrator.filter_glossy != FLT_MAX &&
+                  (!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)))
+#endif  /* __BRANCHED_PATH__ */
+               {
                        float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
                        if(blur_pdf < 1.0f) {
                                float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
@@ -201,19 +215,32 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
                 * mainly due to the mixed in MIS that we use. gives too many unneeded
                 * shader evaluations, only need emission if we are going to terminate.
                 */
+#ifndef __BRANCHED_PATH__
                float probability = path_state_terminate_probability(kg, state, throughput);
+#else
+               float probability = 1.0f;
+
+               if(!kernel_data.integrator.branched) {
+                       probability = path_state_terminate_probability(kg, state, throughput);
+               }
+               else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+                       int num_samples = kernel_split_state.branched_state[ray_index].num_samples;
+                       probability = path_state_terminate_probability(kg, state, throughput*num_samples);
+               }
+               else if(state->flag & PATH_RAY_TRANSPARENT) {
+                       probability = path_state_terminate_probability(kg, state, throughput);
+               }
+#endif
 
                if(probability == 0.0f) {
-                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-                       enqueue_flag = 1;
+                       kernel_split_path_end(kg, ray_index);
                }
 
                if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
                        if(probability != 1.0f) {
                                float terminate = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_TERMINATE);
                                if(terminate >= probability) {
-                                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-                                       enqueue_flag = 1;
+                                       kernel_split_path_end(kg, ray_index);
                                }
                                else {
                                        kernel_split_state.throughput[ray_index] = throughput/probability;
@@ -225,61 +252,23 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
 #ifdef __AO__
        if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
                /* ambient occlusion */
-               if(kernel_data.integrator.use_ambient_occlusion ||
-                  (sd->flag & SD_AO))
-               {
-                       /* todo: solve correlation */
-                       float bsdf_u, bsdf_v;
-                       path_state_rng_2D(kg, &rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
-                       float ao_factor = kernel_data.background.ao_factor;
-                       float3 ao_N;
-                       kernel_split_state.ao_bsdf[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
-                       kernel_split_state.ao_alpha[ray_index] = shader_bsdf_alpha(kg, sd);
-
-                       float3 ao_D;
-                       float ao_pdf;
-                       sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
-                       if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
-                               Ray _ray;
-                               _ray.P = ray_offset(sd->P, sd->Ng);
-                               _ray.D = ao_D;
-                               _ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
-                               _ray.time = sd->time;
-#endif
-                               _ray.dP = sd->dP;
-                               _ray.dD = differential3_zero();
-                               kernel_split_state.ao_light_ray[ray_index] = _ray;
-
-                               ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
-                               enqueue_flag_AO_SHADOW_RAY_CAST = 1;
-                       }
+               if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
+                       enqueue_flag = 1;
                }
        }
 #endif  /* __AO__ */
-       kernel_split_state.rng[ray_index] = rng;
 
+       kernel_split_state.rng[ray_index] = rng;
 
 #ifndef __COMPUTE_DEVICE_GPU__
        }
 #endif
 
-       /* Enqueue RAY_UPDATE_BUFFER rays. */
-       enqueue_ray_index_local(ray_index,
-                               QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-                               enqueue_flag,
-                               kernel_split_params.queue_size,
-                               &locals->queue_atomics_bg,
-                               kernel_split_state.queue_data,
-                               kernel_split_params.queue_index);
-
 #ifdef __AO__
        /* Enqueue to-shadow-ray-cast rays. */
        enqueue_ray_index_local(ray_index,
                                QUEUE_SHADOW_RAY_CAST_AO_RAYS,
-                               enqueue_flag_AO_SHADOW_RAY_CAST,
+                               enqueue_flag,
                                kernel_split_params.queue_size,
                                &locals->queue_atomics_ao,
                                kernel_split_state.queue_data,
index 8192528622e2f387c95a59db7706620db23231b4..6fbc888e358f9006c064ac4c6684e9279c5e31f8 100644 (file)
@@ -34,7 +34,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
                        if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
                                ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
                                if(state->bounce > kernel_data.integrator.ao_bounces) {
-                                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+                                       kernel_split_path_end(kg, ray_index);
                                }
                        }
                }
@@ -63,7 +63,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
 #ifdef __PASSES__
                        if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
 #endif
-                               ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+                               kernel_split_path_end(kg, ray_index);
                }
 
                if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
@@ -72,7 +72,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
                        float3 L_background = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray);
                        path_radiance_accum_background(L, state, (*throughput), L_background);
 #endif
-                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+                       kernel_split_path_end(kg, ray_index);
                }
        }
 
index a56e85abeb90173f0a96c414097174aa58901cc0..82bc2f01fd7f0f42c0b0c58038dad6a6d01af92c 100644 (file)
@@ -49,26 +49,29 @@ ccl_device void kernel_indirect_subsurface(KernelGlobals *kg)
        ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
        ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
 
-       if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
-               ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
-               kernel_path_subsurface_accum_indirect(ss_indirect, L);
+#ifdef __BRANCHED_PATH__
+       if(!kernel_data.integrator.branched) {
+#endif
+               if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+                       ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
+                       kernel_path_subsurface_accum_indirect(ss_indirect, L);
 
-               /* Trace indirect subsurface rays by restarting the loop. this uses less
-                * stack memory than invoking kernel_path_indirect.
-                */
-               if(ss_indirect->num_rays) {
-                       kernel_path_subsurface_setup_indirect(kg,
-                                                             ss_indirect,
-                                                             state,
-                                                             ray,
-                                                             L,
-                                                             throughput);
-                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
-               }
-               else {
-                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+                       /* Trace indirect subsurface rays by restarting the loop. This uses less
+                        * stack memory than invoking kernel_path_indirect.
+                        */
+                       if(ss_indirect->num_rays) {
+                               kernel_path_subsurface_setup_indirect(kg,
+                                                                         ss_indirect,
+                                                                         state,
+                                                                         ray,
+                                                                         L,
+                                                                         throughput);
+                               ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+                       }
                }
+#ifdef __BRANCHED_PATH__
        }
+#endif
 
 #endif  /* __SUBSURFACE__ */
 
index 1bebc16e25bbb68ad4b9b143b91effe5f654b6c0..71017fed19e68fb0522892d04db4a4e3aa483488 100644 (file)
@@ -44,6 +44,52 @@ CCL_NAMESPACE_BEGIN
  *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
  *     RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays.
  */
+
+#ifdef __BRANCHED_PATH__
+ccl_device_inline void kernel_split_branched_indirect_light_init(KernelGlobals *kg, int ray_index)
+{
+       kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+
+       ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT);
+}
+
+ccl_device void kernel_split_branched_indirect_light_end(KernelGlobals *kg, int ray_index)
+{
+       kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+
+       ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+       ShaderData *sd = &kernel_split_state.sd[ray_index];
+       ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+       ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+
+       /* continue in case of transparency */
+       *throughput *= shader_bsdf_transparency(kg, sd);
+
+       if(is_zero(*throughput)) {
+               kernel_split_path_end(kg, ray_index);
+       }
+       else {
+               /* Update Path State */
+               state->flag |= PATH_RAY_TRANSPARENT;
+               state->transparent_bounce++;
+
+               ray->P = ray_offset(sd->P, -sd->Ng);
+               ray->t -= sd->ray_length; /* clipping works through transparent */
+
+#  ifdef __RAY_DIFFERENTIALS__
+               ray->dP = sd->dP;
+               ray->dD.dx = -sd->dI.dx;
+               ray->dD.dy = -sd->dI.dy;
+#  endif  /* __RAY_DIFFERENTIALS__ */
+
+#  ifdef __VOLUME__
+               /* enter/exit volume */
+               kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
+#  endif  /* __VOLUME__ */
+       }
+}
+#endif  /* __BRANCHED_PATH__ */
+
 ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
                                             ccl_local_param unsigned int *local_queue_atomics)
 {
@@ -67,7 +113,6 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
                kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
        }
 
-       char enqueue_flag = 0;
        int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
        ray_index = get_ray_index(kg, ray_index,
                                  QUEUE_ACTIVE_AND_REGENERATED_RAYS,
@@ -75,102 +120,125 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
                                  kernel_split_params.queue_size,
                                  0);
 
-#ifdef __COMPUTE_DEVICE_GPU__
-       /* If we are executing on a GPU device, we exit all threads that are not
-        * required.
-        *
-        * If we are executing on a CPU device, then we need to keep all threads
-        * active since we have barrier() calls later in the kernel. CPU devices,
-        * expect all threads to execute barrier statement.
-        */
-       if(ray_index == QUEUE_EMPTY_SLOT) {
-               return;
-       }
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
-       if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-
-       /* Load ShaderData structure. */
-       PathRadiance *L = NULL;
-       ccl_global PathState *state = NULL;
        ccl_global char *ray_state = kernel_split_state.ray_state;
 
-       /* Path radiance update for AO/Direct_lighting's shadow blocked. */
-       if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) ||
-          IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO))
-       {
-               state = &kernel_split_state.path_state[ray_index];
-               L = &kernel_split_state.path_radiance[ray_index];
-               float3 _throughput = kernel_split_state.throughput[ray_index];
-
-               if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
-                       float3 shadow = kernel_split_state.ao_light_ray[ray_index].P;
-                       // TODO(mai): investigate correctness here
-                       char update_path_radiance = (char)kernel_split_state.ao_light_ray[ray_index].t;
-                       if(update_path_radiance) {
-                               path_radiance_accum_ao(L,
-                                                      _throughput,
-                                                      kernel_split_state.ao_alpha[ray_index],
-                                                      kernel_split_state.ao_bsdf[ray_index],
-                                                      shadow,
-                                                      state->bounce);
-                       }
-                       else {
-                               path_radiance_accum_total_ao(L, _throughput, kernel_split_state.ao_bsdf[ray_index]);
+       bool active = IS_STATE(ray_state, ray_index, RAY_ACTIVE);
+       if(active) {
+               ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+               ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+               RNG rng = kernel_split_state.rng[ray_index];
+               ShaderData *sd = &kernel_split_state.sd[ray_index];
+               ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+               PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+
+#ifdef __BRANCHED_PATH__
+               if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+#endif
+                       /* Compute direct lighting and next bounce. */
+                       if(!kernel_path_surface_bounce(kg, &rng, sd, throughput, state, L, ray)) {
+                               kernel_split_path_end(kg, ray_index);
                        }
-                       REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
+#ifdef __BRANCHED_PATH__
                }
-
-               if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
-                       float3 shadow = kernel_split_state.light_ray[ray_index].P;
-                       // TODO(mai): investigate correctness here
-                       char update_path_radiance = (char)kernel_split_state.light_ray[ray_index].t;
-                       BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
-                       if(update_path_radiance) {
-                               path_radiance_accum_light(L,
-                                                         _throughput,
-                                                         &L_light,
-                                                         shadow,
-                                                         1.0f,
-                                                         state->bounce,
-                                                         kernel_split_state.is_lamp[ray_index]);
+               else {
+                       kernel_split_branched_indirect_light_init(kg, ray_index);
+
+                       if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+                                                                                 ray_index,
+                                                                                 1.0f,
+                                                                                 &kernel_split_state.branched_state[ray_index].sd,
+                                                                                 true))
+                       {
+                               ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
                        }
                        else {
-                               path_radiance_accum_total_light(L, _throughput, &L_light);
+                               kernel_split_branched_indirect_light_end(kg, ray_index);
                        }
-                       REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
                }
+#endif  /* __BRANCHED_PATH__ */
+
+               kernel_split_state.rng[ray_index] = rng;
        }
 
-       if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-               ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
-               ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
-               RNG rng = kernel_split_state.rng[ray_index];
-               state = &kernel_split_state.path_state[ray_index];
-               L = &kernel_split_state.path_radiance[ray_index];
+       /* Enqueue RAY_UPDATE_BUFFER rays. */
+       enqueue_ray_index_local(ray_index,
+                               QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+                               IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER) && active,
+                               kernel_split_params.queue_size,
+                               local_queue_atomics,
+                               kernel_split_state.queue_data,
+                               kernel_split_params.queue_index);
+
+#ifdef __BRANCHED_PATH__
+       /* iter loop */
+       if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+               kernel_split_params.queue_index[QUEUE_LIGHT_INDIRECT_ITER] = 0;
+       }
 
-               /* Compute direct lighting and next bounce. */
-               if(!kernel_path_surface_bounce(kg, &rng, &kernel_split_state.sd[ray_index], throughput, state, L, ray)) {
-                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-                       enqueue_flag = 1;
+       ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+                                 QUEUE_LIGHT_INDIRECT_ITER,
+                                 kernel_split_state.queue_data,
+                                 kernel_split_params.queue_size,
+                                 1);
+
+       if(IS_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER)) {
+               /* for render passes, sum and reset indirect light pass variables
+                * for the next samples */
+               PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+
+               path_radiance_sum_indirect(L);
+               path_radiance_reset_indirect(L);
+
+               if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+                                                                         ray_index,
+                                                                         1.0f,
+                                                                         &kernel_split_state.branched_state[ray_index].sd,
+                                                                         true))
+               {
+                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+               }
+               else {
+                       kernel_split_branched_indirect_light_end(kg, ray_index);
                }
-               kernel_split_state.rng[ray_index] = rng;
        }
 
-#ifndef __COMPUTE_DEVICE_GPU__
+#  ifdef __VOLUME__
+       /* Enqueue RAY_VOLUME_INDIRECT_NEXT_ITER rays */
+       ccl_barrier(CCL_LOCAL_MEM_FENCE);
+       if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+               *local_queue_atomics = 0;
        }
-#endif
+       ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
-       /* Enqueue RAY_UPDATE_BUFFER rays. */
+       ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
        enqueue_ray_index_local(ray_index,
-                               QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-                               enqueue_flag,
+                               QUEUE_VOLUME_INDIRECT_ITER,
+                               IS_STATE(kernel_split_state.ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER),
+                               kernel_split_params.queue_size,
+                               local_queue_atomics,
+                               kernel_split_state.queue_data,
+                               kernel_split_params.queue_index);
+
+#  endif  /* __VOLUME__ */
+
+#  ifdef __SUBSURFACE__
+       /* Enqueue RAY_SUBSURFACE_INDIRECT_NEXT_ITER rays */
+       ccl_barrier(CCL_LOCAL_MEM_FENCE);
+       if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+               *local_queue_atomics = 0;
+       }
+       ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+       ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+       enqueue_ray_index_local(ray_index,
+                               QUEUE_SUBSURFACE_INDIRECT_ITER,
+                               IS_STATE(kernel_split_state.ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER),
                                kernel_split_params.queue_size,
                                local_queue_atomics,
                                kernel_split_state.queue_data,
                                kernel_split_params.queue_index);
+#  endif  /* __SUBSURFACE__ */
+#endif  /* __BRANCHED_PATH__ */
 }
 
 CCL_NAMESPACE_END
index 0f1696e34a05c287bba7523a1cb6d72e43713a4e..957d70958d935730cfe1f4e4aa9703223f511899 100644 (file)
@@ -38,8 +38,10 @@ ccl_device void kernel_shader_eval(KernelGlobals *kg,
                                  kernel_split_params.queue_size,
                                  0);
 
+       ccl_global char *ray_state = kernel_split_state.ray_state;
+
        char enqueue_flag = 0;
-       if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) {
+       if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
                enqueue_flag = 1;
        }
 
@@ -52,7 +54,7 @@ ccl_device void kernel_shader_eval(KernelGlobals *kg,
                                kernel_split_params.queue_index);
 
        /* Continue on with shader evaluation. */
-       if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
+       if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
                Intersection isect = kernel_split_state.isect[ray_index];
                RNG rng = kernel_split_state.rng[ray_index];
                ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
@@ -62,8 +64,27 @@ ccl_device void kernel_shader_eval(KernelGlobals *kg,
                                      &kernel_split_state.sd[ray_index],
                                      &isect,
                                      &ray);
+
+#ifndef __BRANCHED_PATH__
                float rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF);
                shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN);
+#else
+               ShaderContext ctx = SHADER_CONTEXT_MAIN;
+               float rbsdf = 0.0f;
+
+               if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+                       rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF);
+
+               }
+
+               if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+                       ctx = SHADER_CONTEXT_INDIRECT;
+               }
+
+               shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, ctx);
+               shader_merge_closures(&kernel_split_state.sd[ray_index]);
+#endif  /* __BRANCHED_PATH__ */
+
                kernel_split_state.rng[ray_index] = rng;
        }
 }
index 4243e18de72f46a74f23b50f53c1a11b77b5bb5b..474286285a9c9fd484ad4629ab52af07ba7decc0 100644 (file)
@@ -29,31 +29,29 @@ ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg)
                                          kernel_split_state.queue_data, kernel_split_params.queue_size, 1);
        }
 
-       if(ray_index == QUEUE_EMPTY_SLOT)
+       if(ray_index == QUEUE_EMPTY_SLOT) {
                return;
+       }
 
-       /* Flag determining if we need to update L. */
-       char update_path_radiance = 0;
-
-       if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
-               ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
-               ccl_global Ray *light_ray_global = &kernel_split_state.ao_light_ray[ray_index];
-
-               float3 shadow;
-               Ray ray = *light_ray_global;
-               update_path_radiance = !(shadow_blocked(kg,
-                                                       &kernel_split_state.sd_DL_shadow[ray_index],
-                                                       state,
-                                                       &ray,
-                                                       &shadow));
-
-               *light_ray_global = ray;
-               /* We use light_ray_global's P and t to store shadow and
-                * update_path_radiance.
-                */
-               light_ray_global->P = shadow;
-               light_ray_global->t = update_path_radiance;
+       ShaderData *sd = &kernel_split_state.sd[ray_index];
+       ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+       PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+       ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+       RNG rng = kernel_split_state.rng[ray_index];
+       float3 throughput = kernel_split_state.throughput[ray_index];
+
+#ifdef __BRANCHED_PATH__
+       if(!kernel_data.integrator.branched || IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+#endif
+               kernel_path_ao(kg, sd, emission_sd, L, state, &rng, throughput, shader_bsdf_alpha(kg, sd));
+#ifdef __BRANCHED_PATH__
+       }
+       else {
+               kernel_branched_path_ao(kg, sd, emission_sd, L, state, &rng, throughput);
        }
+#endif
+
+       kernel_split_state.rng[ray_index] = rng;
 }
 
 CCL_NAMESPACE_END
index bb8f01579653eebf29f822fff6ae1530d9a8b34f..452b6e45a36c1994696deaa05f440ad9c4c56478 100644 (file)
@@ -32,28 +32,71 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
        if(ray_index == QUEUE_EMPTY_SLOT)
                return;
 
-       /* Flag determining if we need to update L. */
-       char update_path_radiance = 0;
+       ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+       Ray ray = kernel_split_state.light_ray[ray_index];
+       PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+       ShaderData *sd = &kernel_split_state.sd[ray_index];
+       float3 throughput = kernel_split_state.throughput[ray_index];
+       RNG rng = kernel_split_state.rng[ray_index];
 
-       if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
-               ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
-               ccl_global Ray *light_ray_global = &kernel_split_state.light_ray[ray_index];
+       BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
+       ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+       bool is_lamp = kernel_split_state.is_lamp[ray_index];
 
+#  if defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)
+       bool use_branched = false;
+       int all = 0;
+
+       if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+               use_branched = true;
+               all = 1;
+       }
+#    if defined(__BRANCHED_PATH__)
+       else if(kernel_data.integrator.branched) {
+               use_branched = true;
+
+               if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+                       all = (kernel_data.integrator.sample_all_lights_indirect);
+               }
+               else
+               {
+                       all = (kernel_data.integrator.sample_all_lights_direct);
+               }
+       }
+#    endif  /* __BRANCHED_PATH__ */
+
+       if(use_branched) {
+               kernel_branched_path_surface_connect_light(kg,
+                                                          &rng,
+                                                          sd,
+                                                          emission_sd,
+                                                          state,
+                                                          throughput,
+                                                          1.0f,
+                                                          L,
+                                                          all);
+       }
+       else
+#  endif  /* defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__) */
+       {
+               /* trace shadow ray */
                float3 shadow;
-               Ray ray = *light_ray_global;
-               update_path_radiance = !(shadow_blocked(kg,
-                                                       &kernel_split_state.sd_DL_shadow[ray_index],
-                                                       state,
-                                                       &ray,
-                                                       &shadow));
-
-               *light_ray_global = ray;
-               /* We use light_ray_global's P and t to store shadow and
-                * update_path_radiance.
-                */
-               light_ray_global->P = shadow;
-               light_ray_global->t = update_path_radiance;
+
+               if(!shadow_blocked(kg,
+                                      emission_sd,
+                                      state,
+                                      &ray,
+                                      &shadow))
+               {
+                       /* accumulate */
+                       path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+               }
+               else {
+                       path_radiance_accum_total_light(L, throughput, &L_light);
+               }
        }
+
+       kernel_split_state.rng[ray_index] = rng;
 }
 
 CCL_NAMESPACE_END
index 4303ba0a905845df82b10712ff89c09b167ffa64..57f070d51e0ce891787b88dd6adb5c27c5a97ac6 100644 (file)
 
 #include "util/util_atomic.h"
 
-#include "kernel/kernel_random.h"
-#include "kernel/kernel_projection.h"
-#include "kernel/kernel_montecarlo.h"
-#include "kernel/kernel_differential.h"
-#include "kernel/kernel_camera.h"
-
-#include "kernel/geom/geom.h"
-#include "kernel/bvh/bvh.h"
-
-#include "kernel/kernel_accumulate.h"
-#include "kernel/kernel_shader.h"
-#include "kernel/kernel_light.h"
-#include "kernel/kernel_passes.h"
-
-#ifdef __SUBSURFACE__
-#  include "kernel/kernel_subsurface.h"
+#include "kernel/kernel_path.h"
+#ifdef __BRANCHED_PATH__
+#  include "kernel/kernel_path_branched.h"
 #endif
 
-#ifdef __VOLUME__
-#  include "kernel/kernel_volume.h"
-#endif
+#include "kernel/kernel_queues.h"
+#include "kernel/kernel_work_stealing.h"
 
-#include "kernel/kernel_path_state.h"
-#include "kernel/kernel_shadow.h"
-#include "kernel/kernel_emission.h"
-#include "kernel/kernel_path_common.h"
-#include "kernel/kernel_path_surface.h"
-#include "kernel/kernel_path_volume.h"
-#include "kernel/kernel_path_subsurface.h"
+#ifdef __BRANCHED_PATH__
+#  include "kernel/split/kernel_branched.h"
+#endif
 
-#ifdef __KERNEL_DEBUG__
-#  include "kernel/kernel_debug.h"
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index)
+{
+       ccl_global char *ray_state = kernel_split_state.ray_state;
+
+#ifdef __BRANCHED_PATH__
+       if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) {
+               ASSIGN_RAY_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER);
+       }
+       else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT)) {
+               ASSIGN_RAY_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER);
+       }
+       else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT)) {
+               ASSIGN_RAY_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER);
+       }
+       else {
+               ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+       }
+#else
+       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
 #endif
+}
 
-#include "kernel/kernel_queues.h"
-#include "kernel/kernel_work_stealing.h"
+CCL_NAMESPACE_END
 
 #endif  /* __KERNEL_SPLIT_H__ */
index 0af8bfc89d5a5136152eac51f5ad9456ae29877f..a2cb4f6ae98d208566468117baeb1a046b74b0b3 100644 (file)
@@ -62,7 +62,46 @@ typedef struct SplitParams {
        SPLIT_DATA_ENTRY(DebugData, debug_data, 1)
 #else
 #  define SPLIT_DATA_DEBUG_ENTRIES
-#endif
+#endif  /* DEBUG */
+
+#ifdef __BRANCHED_PATH__
+
+typedef ccl_global struct SplitBranchedState {
+       /* various state that must be kept and restored after an indirect loop */
+       PathState path_state;
+       float3 throughput;
+       Ray ray;
+
+       struct ShaderData sd;
+       Intersection isect;
+
+       char ray_state;
+
+       /* indirect loop state */
+       int next_closure;
+       int next_sample;
+       int num_samples;
+
+#ifdef __SUBSURFACE__
+       int ss_next_closure;
+       int ss_next_sample;
+       int next_hit;
+       int num_hits;
+
+       uint lcg_state;
+       SubsurfaceIntersection ss_isect;
+
+#  ifdef __VOLUME__
+       VolumeStack volume_stack[VOLUME_STACK_SIZE];
+#  endif  /* __VOLUME__ */
+#endif  /* __SUBSURFACE__ */
+} SplitBranchedState;
+
+#define SPLIT_DATA_BRANCHED_ENTRIES \
+       SPLIT_DATA_ENTRY(SplitBranchedState, branched_state, 1)
+#else
+#define SPLIT_DATA_BRANCHED_ENTRIES
+#endif  /* __BRANCHED_PATH__ */
 
 #define SPLIT_DATA_ENTRIES \
        SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \
@@ -72,9 +111,6 @@ typedef struct SplitParams {
        SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
        SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
        SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
-       SPLIT_DATA_ENTRY(ccl_global float3, ao_alpha, 1) \
-       SPLIT_DATA_ENTRY(ccl_global float3, ao_bsdf, 1) \
-       SPLIT_DATA_ENTRY(ccl_global Ray, ao_light_ray, 1) \
        SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
        SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
        SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
@@ -82,6 +118,7 @@ typedef struct SplitParams {
        SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \
        SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
        SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \
+       SPLIT_DATA_BRANCHED_ENTRIES \
        SPLIT_DATA_DEBUG_ENTRIES \
 
 /* struct that holds pointers to data in the shared state buffer */
index 0b4d50c70eea713521ed0c2f5e1c1fb0047027b5..8364f185d75b97cb41e78e385af35f9bed1cf74e 100644 (file)
 
 CCL_NAMESPACE_BEGIN
 
+#if defined(__BRANCHED_PATH__) && defined(__SUBSURFACE__)
 
-ccl_device void kernel_subsurface_scatter(KernelGlobals *kg,
-                                          ccl_local_param unsigned int* local_queue_atomics)
+ccl_device_inline void kernel_split_branched_path_subsurface_indirect_light_init(KernelGlobals *kg, int ray_index)
 {
-#ifdef __SUBSURFACE__
-       if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
-               *local_queue_atomics = 0;
+       kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+
+       SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+       branched_state->ss_next_closure = 0;
+       branched_state->ss_next_sample = 0;
+
+       branched_state->num_hits = 0;
+       branched_state->next_hit = 0;
+
+       ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT);
+}
+
+ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_iter(KernelGlobals *kg, int ray_index)
+{
+       SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+       ShaderData *sd = &branched_state->sd;
+       RNG rng = kernel_split_state.rng[ray_index];
+       PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+       ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
+       for(int i = branched_state->ss_next_closure; i < sd->num_closure; i++) {
+               ShaderClosure *sc = &sd->closure[i];
+
+               if(!CLOSURE_IS_BSSRDF(sc->type))
+                       continue;
+
+               /* set up random number generator */
+               if(branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 &&
+                  branched_state->next_closure == 0 && branched_state->next_sample == 0)
+               {
+                       branched_state->lcg_state = lcg_state_init(&rng,
+                                                                  branched_state->path_state.rng_offset,
+                                                                  branched_state->path_state.sample,
+                                                                  0x68bc21eb);
+               }
+               int num_samples = kernel_data.integrator.subsurface_samples;
+               float num_samples_inv = 1.0f/num_samples;
+               RNG bssrdf_rng = cmj_hash(rng, i);
+
+               /* do subsurface scatter step with copy of shader data, this will
+                * replace the BSSRDF with a diffuse BSDF closure */
+               for(int j = branched_state->ss_next_sample; j < num_samples; j++) {
+                       ccl_global SubsurfaceIntersection *ss_isect = &branched_state->ss_isect;
+                       float bssrdf_u, bssrdf_v;
+                       path_branched_rng_2D(kg,
+                                            &bssrdf_rng,
+                                            &branched_state->path_state,
+                                            j,
+                                            num_samples,
+                                            PRNG_BSDF_U,
+                                            &bssrdf_u,
+                                            &bssrdf_v);
+
+                       /* intersection is expensive so avoid doing multiple times for the same input */
+                       if(branched_state->next_hit == 0 && branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+                               RNG lcg_state = branched_state->lcg_state;
+                               SubsurfaceIntersection ss_isect_private;
+
+                               branched_state->num_hits = subsurface_scatter_multi_intersect(kg,
+                                                                                             &ss_isect_private,
+                                                                                             sd,
+                                                                                             sc,
+                                                                                             &lcg_state,
+                                                                                             bssrdf_u, bssrdf_v,
+                                                                                             true);
+
+                               branched_state->lcg_state = lcg_state;
+                               *ss_isect = ss_isect_private;
+                       }
+
+#ifdef __VOLUME__
+                       Ray volume_ray = branched_state->ray;
+                       bool need_update_volume_stack =
+                               kernel_data.integrator.use_volumes &&
+                               sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
+#endif  /* __VOLUME__ */
+
+                       /* compute lighting with the BSDF closure */
+                       for(int hit = branched_state->next_hit; hit < branched_state->num_hits; hit++) {
+                               ShaderData *bssrdf_sd = &kernel_split_state.sd[ray_index];
+                               *bssrdf_sd = *sd; /* note: copy happens each iteration of inner loop, this is
+                                                  * important as the indirect path will write into bssrdf_sd */
+
+                               SubsurfaceIntersection ss_isect_private = *ss_isect;
+                               subsurface_scatter_multi_setup(kg,
+                                                              &ss_isect_private,
+                                                              hit,
+                                                              bssrdf_sd,
+                                                              &branched_state->path_state,
+                                                              branched_state->path_state.flag,
+                                                              sc,
+                                                              true);
+                               *ss_isect = ss_isect_private;
+
+                               ccl_global PathState *hit_state = &kernel_split_state.path_state[ray_index];
+                               *hit_state = branched_state->path_state;
+
+                               path_state_branch(hit_state, j, num_samples);
+
+#ifdef __VOLUME__
+                               if(need_update_volume_stack) {
+                                       /* Setup ray from previous surface point to the new one. */
+                                       float3 P = ray_offset(bssrdf_sd->P, -bssrdf_sd->Ng);
+                                       volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t);
+
+                                       /* this next part is expensive as it does scene intersection so only do once */
+                                       if(branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+                                               for(int k = 0; k < VOLUME_STACK_SIZE; k++) {
+                                                       branched_state->volume_stack[k] = hit_state->volume_stack[k];
+                                               }
+
+                                               kernel_volume_stack_update_for_subsurface(kg,
+                                                                                         emission_sd,
+                                                                                         &volume_ray,
+                                                                                         branched_state->volume_stack);
+                                       }
+
+                                       for(int k = 0; k < VOLUME_STACK_SIZE; k++) {
+                                               hit_state->volume_stack[k] = branched_state->volume_stack[k];
+                                       }
+                               }
+#endif  /* __VOLUME__ */
+
+#ifdef __EMISSION__
+                               if(branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+                                       /* direct light */
+                                       if(kernel_data.integrator.use_direct_light) {
+                                               int all = (kernel_data.integrator.sample_all_lights_direct) ||
+                                                             (branched_state->path_state.flag & PATH_RAY_SHADOW_CATCHER);
+                                               kernel_branched_path_surface_connect_light(kg,
+                                                                                          &rng,
+                                                                                          bssrdf_sd,
+                                                                                          emission_sd,
+                                                                                          hit_state,
+                                                                                          branched_state->throughput,
+                                                                                          num_samples_inv,
+                                                                                          L,
+                                                                                          all);
+                                       }
+                               }
+#endif  /* __EMISSION__ */
+
+                               /* indirect light */
+                               if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+                                                                                         ray_index,
+                                                                                         num_samples_inv,
+                                                                                         bssrdf_sd,
+                                                                                         false))
+                               {
+                                       branched_state->ss_next_closure = i;
+                                       branched_state->ss_next_sample = j;
+                                       branched_state->next_hit = hit;
+
+                                       return true;
+                               }
+
+                               branched_state->next_closure = 0;
+                       }
+
+                       branched_state->next_hit = 0;
+               }
+
+               branched_state->ss_next_sample = 0;
+       }
+
+       kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+
+       return false;
+}
+
+#endif  /* __BRANCHED_PATH__ && __SUBSURFACE__ */
+
+ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
+{
+       int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+       if(thread_index == 0) {
+               /* We will empty both queues in this kernel. */
+               kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+               kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
        }
-       ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
        int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
        ray_index = get_ray_index(kg, ray_index,
                                  QUEUE_ACTIVE_AND_REGENERATED_RAYS,
                                  kernel_split_state.queue_data,
                                  kernel_split_params.queue_size,
-                                 0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
-       /* If we are executing on a GPU device, we exit all threads that are not
-        * required.
-        *
-        * If we are executing on a CPU device, then we need to keep all threads
-        * active since we have barrier() calls later in the kernel. CPU devices,
-        * expect all threads to execute barrier statement.
-        */
-       if(ray_index == QUEUE_EMPTY_SLOT) {
-               return;
-       }
-#endif
-
-       char enqueue_flag = 0;
-
-#ifndef __COMPUTE_DEVICE_GPU__
-       if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
+                                 1);
+       get_ray_index(kg, thread_index,
+                     QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+                     kernel_split_state.queue_data,
+                     kernel_split_params.queue_size,
+                     1);
 
+#ifdef __SUBSURFACE__
        ccl_global char *ray_state = kernel_split_state.ray_state;
        ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
        PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
@@ -64,34 +228,85 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg,
 
        if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
                if(sd->flag & SD_BSSRDF) {
-                       if(kernel_path_subsurface_scatter(kg,
-                                                         sd,
-                                                         emission_sd,
-                                                         L,
-                                                         state,
-                                                         &rng,
-                                                         ray,
-                                                         throughput,
-                                                         ss_indirect)) {
-                               ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-                               enqueue_flag = 1;
+
+#ifdef __BRANCHED_PATH__
+                       if(!kernel_data.integrator.branched) {
+#endif
+                               if(kernel_path_subsurface_scatter(kg,
+                                                                     sd,
+                                                                     emission_sd,
+                                                                     L,
+                                                                     state,
+                                                                     &rng,
+                                                                     ray,
+                                                                     throughput,
+                                                                     ss_indirect)) {
+                                       kernel_split_path_end(kg, ray_index);
+                               }
+#ifdef __BRANCHED_PATH__
+                       }
+                       else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+                               float bssrdf_probability;
+                               ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
+
+                               /* modify throughput for picking bssrdf or bsdf */
+                               *throughput *= bssrdf_probability;
+
+                               /* do bssrdf scatter step if we picked a bssrdf closure */
+                               if(sc) {
+                                       uint lcg_state = lcg_state_init(&rng, state->rng_offset, state->sample, 0x68bc21eb);
+
+                                       float bssrdf_u, bssrdf_v;
+                                       path_state_rng_2D(kg,
+                                                             &rng,
+                                                             state,
+                                                             PRNG_BSDF_U,
+                                                             &bssrdf_u, &bssrdf_v);
+                                       subsurface_scatter_step(kg,
+                                                                   sd,
+                                                                   state,
+                                                                   state->flag,
+                                                                   sc,
+                                                                   &lcg_state,
+                                                                   bssrdf_u, bssrdf_v,
+                                                                   false);
+                               }
+                       }
+                       else {
+                               kernel_split_branched_path_subsurface_indirect_light_init(kg, ray_index);
+
+                               if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
+                                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+                               }
                        }
+#endif
                }
                kernel_split_state.rng[ray_index] = rng;
        }
 
-#ifndef __COMPUTE_DEVICE_GPU__
+#  ifdef __BRANCHED_PATH__
+       if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+               kernel_split_params.queue_index[QUEUE_SUBSURFACE_INDIRECT_ITER] = 0;
        }
-#endif
 
-       /* Enqueue RAY_UPDATE_BUFFER rays. */
-       enqueue_ray_index_local(ray_index,
-                               QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-                               enqueue_flag,
-                               kernel_split_params.queue_size,
-                               local_queue_atomics,
-                               kernel_split_state.queue_data,
-                               kernel_split_params.queue_index);
+       /* iter loop */
+       ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+                                 QUEUE_SUBSURFACE_INDIRECT_ITER,
+                                 kernel_split_state.queue_data,
+                                 kernel_split_params.queue_size,
+                                 1);
+
+       if(IS_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER)) {
+               /* for render passes, sum and reset indirect light pass variables
+                * for the next samples */
+               path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
+               path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
+
+               if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
+                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+               }
+       }
+#  endif  /* __BRANCHED_PATH__ */
 
 #endif  /* __SUBSURFACE__ */