Cycles: Define ccl_local variables in kernel functions
author Sergey Sharybin <sergey.vfx@gmail.com>
Wed, 8 Mar 2017 12:34:29 +0000 (13:34 +0100)
committer Sergey Sharybin <sergey.vfx@gmail.com>
Thu, 16 Mar 2017 10:27:17 +0000 (11:27 +0100)
Declaring ccl_local in a device function is not supported
by certain compilers.

18 files changed:
intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
intern/cycles/kernel/kernels/cuda/kernel_split.cu
intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
intern/cycles/kernel/split/kernel_buffer_update.h
intern/cycles/kernel/split/kernel_direct_lighting.h
intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
intern/cycles/kernel/split/kernel_indirect_background.h
intern/cycles/kernel/split/kernel_next_iteration_setup.h
intern/cycles/kernel/split/kernel_queue_enqueue.h
intern/cycles/kernel/split/kernel_shader_eval.h
intern/cycles/kernel/split/kernel_split_data_types.h
intern/cycles/kernel/split/kernel_subsurface_scatter.h

index ba6b10339153e098e5d98752a88c89e65722a50f..e220d8573842564a947c7cf8f97e3834a638e808 100644 (file)
@@ -168,21 +168,28 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
                kernel_##name(kg); \
        }
 
+#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
+       void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
+       { \
+               ccl_local type locals; \
+               kernel_##name(kg, &locals); \
+       }
+
 DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
 DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
 DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
 DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
-DEFINE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
 DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
-DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
-DEFINE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
-DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
-DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
 DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
 DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
-DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
 DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
-DEFINE_SPLIT_KERNEL_FUNCTION(buffer_update)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
 
 void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func))
 {
index fbdf79697d5adaabbce8938d140ae97622d30716..4479a044921e547e17f030e9918ddf77c1227648 100644 (file)
@@ -93,21 +93,30 @@ kernel_cuda_path_trace_data_init(
                kernel_##name(NULL); \
        }
 
+#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
+       extern "C" __global__ void \
+       CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \
+       kernel_cuda_##name() \
+       { \
+               ccl_local type locals; \
+               kernel_##name(NULL, &locals); \
+       }
+
 DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
 DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
 DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
 DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
-DEFINE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
 DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
-DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
-DEFINE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
-DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
-DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
 DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
 DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
-DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
 DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
-DEFINE_SPLIT_KERNEL_FUNCTION(buffer_update)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
index 3c25d1d85a236cca64f2adb7e136e379c5c230e3..b61f1cda330b72f80be93fa77192937646e4eae8 100644 (file)
@@ -22,5 +22,6 @@ __kernel void kernel_ocl_path_trace_buffer_update(
         ccl_global char *kg,
         ccl_constant KernelData *data)
 {
-       kernel_buffer_update((KernelGlobals*)kg);
+       ccl_local unsigned int local_queue_atomics;
+       kernel_buffer_update((KernelGlobals*)kg, &local_queue_atomics);
 }
index 942a80f94f55eac237232e8d638f9e5a630dab56..374be6cbd05eaa200f43197c10885cbdd4d181ab 100644 (file)
@@ -22,5 +22,6 @@ __kernel void kernel_ocl_path_trace_direct_lighting(
         ccl_global char *kg,
         ccl_constant KernelData *data)
 {
-       kernel_direct_lighting((KernelGlobals*)kg);
+       ccl_local unsigned int local_queue_atomics;
+       kernel_direct_lighting((KernelGlobals*)kg, &local_queue_atomics);
 }
index 209080fecd63ae7756084e9b47d3975874b94eab..351687e2036d2b5f20ef32e2a52281c1513495ca 100644 (file)
@@ -22,5 +22,8 @@ __kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao
         ccl_global char *kg,
         ccl_constant KernelData *data)
 {
-       kernel_holdout_emission_blurring_pathtermination_ao((KernelGlobals*)kg);
+       ccl_local BackgroundAOLocals locals;
+       kernel_holdout_emission_blurring_pathtermination_ao(
+               (KernelGlobals*)kg,
+               &locals);
 }
index 2a007e39c33314cd9df44be47a564db23a2945cf..fd49ed5def8b3fed9bb89f0b463737c12479aea7 100644 (file)
@@ -22,5 +22,6 @@ __kernel void kernel_ocl_path_trace_next_iteration_setup(
         ccl_global char *kg,
         ccl_constant KernelData *data)
 {
-       kernel_next_iteration_setup((KernelGlobals*)kg);
+       ccl_local unsigned int local_queue_atomics;
+       kernel_next_iteration_setup((KernelGlobals*)kg, &local_queue_atomics);
 }
index 19074db1b811f6f179896163c7d10cdad472b747..6dd9d39c4e2f4f7e4f34225801604adceff98647 100644 (file)
@@ -22,5 +22,6 @@ __kernel void kernel_ocl_path_trace_queue_enqueue(
         ccl_global char *kg,
         ccl_constant KernelData *data)
 {
-       kernel_queue_enqueue((KernelGlobals*)kg);
+       ccl_local QueueEnqueueLocals locals;
+       kernel_queue_enqueue((KernelGlobals*)kg, &locals);
 }
index 534d37f695bcc72598e090c997ec3533e3fbf6aa..71ac2886978781b314a44ba80f720fe9d9803c91 100644 (file)
@@ -22,5 +22,6 @@ __kernel void kernel_ocl_path_trace_shader_eval(
         ccl_global char *kg,
         ccl_constant KernelData *data)
 {
-       kernel_shader_eval((KernelGlobals*)kg);
+       ccl_local unsigned int local_queue_atomics;
+       kernel_shader_eval((KernelGlobals*)kg, &local_queue_atomics);
 }
index 34a01bbdfe360324624eef33adbb708f24b9c8fd..853bba2efc5c6ef8ccf52f7a8bfcca45325e159e 100644 (file)
@@ -22,5 +22,6 @@ __kernel void kernel_ocl_path_trace_subsurface_scatter(
         ccl_global char *kg,
         ccl_constant KernelData *data)
 {
-       kernel_subsurface_scatter((KernelGlobals*)kg);
+       ccl_local unsigned int local_queue_atomics;
+       kernel_subsurface_scatter((KernelGlobals*)kg, &local_queue_atomics);
 }
index e8f574c55467ca0f18ba1a0f9f54ac0a8160241f..f36899b884a8ba8b85c553a3fa8ee036ff3397cb 100644 (file)
@@ -38,11 +38,11 @@ CCL_NAMESPACE_BEGIN
  *     RAY_REGENERATED rays.
  *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
  */
-ccl_device void kernel_buffer_update(KernelGlobals *kg)
+ccl_device void kernel_buffer_update(KernelGlobals *kg,
+                                     ccl_local_param unsigned int *local_queue_atomics)
 {
-       ccl_local unsigned int local_queue_atomics;
        if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
-               local_queue_atomics = 0;
+               *local_queue_atomics = 0;
        }
        ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
@@ -188,7 +188,7 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg)
                                QUEUE_ACTIVE_AND_REGENERATED_RAYS,
                                enqueue_flag,
                                kernel_split_params.queue_size,
-                               &local_queue_atomics,
+                               local_queue_atomics,
                                kernel_split_state.queue_data,
                                kernel_split_params.queue_index);
 }
index dfe461fb35739274f0513c265b574ee40e6c6f6a..3d062cf0e2b81e9f60c1f09e061add3deff0bada 100644 (file)
@@ -40,11 +40,11 @@ CCL_NAMESPACE_BEGIN
  *   shadow_blocked function must be executed, after this kernel call
  *    Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty.
  */
-ccl_device void kernel_direct_lighting(KernelGlobals *kg)
+ccl_device void kernel_direct_lighting(KernelGlobals *kg,
+                                       ccl_local_param unsigned int *local_queue_atomics)
 {
-       ccl_local unsigned int local_queue_atomics;
        if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
-               local_queue_atomics = 0;
+               *local_queue_atomics = 0;
        }
        ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
@@ -130,7 +130,7 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg)
                                QUEUE_SHADOW_RAY_CAST_DL_RAYS,
                                enqueue_flag,
                                kernel_split_params.queue_size,
-                               &local_queue_atomics,
+                               local_queue_atomics,
                                kernel_split_state.queue_data,
                                kernel_split_params.queue_index);
 #endif
index bb948ad24b0775954e96976f5ea59956f823521b..e4bf513ffdc89671cf6f1d71cf1d4f010f333dc8 100644 (file)
@@ -52,13 +52,13 @@ CCL_NAMESPACE_BEGIN
  *   - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with
  *     flag RAY_SHADOW_RAY_CAST_AO
  */
-ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(KernelGlobals *kg)
+ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
+        KernelGlobals *kg,
+        ccl_local_param BackgroundAOLocals *locals)
 {
-       ccl_local unsigned int local_queue_atomics_bg;
-       ccl_local unsigned int local_queue_atomics_ao;
        if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
-               local_queue_atomics_bg = 0;
-               local_queue_atomics_ao = 0;
+               locals->queue_atomics_bg = 0;
+               locals->queue_atomics_ao = 0;
        }
        ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
@@ -253,7 +253,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(KernelGlobal
                                QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
                                enqueue_flag,
                                kernel_split_params.queue_size,
-                               &local_queue_atomics_bg,
+                               &locals->queue_atomics_bg,
                                kernel_split_state.queue_data,
                                kernel_split_params.queue_index);
 
@@ -263,7 +263,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(KernelGlobal
                                QUEUE_SHADOW_RAY_CAST_AO_RAYS,
                                enqueue_flag_AO_SHADOW_RAY_CAST,
                                kernel_split_params.queue_size,
-                               &local_queue_atomics_ao,
+                               &locals->queue_atomics_ao,
                                kernel_split_state.queue_data,
                                kernel_split_params.queue_index);
 #endif
index 96ca0f094b133bdd2d36380c3194b73cd5270844..100f5996f8337286d57920e533b8096ac1a8690c 100644 (file)
@@ -18,7 +18,6 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device void kernel_indirect_background(KernelGlobals *kg)
 {
-
        ccl_global char *ray_state = kernel_split_state.ray_state;
 
        int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
index ad1f6c78e8f9baff1f945d6bfb28cdc9c4760042..056fb1d8c0842aed0d077c7036da88006ffdc368 100644 (file)
@@ -44,11 +44,11 @@ CCL_NAMESPACE_BEGIN
  *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
  *     RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays.
  */
-ccl_device void kernel_next_iteration_setup(KernelGlobals *kg)
+ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
+                                            ccl_local_param unsigned int *local_queue_atomics)
 {
-       ccl_local unsigned int local_queue_atomics;
        if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
-               local_queue_atomics = 0;
+               *local_queue_atomics = 0;
        }
        ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
@@ -161,7 +161,7 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg)
                                QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
                                enqueue_flag,
                                kernel_split_params.queue_size,
-                               &local_queue_atomics,
+                               local_queue_atomics,
                                kernel_split_state.queue_data,
                                kernel_split_params.queue_index);
 }
index f4a4657d23f34d1925ad36838cb5dfdb394cb86f..e2e841f36d3b31f18b370dd87716886ab9a41d55 100644 (file)
@@ -35,17 +35,16 @@ CCL_NAMESPACE_BEGIN
  *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
  *     RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
  */
-ccl_device void kernel_queue_enqueue(KernelGlobals *kg)
+ccl_device void kernel_queue_enqueue(KernelGlobals *kg,
+                                     ccl_local_param QueueEnqueueLocals *locals)
 {
        /* We have only 2 cases (Hit/Not-Hit) */
-       ccl_local unsigned int local_queue_atomics[2];
-
        int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
        int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
 
        if(lidx == 0) {
-               local_queue_atomics[0] = 0;
-               local_queue_atomics[1] = 0;
+               locals->queue_atomics[0] = 0;
+               locals->queue_atomics[1] = 0;
        }
        ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
@@ -62,18 +61,18 @@ ccl_device void kernel_queue_enqueue(KernelGlobals *kg)
 
        unsigned int my_lqidx;
        if(queue_number != -1) {
-               my_lqidx = get_local_queue_index(queue_number, local_queue_atomics);
+               my_lqidx = get_local_queue_index(queue_number, locals->queue_atomics);
        }
        ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
        if(lidx == 0) {
-               local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] =
+               locals->queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] =
                        get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-                                                   local_queue_atomics,
+                                                   locals->queue_atomics,
                                                    kernel_split_params.queue_index);
-               local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] =
+               locals->queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] =
                        get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-                                                   local_queue_atomics,
+                                                   locals->queue_atomics,
                                                    kernel_split_params.queue_index);
        }
        ccl_barrier(CCL_LOCAL_MEM_FENCE);
@@ -83,7 +82,7 @@ ccl_device void kernel_queue_enqueue(KernelGlobals *kg)
                my_gqidx = get_global_queue_index(queue_number,
                                                  kernel_split_params.queue_size,
                                                  my_lqidx,
-                                                 local_queue_atomics);
+                                                 locals->queue_atomics);
                kernel_split_state.queue_data[my_gqidx] = ray_index;
        }
 }
index 43872c6f38844d5dc5b5cb616388f0d0f3472ef6..fc966b77b2c3189178c658179b9c3e715d3e0bea 100644 (file)
@@ -22,12 +22,12 @@ CCL_NAMESPACE_BEGIN
  * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them
  * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
  */
-ccl_device void kernel_shader_eval(KernelGlobals *kg)
+ccl_device void kernel_shader_eval(KernelGlobals *kg,
+                                   ccl_local_param unsigned int *local_queue_atomics)
 {
        /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
-       ccl_local unsigned int local_queue_atomics;
        if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
-               local_queue_atomics = 0;
+               *local_queue_atomics = 0;
        }
        ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
@@ -47,7 +47,7 @@ ccl_device void kernel_shader_eval(KernelGlobals *kg)
                                QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
                                enqueue_flag,
                                kernel_split_params.queue_size,
-                               &local_queue_atomics,
+                               local_queue_atomics,
                                kernel_split_state.queue_data,
                                kernel_split_params.queue_index);
 
index 365d78c9f99934bf07317474150fa0f332bff2bc..748197b718315c187095c7cfef73d79cabf0484f 100644 (file)
@@ -111,6 +111,17 @@ __device__ SplitParams __split_param_data;
 #  define kernel_split_params (__split_param_data)
 #endif  /* __KERNEL_CUDA__ */
 
+/* Local storage for queue_enqueue kernel. */
+typedef struct QueueEnqueueLocals {
+       uint queue_atomics[2];
+} QueueEnqueueLocals;
+
+/* Local storage for holdout_emission_blurring_pathtermination_ao kernel. */
+typedef struct BackgroundAOLocals {
+       uint queue_atomics_bg;
+       uint queue_atomics_ao;
+} BackgroundAOLocals;
+
 CCL_NAMESPACE_END
 
 #endif  /* __KERNEL_SPLIT_DATA_TYPES_H__ */
index e282ac00a63a965a2a840a7c7f2afba6e6b8ba04..709a296c9a09a8d7f5d8c5284b5c58122cff3d3b 100644 (file)
 CCL_NAMESPACE_BEGIN
 
 
-ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
+ccl_device void kernel_subsurface_scatter(KernelGlobals *kg,
+                                          ccl_local_param unsigned int* local_queue_atomics)
 {
 #ifdef __SUBSURFACE__
-
-       ccl_local unsigned int local_queue_atomics;
        if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
-               local_queue_atomics = 0;
+               *local_queue_atomics = 0;
        }
        ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
@@ -89,7 +88,7 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
                                QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
                                enqueue_flag,
                                kernel_split_params.queue_size,
-                               &local_queue_atomics,
+                               local_queue_atomics,
                                kernel_split_state.queue_data,
                                kernel_split_params.queue_index);