Cycles: Remove sum_all_radiance kernel
author Mai Lavelle <mai.lavelle@gmail.com>
Wed, 1 Mar 2017 06:47:08 +0000 (01:47 -0500)
committer Mai Lavelle <mai.lavelle@gmail.com>
Wed, 8 Mar 2017 06:31:07 +0000 (01:31 -0500)
This was only needed for the previous implementation of parallel samples. As
we don't have that any more it can be removed.

The real reason for removal, though, is this: `per_sample_output_buffers` was being
calculated too small and artifacts resulted. The tile buffer is already
the correct size and calculating the size for `per_sample_output_buffers`
is a bit difficult with the current layout of the code. As
`per_sample_output_buffers` was only needed for `sum_all_radiance`,
removing that kernel and writing output to the tile buffer directly
fixes the artifacts.

13 files changed:
intern/cycles/device/device_split_kernel.cpp
intern/cycles/device/device_split_kernel.h
intern/cycles/kernel/CMakeLists.txt
intern/cycles/kernel/kernels/cpu/kernel_cpu.h
intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
intern/cycles/kernel/kernels/cuda/kernel_split.cu
intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl [deleted file]
intern/cycles/kernel/split/kernel_background_buffer_update.h
intern/cycles/kernel/split/kernel_data_init.h
intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
intern/cycles/kernel/split/kernel_path_init.h
intern/cycles/kernel/split/kernel_split_data.h
intern/cycles/kernel/split/kernel_sum_all_radiance.h [deleted file]

index 85da7024a2ce1c4c1ceabc47eca9490ccecd0fbe..13fee6c02e473ab25a19ab48f8869526642d28ba 100644 (file)
@@ -51,7 +51,6 @@ DeviceSplitKernel::~DeviceSplitKernel()
        delete kernel_direct_lighting;
        delete kernel_shadow_blocked;
        delete kernel_next_iteration_setup;
-       delete kernel_sum_all_radiance;
 }
 
 bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_features)
@@ -72,7 +71,6 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_fe
        LOAD_KERNEL(direct_lighting);
        LOAD_KERNEL(shadow_blocked);
        LOAD_KERNEL(next_iteration_setup);
-       LOAD_KERNEL(sum_all_radiance);
 
 #undef LOAD_KERNEL
 
@@ -258,15 +256,6 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
                        avg_time_per_sample = alpha*time_per_sample + (1.0-alpha)*avg_time_per_sample;
                }
 
-               size_t sum_all_radiance_local_size[2] = {16, 16};
-               size_t sum_all_radiance_global_size[2];
-               sum_all_radiance_global_size[0] = round_up(tile.w, sum_all_radiance_local_size[0]);
-               sum_all_radiance_global_size[1] = round_up(tile.h, sum_all_radiance_local_size[1]);
-
-               ENQUEUE_SPLIT_KERNEL(sum_all_radiance,
-                                    sum_all_radiance_global_size,
-                                    sum_all_radiance_local_size);
-
 #undef ENQUEUE_SPLIT_KERNEL
 
                tile.sample += subtile.num_samples;
index 1903574f0b53ec5202970786e1cbeffc4379b6ea..1c6a2709cf2dcc72be15dfcb5c026c667d2f41dd 100644 (file)
@@ -65,7 +65,6 @@ private:
        SplitKernelFunction *kernel_direct_lighting;
        SplitKernelFunction *kernel_shadow_blocked;
        SplitKernelFunction *kernel_next_iteration_setup;
-       SplitKernelFunction *kernel_sum_all_radiance;
 
        /* Global memory variables [porting]; These memory is used for
         * co-operation between different kernels; Data written by one
index d467e40b3e94f443829b903cbece889ae7aeba23..df40c3a0e8e1af86fbc2e533effeb9640d1dd307 100644 (file)
@@ -26,7 +26,6 @@ set(SRC
        kernels/opencl/kernel_direct_lighting.cl
        kernels/opencl/kernel_shadow_blocked.cl
        kernels/opencl/kernel_next_iteration_setup.cl
-       kernels/opencl/kernel_sum_all_radiance.cl
        kernels/cuda/kernel.cu
        kernels/cuda/kernel_split.cu
 )
@@ -209,7 +208,6 @@ set(SRC_SPLIT_HEADERS
        split/kernel_shadow_blocked.h
        split/kernel_split_common.h
        split/kernel_split_data.h
-       split/kernel_sum_all_radiance.h
 )
 
 # CUDA module
@@ -412,7 +410,6 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emiss
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
index 8c1675665cb2df307a00e1ddc853545f3b8a6229..deb872444d0840d67d82dcb11bd2914577ebde21 100644 (file)
@@ -81,7 +81,6 @@ DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
 DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting)
 DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked)
 DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
-DECLARE_SPLIT_KERNEL_FUNCTION(sum_all_radiance)
 
 void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func));
 
index f6e0591ef24bb658006d7cf41e50ce59ec0be0b3..d6d0db4e034dc4ffd5c1eb70046359600a4712aa 100644 (file)
@@ -48,7 +48,6 @@
 #  include "split/kernel_direct_lighting.h"
 #  include "split/kernel_shadow_blocked.h"
 #  include "split/kernel_next_iteration_setup.h"
-#  include "split/kernel_sum_all_radiance.h"
 #endif
 
 CCL_NAMESPACE_BEGIN
@@ -174,7 +173,6 @@ DEFINE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
 DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting)
 DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked)
 DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
-DEFINE_SPLIT_KERNEL_FUNCTION(sum_all_radiance)
 
 void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func))
 {
@@ -198,7 +196,6 @@ void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name,
        REGISTER(direct_lighting);
        REGISTER(shadow_blocked);
        REGISTER(next_iteration_setup);
-       REGISTER(sum_all_radiance);
 
 #undef REGISTER
 #undef REGISTER_EVAL_NAME
index 3a883265157e1ec3cf9d1e859db2289e81ec6cae..53a36b15e40fe5d0e9fac26818651f12864d0fc4 100644 (file)
@@ -35,7 +35,6 @@
 #include "../../split/kernel_direct_lighting.h"
 #include "../../split/kernel_shadow_blocked.h"
 #include "../../split/kernel_next_iteration_setup.h"
-#include "../../split/kernel_sum_all_radiance.h"
 
 #include "../../kernel_film.h"
 
@@ -92,7 +91,6 @@ DEFINE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
 DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting)
 DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked)
 DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
-DEFINE_SPLIT_KERNEL_FUNCTION(sum_all_radiance)
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl b/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl
deleted file mode 100644 (file)
index e945050..0000000
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel_compat_opencl.h"
-#include "split/kernel_split_common.h"
-#include "split/kernel_sum_all_radiance.h"
-
-__kernel void kernel_ocl_path_trace_sum_all_radiance(
-        KernelGlobals *kg,
-        ccl_constant KernelData *data)
-{
-       kernel_sum_all_radiance(kg);
-}
index 07e5522c83020223f5f3f0e1a376eedbef87aaac..04aaf1bbaad4d431688875aabe7310ad5d210496 100644 (file)
@@ -119,7 +119,7 @@ ccl_device void kernel_background_buffer_update(KernelGlobals *kg)
        ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
        ccl_global float *L_transparent = &kernel_split_state.L_transparent[ray_index];
        ccl_global uint *rng = &kernel_split_state.rng[ray_index];
-       ccl_global float *per_sample_output_buffers = kernel_split_state.per_sample_output_buffers;
+       ccl_global float *buffer = kernel_split_params.buffer;
 
        unsigned int work_index;
        ccl_global uint *initial_rng;
@@ -129,7 +129,6 @@ ccl_device void kernel_background_buffer_update(KernelGlobals *kg)
        unsigned int tile_y;
        unsigned int pixel_x;
        unsigned int pixel_y;
-       unsigned int my_sample_tile;
 
        work_index = kernel_split_state.work_array[ray_index];
        sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
@@ -137,11 +136,10 @@ ccl_device void kernel_background_buffer_update(KernelGlobals *kg)
                                &tile_x, &tile_y,
                                work_index,
                                ray_index);
-       my_sample_tile = 0;
        initial_rng = rng_state;
 
-       rng_state += kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride;
-       per_sample_output_buffers += ((tile_x + (tile_y * stride)) + my_sample_tile) * kernel_data.film.pass_stride;
+       rng_state += kernel_split_params.offset + pixel_x + pixel_y*stride;
+       buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
 
        if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
                /* eval background shader if nothing hit */
@@ -165,14 +163,14 @@ ccl_device void kernel_background_buffer_update(KernelGlobals *kg)
 
        if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
                float3 L_sum = path_radiance_clamp_and_sum(kg, L);
-               kernel_write_light_passes(kg, per_sample_output_buffers, L, sample);
+               kernel_write_light_passes(kg, buffer, L, sample);
 #ifdef __KERNEL_DEBUG__
-               kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample);
+               kernel_write_debug_passes(kg, buffer, state, debug_data, sample);
 #endif
                float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
 
                /* accumulate result in output buffer */
-               kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
+               kernel_write_pass_float4(buffer, sample, L_rad);
                path_rng_end(kg, rng_state, *rng);
 
                ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
@@ -192,13 +190,11 @@ ccl_device void kernel_background_buffer_update(KernelGlobals *kg)
                        sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
                        /* Get pixel and tile position associated with current work */
                        get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index);
-                       my_sample_tile = 0;
 
                        /* Remap rng_state according to the current work */
-                       rng_state = initial_rng + kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride;
-                       /* Remap per_sample_output_buffers according to the current work */
-                       per_sample_output_buffers = kernel_split_state.per_sample_output_buffers
-                               + ((tile_x + (tile_y * stride)) + my_sample_tile) * kernel_data.film.pass_stride;
+                       rng_state = initial_rng + kernel_split_params.offset + pixel_x + pixel_y*stride;
+                       /* Remap buffer according to the current work; rebase from the tile buffer start, since buffer still carries the previous work item's offset */
+                       buffer = kernel_split_params.buffer + (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
 
                        /* Initialize random numbers and ray. */
                        kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray);
@@ -221,7 +217,7 @@ ccl_device void kernel_background_buffer_update(KernelGlobals *kg)
                                /* These rays do not participate in path-iteration. */
                                float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
                                /* Accumulate result in output buffer. */
-                               kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
+                               kernel_write_pass_float4(buffer, sample, L_rad);
                                path_rng_end(kg, rng_state, *rng);
 
                                ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
index 982c7be2008549aa5b8e15e8062e1406069736e7..c22703e5abd9a86d8c7d627a14f62b78e6676373 100644 (file)
@@ -24,6 +24,21 @@ CCL_NAMESPACE_BEGIN
  * The number of elements in the queues is initialized to 0;
  */
 
+/* Distributes `work_size` iterations across all threads: each thread walks a contiguous chunk of work_size / global_size indices, then at most one leftover index.
+ * Note: work done inside the loop may not be visible to all threads until after the current kernel has completed.
+ */
+#define parallel_for(kg, iter_name, work_size) \
+       for(size_t _size = (work_size), \
+           _global_size = ccl_global_size(0) * ccl_global_size(1), \
+           _n = _size / _global_size, \
+               _thread = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0), \
+           iter_name = (_n > 0) ? (_thread * _n) : (_thread) \
+               ; \
+               (iter_name < (_thread+1) * _n) || (iter_name == _n * _global_size + _thread && _thread < _size % _global_size) \
+               ; \
+               iter_name = (iter_name != (_thread+1) * _n - 1) ? (iter_name + 1) : (_n * _global_size + _thread) \
+       )
+
 #ifndef __KERNEL_CPU__
 ccl_device void kernel_data_init(
 #else
@@ -110,6 +125,21 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
                 */
                *use_queues_flag = 0;
        }
+
+       /* zero the tile's pixels if this is the first sample */
+       if(start_sample == 0) {
+               parallel_for(kg, i, sw * sh * kernel_data.film.pass_stride) {
+                       int pixel = i / kernel_data.film.pass_stride;
+                       int pass = i % kernel_data.film.pass_stride;
+
+                       int x = sx + pixel % sw;
+                       int y = sy + pixel / sw;
+
+                       int index = (offset + x + y*stride) * kernel_data.film.pass_stride + pass;
+
+                       *(buffer + index) = 0.0f;
+               }
+       }
 }
 
 CCL_NAMESPACE_END
index ee9c4280b2233401992e8b5a692384aa2aa335f1..7168efa59aeda1ad6405a0fb1244ca35ea02d473 100644 (file)
@@ -114,7 +114,6 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(KernelGlobal
 
        unsigned int tile_x;
        unsigned int tile_y;
-       int my_sample_tile;
        unsigned int sample;
 
        ccl_global RNG *rng = 0x0;
@@ -123,7 +122,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(KernelGlobal
 
        ccl_global char *ray_state = kernel_split_state.ray_state;
        ShaderData *sd = &kernel_split_state.sd[ray_index];
-       ccl_global float *per_sample_output_buffers = kernel_split_state.per_sample_output_buffers;
+       ccl_global float *buffer = kernel_split_params.buffer;
 
        if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
 
@@ -137,11 +136,8 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(KernelGlobal
                                        &tile_x, &tile_y,
                                        work_index,
                                        ray_index);
-               my_sample_tile = 0;
 
-               per_sample_output_buffers +=
-                   ((tile_x + (tile_y * stride)) + my_sample_tile) *
-                   kernel_data.film.pass_stride;
+               buffer += (kernel_split_params.offset + pixel_x + pixel_y * stride) * kernel_data.film.pass_stride;
 
                /* holdout */
 #ifdef __HOLDOUT__
@@ -172,7 +168,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(KernelGlobal
                PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
                /* Holdout mask objects do not write data passes. */
                kernel_write_data_passes(kg,
-                                        per_sample_output_buffers,
+                                        buffer,
                                         L,
                                         sd,
                                         sample,
index e613db214ed4d8c7c7afa2763aa061a1438ae9f3..d2e2ffaca91df1fcb43175f0f02b843e968e1c99 100644 (file)
@@ -35,7 +35,6 @@ ccl_device void kernel_path_init(KernelGlobals *kg) {
        unsigned int pixel_y;
        unsigned int tile_x;
        unsigned int tile_y;
-       unsigned int my_sample_tile;
 
        unsigned int work_index = 0;
        /* Get work. */
@@ -49,8 +48,6 @@ ccl_device void kernel_path_init(KernelGlobals *kg) {
        /* Get the sample associated with the work. */
        my_sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
 
-       my_sample_tile = 0;
-
        /* Get pixel and tile position associated with the work. */
        get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
                                &tile_x, &tile_y,
@@ -61,9 +58,8 @@ ccl_device void kernel_path_init(KernelGlobals *kg) {
        ccl_global uint *rng_state = kernel_split_params.rng_state;
        rng_state += kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride;
 
-       ccl_global float *per_sample_output_buffers = kernel_split_state.per_sample_output_buffers;
-       per_sample_output_buffers += (tile_x + tile_y * kernel_split_params.stride + my_sample_tile)
-                                    * kernel_data.film.pass_stride;
+       ccl_global float *buffer = kernel_split_params.buffer;
+       buffer += (kernel_split_params.offset + pixel_x + pixel_y * kernel_split_params.stride) * kernel_data.film.pass_stride;
 
        /* Initialize random numbers and ray. */
        kernel_path_trace_setup(kg,
@@ -94,7 +90,7 @@ ccl_device void kernel_path_init(KernelGlobals *kg) {
                /* These rays do not participate in path-iteration. */
                float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
                /* Accumulate result in output buffer. */
-               kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad);
+               kernel_write_pass_float4(buffer, my_sample, L_rad);
                path_rng_end(kg, rng_state, kernel_split_state.rng[ray_index]);
                ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE);
        }
index 7e88b6f51687a25d02775c064cda50a9b0653bed..5dd53f42478e6906dbebcce6775d0fdfd851e5c2 100644 (file)
@@ -89,7 +89,6 @@ typedef struct SplitData {
        /* size calculation for these is non trivial, so they are left out of SPLIT_DATA_ENTRIES and handled separately */
        ShaderData *sd;
        ShaderData *sd_DL_shadow;
-       ccl_global float *per_sample_output_buffers;
 
        /* this is actually in a separate buffer from the rest of the split state data (so it can be read back from
         * the host easily) but is still used the same as the other data so we have it here in this struct as well
@@ -113,7 +112,6 @@ ccl_device_inline size_t split_data_buffer_size(size_t num_elements,
         */
        size += align_up(num_elements * SIZEOF_SD(max_closure), 16); /* sd */
        size += align_up(2 * num_elements * SIZEOF_SD(max_closure), 16); /* sd_DL_shadow */
-       size += align_up(num_elements * per_thread_output_buffer_size, 16); /* per_sample_output_buffers */
 
        return size;
 }
@@ -136,9 +134,6 @@ ccl_device_inline void split_data_init(ccl_global SplitData *split_data,
        split_data->sd_DL_shadow = (ShaderData*)p;
        p += align_up(2 * num_elements * SIZEOF_SD(MAX_CLOSURE), 16);
 
-       split_data->per_sample_output_buffers = (ccl_global float*)p;
-       //p += align_up(num_elements * per_thread_output_buffer_size, 16);
-
        split_data->ray_state = ray_state;
 }
 
diff --git a/intern/cycles/kernel/split/kernel_sum_all_radiance.h b/intern/cycles/kernel/split/kernel_sum_all_radiance.h
deleted file mode 100644 (file)
index fdceae2..0000000
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* Since we process various samples in parallel; The output radiance of different samples
- * are stored in different locations; This kernel combines the output radiance contributed
- * by all different samples and stores them in the RenderTile's output buffer.
- */
-
-ccl_device void kernel_sum_all_radiance(KernelGlobals *kg)
-{
-       int x = ccl_global_id(0);
-       int y = ccl_global_id(1);
-
-       ccl_global float *buffer = kernel_split_params.buffer;
-       int sw = kernel_split_params.w;
-       int sh = kernel_split_params.h;
-       int stride = kernel_split_params.stride;
-       int start_sample = kernel_split_params.start_sample;
-
-       if(x < sw && y < sh) {
-               ccl_global float *per_sample_output_buffer = kernel_split_state.per_sample_output_buffers;
-               per_sample_output_buffer += (x + y * stride) * (kernel_data.film.pass_stride);
-
-               x += kernel_split_params.x;
-               y += kernel_split_params.y;
-
-               buffer += (kernel_split_params.offset + x + y*stride) * (kernel_data.film.pass_stride);
-
-               int pass_stride_iterator = 0;
-               int num_floats = kernel_data.film.pass_stride;
-
-               for(pass_stride_iterator = 0; pass_stride_iterator < num_floats; pass_stride_iterator++) {
-                       *(buffer + pass_stride_iterator) =
-                               (start_sample == 0)
-                                       ? *(per_sample_output_buffer + pass_stride_iterator)
-                                       : *(buffer + pass_stride_iterator) + *(per_sample_output_buffer + pass_stride_iterator);
-               }
-       }
-}
-
-CCL_NAMESPACE_END
-