delete kernel_direct_lighting;
delete kernel_shadow_blocked;
delete kernel_next_iteration_setup;
- delete kernel_sum_all_radiance;
}
bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_features)
LOAD_KERNEL(direct_lighting);
LOAD_KERNEL(shadow_blocked);
LOAD_KERNEL(next_iteration_setup);
- LOAD_KERNEL(sum_all_radiance);
#undef LOAD_KERNEL
avg_time_per_sample = alpha*time_per_sample + (1.0-alpha)*avg_time_per_sample;
}
- size_t sum_all_radiance_local_size[2] = {16, 16};
- size_t sum_all_radiance_global_size[2];
- sum_all_radiance_global_size[0] = round_up(tile.w, sum_all_radiance_local_size[0]);
- sum_all_radiance_global_size[1] = round_up(tile.h, sum_all_radiance_local_size[1]);
-
- ENQUEUE_SPLIT_KERNEL(sum_all_radiance,
- sum_all_radiance_global_size,
- sum_all_radiance_local_size);
-
#undef ENQUEUE_SPLIT_KERNEL
tile.sample += subtile.num_samples;
SplitKernelFunction *kernel_direct_lighting;
SplitKernelFunction *kernel_shadow_blocked;
SplitKernelFunction *kernel_next_iteration_setup;
- SplitKernelFunction *kernel_sum_all_radiance;
/* Global memory variables [porting]; These memory is used for
* co-operation between different kernels; Data written by one
kernels/opencl/kernel_direct_lighting.cl
kernels/opencl/kernel_shadow_blocked.cl
kernels/opencl/kernel_next_iteration_setup.cl
- kernels/opencl/kernel_sum_all_radiance.cl
kernels/cuda/kernel.cu
kernels/cuda/kernel_split.cu
)
split/kernel_shadow_blocked.h
split/kernel_split_common.h
split/kernel_split_data.h
- split/kernel_sum_all_radiance.h
)
# CUDA module
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting)
DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked)
DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
-DECLARE_SPLIT_KERNEL_FUNCTION(sum_all_radiance)
void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func));
# include "split/kernel_direct_lighting.h"
# include "split/kernel_shadow_blocked.h"
# include "split/kernel_next_iteration_setup.h"
-# include "split/kernel_sum_all_radiance.h"
#endif
CCL_NAMESPACE_BEGIN
DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting)
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked)
DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
-DEFINE_SPLIT_KERNEL_FUNCTION(sum_all_radiance)
void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func))
{
REGISTER(direct_lighting);
REGISTER(shadow_blocked);
REGISTER(next_iteration_setup);
- REGISTER(sum_all_radiance);
#undef REGISTER
#undef REGISTER_EVAL_NAME
#include "../../split/kernel_direct_lighting.h"
#include "../../split/kernel_shadow_blocked.h"
#include "../../split/kernel_next_iteration_setup.h"
-#include "../../split/kernel_sum_all_radiance.h"
#include "../../kernel_film.h"
DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting)
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked)
DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
-DEFINE_SPLIT_KERNEL_FUNCTION(sum_all_radiance)
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+++ /dev/null
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel_compat_opencl.h"
-#include "split/kernel_split_common.h"
-#include "split/kernel_sum_all_radiance.h"
-
-__kernel void kernel_ocl_path_trace_sum_all_radiance(
- KernelGlobals *kg,
- ccl_constant KernelData *data)
-{
- kernel_sum_all_radiance(kg);
-}
ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
ccl_global float *L_transparent = &kernel_split_state.L_transparent[ray_index];
ccl_global uint *rng = &kernel_split_state.rng[ray_index];
- ccl_global float *per_sample_output_buffers = kernel_split_state.per_sample_output_buffers;
+ ccl_global float *buffer = kernel_split_params.buffer;
unsigned int work_index;
ccl_global uint *initial_rng;
unsigned int tile_y;
unsigned int pixel_x;
unsigned int pixel_y;
- unsigned int my_sample_tile;
work_index = kernel_split_state.work_array[ray_index];
sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
&tile_x, &tile_y,
work_index,
ray_index);
- my_sample_tile = 0;
initial_rng = rng_state;
- rng_state += kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride;
- per_sample_output_buffers += ((tile_x + (tile_y * stride)) + my_sample_tile) * kernel_data.film.pass_stride;
+ rng_state += kernel_split_params.offset + pixel_x + pixel_y*stride;
+ buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
/* eval background shader if nothing hit */
if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
float3 L_sum = path_radiance_clamp_and_sum(kg, L);
- kernel_write_light_passes(kg, per_sample_output_buffers, L, sample);
+ kernel_write_light_passes(kg, buffer, L, sample);
#ifdef __KERNEL_DEBUG__
- kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample);
+ kernel_write_debug_passes(kg, buffer, state, debug_data, sample);
#endif
float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
/* accumulate result in output buffer */
- kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
+ kernel_write_pass_float4(buffer, sample, L_rad);
path_rng_end(kg, rng_state, *rng);
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
/* Get pixel and tile position associated with current work */
get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index);
- my_sample_tile = 0;
/* Remap rng_state according to the current work */
- rng_state = initial_rng + kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride;
- /* Remap per_sample_output_buffers according to the current work */
- per_sample_output_buffers = kernel_split_state.per_sample_output_buffers
- + ((tile_x + (tile_y * stride)) + my_sample_tile) * kernel_data.film.pass_stride;
+ rng_state = initial_rng + kernel_split_params.offset + pixel_x + pixel_y*stride;
+ /* Remap buffer according to the current work */
+ buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
/* Initialize random numbers and ray. */
kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray);
/* These rays do not participate in path-iteration. */
float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
/* Accumulate result in output buffer. */
- kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
+ kernel_write_pass_float4(buffer, sample, L_rad);
path_rng_end(kg, rng_state, *rng);
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
* The number of elements in the queues is initialized to 0;
*/
+/* distributes an amount of work across all threads
+ * note: work done inside the loop may not show up to all threads till after the current kernel has completed
+ */
+#define parallel_for(kg, iter_name, work_size) \
+ for(size_t _size = (work_size), \
+ _global_size = ccl_global_size(0) * ccl_global_size(1), \
+ _n = _size / _global_size, \
+ _thread = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0), \
+ iter_name = (_n > 0) ? (_thread * _n) : (_thread) \
+ ; \
+ (iter_name < (_thread+1) * _n) || (iter_name == _n * _global_size + _thread && _thread < _size % _global_size) \
+ ; \
+ iter_name = (iter_name != (_thread+1) * _n - 1) ? (iter_name + 1) : (_n * _global_size + _thread) \
+ )
+
#ifndef __KERNEL_CPU__
ccl_device void kernel_data_init(
#else
*/
*use_queues_flag = 0;
}
+
+ /* zero the tiles pixels if this is the first sample */
+ if(start_sample == 0) {
+ parallel_for(kg, i, sw * sh * kernel_data.film.pass_stride) {
+ int pixel = i / kernel_data.film.pass_stride;
+ int pass = i % kernel_data.film.pass_stride;
+
+ int x = sx + pixel % sw;
+ int y = sy + pixel / sw;
+
+ int index = (offset + x + y*stride) * kernel_data.film.pass_stride + pass;
+
+ *(buffer + index) = 0.0f;
+ }
+ }
}
CCL_NAMESPACE_END
unsigned int tile_x;
unsigned int tile_y;
- int my_sample_tile;
unsigned int sample;
ccl_global RNG *rng = 0x0;
ccl_global char *ray_state = kernel_split_state.ray_state;
ShaderData *sd = &kernel_split_state.sd[ray_index];
- ccl_global float *per_sample_output_buffers = kernel_split_state.per_sample_output_buffers;
+ ccl_global float *buffer = kernel_split_params.buffer;
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
&tile_x, &tile_y,
work_index,
ray_index);
- my_sample_tile = 0;
- per_sample_output_buffers +=
- ((tile_x + (tile_y * stride)) + my_sample_tile) *
- kernel_data.film.pass_stride;
+ buffer += (kernel_split_params.offset + pixel_x + pixel_y * stride) * kernel_data.film.pass_stride;
/* holdout */
#ifdef __HOLDOUT__
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
/* Holdout mask objects do not write data passes. */
kernel_write_data_passes(kg,
- per_sample_output_buffers,
+ buffer,
L,
sd,
sample,
unsigned int pixel_y;
unsigned int tile_x;
unsigned int tile_y;
- unsigned int my_sample_tile;
unsigned int work_index = 0;
/* Get work. */
/* Get the sample associated with the work. */
my_sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
- my_sample_tile = 0;
-
/* Get pixel and tile position associated with the work. */
get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
&tile_x, &tile_y,
ccl_global uint *rng_state = kernel_split_params.rng_state;
rng_state += kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride;
- ccl_global float *per_sample_output_buffers = kernel_split_state.per_sample_output_buffers;
- per_sample_output_buffers += (tile_x + tile_y * kernel_split_params.stride + my_sample_tile)
- * kernel_data.film.pass_stride;
+ ccl_global float *buffer = kernel_split_params.buffer;
+ buffer += (kernel_split_params.offset + pixel_x + pixel_y * kernel_split_params.stride) * kernel_data.film.pass_stride;
/* Initialize random numbers and ray. */
kernel_path_trace_setup(kg,
/* These rays do not participate in path-iteration. */
float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
/* Accumulate result in output buffer. */
- kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad);
+ kernel_write_pass_float4(buffer, my_sample, L_rad);
path_rng_end(kg, rng_state, kernel_split_state.rng[ray_index]);
ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE);
}
/* size calculation for these is non trivial, so they are left out of SPLIT_DATA_ENTRIES and handled separately */
ShaderData *sd;
ShaderData *sd_DL_shadow;
- ccl_global float *per_sample_output_buffers;
/* this is actually in a separate buffer from the rest of the split state data (so it can be read back from
* the host easily) but is still used the same as the other data so we have it here in this struct as well
*/
size += align_up(num_elements * SIZEOF_SD(max_closure), 16); /* sd */
size += align_up(2 * num_elements * SIZEOF_SD(max_closure), 16); /* sd_DL_shadow */
- size += align_up(num_elements * per_thread_output_buffer_size, 16); /* per_sample_output_buffers */
return size;
}
split_data->sd_DL_shadow = (ShaderData*)p;
p += align_up(2 * num_elements * SIZEOF_SD(MAX_CLOSURE), 16);
- split_data->per_sample_output_buffers = (ccl_global float*)p;
- //p += align_up(num_elements * per_thread_output_buffer_size, 16);
-
split_data->ray_state = ray_state;
}
+++ /dev/null
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* Since we process various samples in parallel; The output radiance of different samples
- * are stored in different locations; This kernel combines the output radiance contributed
- * by all different samples and stores them in the RenderTile's output buffer.
- */
-
-ccl_device void kernel_sum_all_radiance(KernelGlobals *kg)
-{
- int x = ccl_global_id(0);
- int y = ccl_global_id(1);
-
- ccl_global float *buffer = kernel_split_params.buffer;
- int sw = kernel_split_params.w;
- int sh = kernel_split_params.h;
- int stride = kernel_split_params.stride;
- int start_sample = kernel_split_params.start_sample;
-
- if(x < sw && y < sh) {
- ccl_global float *per_sample_output_buffer = kernel_split_state.per_sample_output_buffers;
- per_sample_output_buffer += (x + y * stride) * (kernel_data.film.pass_stride);
-
- x += kernel_split_params.x;
- y += kernel_split_params.y;
-
- buffer += (kernel_split_params.offset + x + y*stride) * (kernel_data.film.pass_stride);
-
- int pass_stride_iterator = 0;
- int num_floats = kernel_data.film.pass_stride;
-
- for(pass_stride_iterator = 0; pass_stride_iterator < num_floats; pass_stride_iterator++) {
- *(buffer + pass_stride_iterator) =
- (start_sample == 0)
- ? *(per_sample_output_buffer + pass_stride_iterator)
- : *(buffer + pass_stride_iterator) + *(per_sample_output_buffer + pass_stride_iterator);
- }
- }
-}
-
-CCL_NAMESPACE_END
-