2 * Copyright 2011-2015 Blender Foundation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #include "kernel_split_common.h"
19 /* Note on kernel_data_initialization kernel
20 * This kernel initializes structures needed in path-iteration kernels.
21 * This is the first kernel in ray-tracing logic.
23 * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE
25 * Its input and output are as follows,
27 * Un-initialized rng---------------|--- kernel_data_initialization ---|--- Initialized rng
28 * Un-initialized throughput -------|                                  |--- Initialized throughput
29 * Un-initialized L_transparent ----|                                  |--- Initialized L_transparent
30 * Un-initialized PathRadiance -----|                                  |--- Initialized PathRadiance
31 * Un-initialized Ray --------------|                                  |--- Initialized Ray
32 * Un-initialized PathState --------|                                  |--- Initialized PathState
33 * Un-initialized QueueData --------|                                  |--- Initialized QueueData (to QUEUE_EMPTY_SLOT)
34 * Un-initialized QueueIndex -------|                                  |--- Initialized QueueIndex (to 0)
35 * Un-initialized use_queues_flag---|                                  |--- Initialized use_queues_flag (to false)
36 * Un-initialized ray_state --------|                                  |--- Initialized ray_state
37 * parallel_samples ----------------|                                  |--- Initialized per_sample_output_buffers
38 * rng_state -----------------------|                                  |--- Initialized work_array
39 * data ----------------------------|                                  |--- Initialized work_pool_wgs
40 * start_sample --------------------|                                  |
41 * sx ------------------------------|                                  |
42 * sy ------------------------------|                                  |
43 * sw ------------------------------|                                  |
44 * sh ------------------------------|                                  |
45 * stride --------------------------|                                  |
46 * queuesize -----------------------|                                  |
47 * num_samples ---------------------|                                  |
50 * All slots in the queues are initialized to QUEUE_EMPTY_SLOT;
51 * the number of elements in each queue is initialized to 0.
53 ccl_device void kernel_data_init(
56 ShaderData *sd_DL_shadow,
58 ccl_global float3 *P_sd,
59 ccl_global float3 *P_sd_DL_shadow,
61 ccl_global float3 *N_sd,
62 ccl_global float3 *N_sd_DL_shadow,
64 ccl_global float3 *Ng_sd,
65 ccl_global float3 *Ng_sd_DL_shadow,
67 ccl_global float3 *I_sd,
68 ccl_global float3 *I_sd_DL_shadow,
70 ccl_global int *shader_sd,
71 ccl_global int *shader_sd_DL_shadow,
73 ccl_global int *flag_sd,
74 ccl_global int *flag_sd_DL_shadow,
76 ccl_global int *prim_sd,
77 ccl_global int *prim_sd_DL_shadow,
79 ccl_global int *type_sd,
80 ccl_global int *type_sd_DL_shadow,
82 ccl_global float *u_sd,
83 ccl_global float *u_sd_DL_shadow,
85 ccl_global float *v_sd,
86 ccl_global float *v_sd_DL_shadow,
88 ccl_global int *object_sd,
89 ccl_global int *object_sd_DL_shadow,
91 ccl_global float *time_sd,
92 ccl_global float *time_sd_DL_shadow,
94 ccl_global float *ray_length_sd,
95 ccl_global float *ray_length_sd_DL_shadow,
97 /* Ray differentials. */
98 ccl_global differential3 *dP_sd,
99 ccl_global differential3 *dP_sd_DL_shadow,
101 ccl_global differential3 *dI_sd,
102 ccl_global differential3 *dI_sd_DL_shadow,
104 ccl_global differential *du_sd,
105 ccl_global differential *du_sd_DL_shadow,
107 ccl_global differential *dv_sd,
108 ccl_global differential *dv_sd_DL_shadow,
111 ccl_global float3 *dPdu_sd,
112 ccl_global float3 *dPdu_sd_DL_shadow,
114 ccl_global float3 *dPdv_sd,
115 ccl_global float3 *dPdv_sd_DL_shadow,
118 ccl_global Transform *ob_tfm_sd,
119 ccl_global Transform *ob_tfm_sd_DL_shadow,
121 ccl_global Transform *ob_itfm_sd,
122 ccl_global Transform *ob_itfm_sd_DL_shadow,
124 ShaderClosure *closure_sd,
125 ShaderClosure *closure_sd_DL_shadow,
127 ccl_global int *num_closure_sd,
128 ccl_global int *num_closure_sd_DL_shadow,
130 ccl_global float *randb_closure_sd,
131 ccl_global float *randb_closure_sd_DL_shadow,
133 ccl_global float3 *ray_P_sd,
134 ccl_global float3 *ray_P_sd_DL_shadow,
136 ccl_global differential3 *ray_dP_sd,
137 ccl_global differential3 *ray_dP_sd_DL_shadow,
139 ccl_constant KernelData *data,
140 ccl_global float *per_sample_output_buffers,
141 ccl_global uint *rng_state,
142 ccl_global uint *rng_coop, /* rng array to store rng values for all rays */
143 ccl_global float3 *throughput_coop, /* throughput array to store throughput values for all rays */
144 ccl_global float *L_transparent_coop, /* L_transparent array to store L_transparent values for all rays */
145 PathRadiance *PathRadiance_coop, /* PathRadiance array to store PathRadiance values for all rays */
146 ccl_global Ray *Ray_coop, /* Ray array to store Ray information for all rays */
147 ccl_global PathState *PathState_coop, /* PathState array to store PathState information for all rays */
148 ccl_global char *ray_state, /* Stores information on current state of a ray */
150 #define KERNEL_TEX(type, ttype, name) \
151 ccl_global type *name,
152 #include "../kernel_textures.h"
154 int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
155 int rng_state_offset_x,
156 int rng_state_offset_y,
157 int rng_state_stride,
158 ccl_global int *Queue_data, /* Memory for queues */
159 ccl_global int *Queue_index, /* Tracks the number of elements in queues */
160 int queuesize, /* size (capacity) of the queue */
161 ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
162 ccl_global unsigned int *work_array, /* work array to store which work each ray belongs to */
163 #ifdef __WORK_STEALING__
164 ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */
165 unsigned int num_samples, /* Total number of samples per pixel */
167 #ifdef __KERNEL_DEBUG__
168 DebugData *debugdata_coop,
170 int parallel_samples) /* Number of samples to be processed in parallel */
173 #define KERNEL_TEX(type, ttype, name) \
175 #include "../kernel_textures.h"
178 sd_DL_shadow->P = P_sd_DL_shadow;
181 sd_DL_shadow->N = N_sd_DL_shadow;
184 sd_DL_shadow->Ng = Ng_sd_DL_shadow;
187 sd_DL_shadow->I = I_sd_DL_shadow;
189 sd->shader = shader_sd;
190 sd_DL_shadow->shader = shader_sd_DL_shadow;
193 sd_DL_shadow->flag = flag_sd_DL_shadow;
196 sd_DL_shadow->prim = prim_sd_DL_shadow;
199 sd_DL_shadow->type = type_sd_DL_shadow;
202 sd_DL_shadow->u = u_sd_DL_shadow;
205 sd_DL_shadow->v = v_sd_DL_shadow;
207 sd->object = object_sd;
208 sd_DL_shadow->object = object_sd_DL_shadow;
211 sd_DL_shadow->time = time_sd_DL_shadow;
213 sd->ray_length = ray_length_sd;
214 sd_DL_shadow->ray_length = ray_length_sd_DL_shadow;
216 #ifdef __RAY_DIFFERENTIALS__
218 sd_DL_shadow->dP = dP_sd_DL_shadow;
221 sd_DL_shadow->dI = dI_sd_DL_shadow;
224 sd_DL_shadow->du = du_sd_DL_shadow;
227 sd_DL_shadow->dv = dv_sd_DL_shadow;
230 sd_DL_shadow->dPdu = dPdu_sd_DL_shadow;
233 sd_DL_shadow->dPdv = dPdv_sd_DL_shadow;
237 #ifdef __OBJECT_MOTION__
238 sd->ob_tfm = ob_tfm_sd;
239 sd_DL_shadow->ob_tfm = ob_tfm_sd_DL_shadow;
241 sd->ob_itfm = ob_itfm_sd;
242 sd_DL_shadow->ob_itfm = ob_itfm_sd_DL_shadow;
245 sd->closure = closure_sd;
246 sd_DL_shadow->closure = closure_sd_DL_shadow;
248 sd->num_closure = num_closure_sd;
249 sd_DL_shadow->num_closure = num_closure_sd_DL_shadow;
251 sd->randb_closure = randb_closure_sd;
252 sd_DL_shadow->randb_closure = randb_closure_sd_DL_shadow;
254 sd->ray_P = ray_P_sd;
255 sd_DL_shadow->ray_P = ray_P_sd_DL_shadow;
257 sd->ray_dP = ray_dP_sd;
258 sd_DL_shadow->ray_dP = ray_dP_sd_DL_shadow;
260 int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
262 #ifdef __WORK_STEALING__
263 int lid = get_local_id(1) * get_local_size(0) + get_local_id(0);
264 /* Initialize work_pool_wgs */
266 int group_index = get_group_id(1) * get_num_groups(0) + get_group_id(0);
267 work_pool_wgs[group_index] = 0;
269 barrier(CLK_LOCAL_MEM_FENCE);
270 #endif /* __WORK_STEALING__ */
272 /* Initialize queue data and queue index. */
273 if(thread_index < queuesize) {
274 /* Initialize active ray queue. */
275 Queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
276 /* Initialize background and buffer update queue. */
277 Queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
278 /* Initialize shadow ray cast of AO queue. */
279 Queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
280 /* Initialize shadow ray cast of direct lighting queue. */
281 Queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
284 if(thread_index == 0) {
285 Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
286 Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
287 Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
288 Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
289 /* The scene-intersect kernel should not use the queues the very first time,
290 * since the queues would be empty.
292 use_queues_flag[0] = 0;
295 int x = get_global_id(0);
296 int y = get_global_id(1);
298 if(x < (sw * parallel_samples) && y < sh) {
299 int ray_index = x + y * (sw * parallel_samples);
301 /* This is the first assignment to ray_state,
302 * so we don't use the ASSIGN_RAY_STATE macro.
304 ray_state[ray_index] = RAY_ACTIVE;
306 unsigned int my_sample;
307 unsigned int pixel_x;
308 unsigned int pixel_y;
311 unsigned int my_sample_tile;
313 #ifdef __WORK_STEALING__
314 unsigned int my_work = 0;
316 get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
317 /* Get the sample associated with the work. */
318 my_sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
322 /* Get pixel and tile position associated with the work. */
323 get_pixel_tile_position(&pixel_x, &pixel_y,
329 work_array[ray_index] = my_work;
330 #else /* __WORK_STEALING__ */
331 unsigned int tile_index = ray_index / parallel_samples;
332 tile_x = tile_index % sw;
333 tile_y = tile_index / sw;
334 my_sample_tile = ray_index - (tile_index * parallel_samples);
335 my_sample = my_sample_tile + start_sample;
337 /* Initialize work array. */
338 work_array[ray_index] = my_sample ;
340 /* Calculate pixel position of this ray. */
341 pixel_x = sx + tile_x;
342 pixel_y = sy + tile_y;
343 #endif /* __WORK_STEALING__ */
345 rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
347 /* Initialize per_sample_output_buffers to all zeros. */
348 per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + (my_sample_tile)) * kernel_data.film.pass_stride;
349 int per_sample_output_buffers_iterator = 0;
350 for(per_sample_output_buffers_iterator = 0;
351 per_sample_output_buffers_iterator < kernel_data.film.pass_stride;
352 per_sample_output_buffers_iterator++)
354 per_sample_output_buffers[per_sample_output_buffers_iterator] = 0.0f;
357 /* Initialize random numbers and ray. */
358 kernel_path_trace_setup(kg,
362 &rng_coop[ray_index],
363 &Ray_coop[ray_index]);
365 if(Ray_coop[ray_index].t != 0.0f) {
366 /* Initialize throughput, L_transparent, Ray, PathState;
367 * These rays proceed with path-iteration.
369 throughput_coop[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
370 L_transparent_coop[ray_index] = 0.0f;
371 path_radiance_init(&PathRadiance_coop[ray_index], kernel_data.film.use_light_pass);
373 &PathState_coop[ray_index],
374 &rng_coop[ray_index],
376 &Ray_coop[ray_index]);
377 #ifdef __KERNEL_DEBUG__
378 debug_data_init(&debugdata_coop[ray_index]);
381 /* These rays do not participate in path-iteration. */
382 float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
383 /* Accumulate result in output buffer. */
384 kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad);
385 path_rng_end(kg, rng_state, rng_coop[ray_index]);
386 ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
390 /* Mark rest of the ray-state indices as RAY_INACTIVE. */
391 if(thread_index < (get_global_size(0) * get_global_size(1)) - (sh * (sw * parallel_samples))) {
392 /* First assignment, hence we don't use the ASSIGN_RAY_STATE macro. */
393 ray_state[((sw * parallel_samples) * sh) + thread_index] = RAY_INACTIVE;