Cycles: Remove few function arguments needed only for the split kernel
[blender.git] / intern / cycles / kernel / split / kernel_data_init.h
1 /*
2  * Copyright 2011-2015 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "kernel_split_common.h"
18
19 /* Note on kernel_data_initialization kernel
20  * This kernel Initializes structures needed in path-iteration kernels.
21  * This is the first kernel in ray-tracing logic.
22  *
23  * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE
24  *
25  * Its input and output are as follows,
26  *
27  * Un-initialized rng---------------|--- kernel_data_initialization ---|--- Initialized rng
28  * Un-initialized throughput -------|                                  |--- Initialized throughput
29  * Un-initialized L_transparent ----|                                  |--- Initialized L_transparent
30  * Un-initialized PathRadiance -----|                                  |--- Initialized PathRadiance
31  * Un-initialized Ray --------------|                                  |--- Initialized Ray
32  * Un-initialized PathState --------|                                  |--- Initialized PathState
33  * Un-initialized QueueData --------|                                  |--- Initialized QueueData (to QUEUE_EMPTY_SLOT)
34  * Un-initialized QueueIndex -------|                                  |--- Initialized QueueIndex (to 0)
35  * Un-initialized use_queues_flag---|                                  |--- Initialized use_queues_flag (to false)
36  * Un-initialized ray_state --------|                                  |--- Initialized ray_state
37  * parallel_samples --------------- |                                  |--- Initialized per_sample_output_buffers
38  * rng_state -----------------------|                                  |--- Initialized work_array
39  * data ----------------------------|                                  |--- Initialized work_pool_wgs
40  * start_sample --------------------|                                  |
41  * sx ------------------------------|                                  |
42  * sy ------------------------------|                                  |
43  * sw ------------------------------|                                  |
44  * sh ------------------------------|                                  |
45  * stride --------------------------|                                  |
46  * queuesize -----------------------|                                  |
47  * num_samples ---------------------|                                  |
48  *
49  * Note on Queues :
50  * All slots in queues are initialized to queue empty slot;
51  * The number of elements in the queues is initialized to 0;
52  */
53 ccl_device void kernel_data_init(
54         KernelGlobals *kg,
55         ShaderData *sd,
56         ShaderData *sd_DL_shadow,
57
58         ccl_global float3 *P_sd,
59         ccl_global float3 *P_sd_DL_shadow,
60
61         ccl_global float3 *N_sd,
62         ccl_global float3 *N_sd_DL_shadow,
63
64         ccl_global float3 *Ng_sd,
65         ccl_global float3 *Ng_sd_DL_shadow,
66
67         ccl_global float3 *I_sd,
68         ccl_global float3 *I_sd_DL_shadow,
69
70         ccl_global int *shader_sd,
71         ccl_global int *shader_sd_DL_shadow,
72
73         ccl_global int *flag_sd,
74         ccl_global int *flag_sd_DL_shadow,
75
76         ccl_global int *prim_sd,
77         ccl_global int *prim_sd_DL_shadow,
78
79         ccl_global int *type_sd,
80         ccl_global int *type_sd_DL_shadow,
81
82         ccl_global float *u_sd,
83         ccl_global float *u_sd_DL_shadow,
84
85         ccl_global float *v_sd,
86         ccl_global float *v_sd_DL_shadow,
87
88         ccl_global int *object_sd,
89         ccl_global int *object_sd_DL_shadow,
90
91         ccl_global float *time_sd,
92         ccl_global float *time_sd_DL_shadow,
93
94         ccl_global float *ray_length_sd,
95         ccl_global float *ray_length_sd_DL_shadow,
96
97         /* Ray differentials. */
98         ccl_global differential3 *dP_sd,
99         ccl_global differential3 *dP_sd_DL_shadow,
100
101         ccl_global differential3 *dI_sd,
102         ccl_global differential3 *dI_sd_DL_shadow,
103
104         ccl_global differential *du_sd,
105         ccl_global differential *du_sd_DL_shadow,
106
107         ccl_global differential *dv_sd,
108         ccl_global differential *dv_sd_DL_shadow,
109
110         /* Dp/Du */
111         ccl_global float3 *dPdu_sd,
112         ccl_global float3 *dPdu_sd_DL_shadow,
113
114         ccl_global float3 *dPdv_sd,
115         ccl_global float3 *dPdv_sd_DL_shadow,
116
117         /* Object motion. */
118         ccl_global Transform *ob_tfm_sd,
119         ccl_global Transform *ob_tfm_sd_DL_shadow,
120
121         ccl_global Transform *ob_itfm_sd,
122         ccl_global Transform *ob_itfm_sd_DL_shadow,
123
124         ShaderClosure *closure_sd,
125         ShaderClosure *closure_sd_DL_shadow,
126
127         ccl_global int *num_closure_sd,
128         ccl_global int *num_closure_sd_DL_shadow,
129
130         ccl_global float *randb_closure_sd,
131         ccl_global float *randb_closure_sd_DL_shadow,
132
133         ccl_global float3 *ray_P_sd,
134         ccl_global float3 *ray_P_sd_DL_shadow,
135
136         ccl_global differential3 *ray_dP_sd,
137         ccl_global differential3 *ray_dP_sd_DL_shadow,
138
139         ccl_constant KernelData *data,
140         ccl_global float *per_sample_output_buffers,
141         ccl_global uint *rng_state,
142         ccl_global uint *rng_coop,                   /* rng array to store rng values for all rays */
143         ccl_global float3 *throughput_coop,          /* throughput array to store throughput values for all rays */
144         ccl_global float *L_transparent_coop,        /* L_transparent array to store L_transparent values for all rays */
145         PathRadiance *PathRadiance_coop,             /* PathRadiance array to store PathRadiance values for all rays */
146         ccl_global Ray *Ray_coop,                    /* Ray array to store Ray information for all rays */
147         ccl_global PathState *PathState_coop,        /* PathState array to store PathState information for all rays */
148         Intersection *Intersection_coop_shadow,
149         ccl_global char *ray_state,                  /* Stores information on current state of a ray */
150
151 #define KERNEL_TEX(type, ttype, name)                                   \
152         ccl_global type *name,
153 #include "../kernel_textures.h"
154
155         int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
156         int rng_state_offset_x,
157         int rng_state_offset_y,
158         int rng_state_stride,
159         ccl_global int *Queue_data,                  /* Memory for queues */
160         ccl_global int *Queue_index,                 /* Tracks the number of elements in queues */
161         int queuesize,                               /* size (capacity) of the queue */
162         ccl_global char *use_queues_flag,            /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
163         ccl_global unsigned int *work_array,         /* work array to store which work each ray belongs to */
164 #ifdef __WORK_STEALING__
165         ccl_global unsigned int *work_pool_wgs,      /* Work pool for each work group */
166         unsigned int num_samples,                    /* Total number of samples per pixel */
167 #endif
168 #ifdef __KERNEL_DEBUG__
169         DebugData *debugdata_coop,
170 #endif
171         int parallel_samples)                        /* Number of samples to be processed in parallel */
172 {
173         kg->data = data;
174         kg->sd_input = sd_DL_shadow;
175         kg->isect_shadow = Intersection_coop_shadow;
176 #define KERNEL_TEX(type, ttype, name) \
177         kg->name = name;
178 #include "../kernel_textures.h"
179
180         sd->P = P_sd;
181         sd_DL_shadow->P = P_sd_DL_shadow;
182
183         sd->N = N_sd;
184         sd_DL_shadow->N = N_sd_DL_shadow;
185
186         sd->Ng = Ng_sd;
187         sd_DL_shadow->Ng = Ng_sd_DL_shadow;
188
189         sd->I = I_sd;
190         sd_DL_shadow->I = I_sd_DL_shadow;
191
192         sd->shader = shader_sd;
193         sd_DL_shadow->shader = shader_sd_DL_shadow;
194
195         sd->flag = flag_sd;
196         sd_DL_shadow->flag = flag_sd_DL_shadow;
197
198         sd->prim = prim_sd;
199         sd_DL_shadow->prim = prim_sd_DL_shadow;
200
201         sd->type = type_sd;
202         sd_DL_shadow->type = type_sd_DL_shadow;
203
204         sd->u = u_sd;
205         sd_DL_shadow->u = u_sd_DL_shadow;
206
207         sd->v = v_sd;
208         sd_DL_shadow->v = v_sd_DL_shadow;
209
210         sd->object = object_sd;
211         sd_DL_shadow->object = object_sd_DL_shadow;
212
213         sd->time = time_sd;
214         sd_DL_shadow->time = time_sd_DL_shadow;
215
216         sd->ray_length = ray_length_sd;
217         sd_DL_shadow->ray_length = ray_length_sd_DL_shadow;
218
219 #ifdef __RAY_DIFFERENTIALS__
220         sd->dP = dP_sd;
221         sd_DL_shadow->dP = dP_sd_DL_shadow;
222
223         sd->dI = dI_sd;
224         sd_DL_shadow->dI = dI_sd_DL_shadow;
225
226         sd->du = du_sd;
227         sd_DL_shadow->du = du_sd_DL_shadow;
228
229         sd->dv = dv_sd;
230         sd_DL_shadow->dv = dv_sd_DL_shadow;
231 #ifdef __DPDU__
232         sd->dPdu = dPdu_sd;
233         sd_DL_shadow->dPdu = dPdu_sd_DL_shadow;
234
235         sd->dPdv = dPdv_sd;
236         sd_DL_shadow->dPdv = dPdv_sd_DL_shadow;
237 #endif
238 #endif
239
240 #ifdef __OBJECT_MOTION__
241         sd->ob_tfm = ob_tfm_sd;
242         sd_DL_shadow->ob_tfm = ob_tfm_sd_DL_shadow;
243
244         sd->ob_itfm = ob_itfm_sd;
245         sd_DL_shadow->ob_itfm = ob_itfm_sd_DL_shadow;
246 #endif
247
248         sd->closure = closure_sd;
249         sd_DL_shadow->closure = closure_sd_DL_shadow;
250
251         sd->num_closure = num_closure_sd;
252         sd_DL_shadow->num_closure = num_closure_sd_DL_shadow;
253
254         sd->randb_closure = randb_closure_sd;
255         sd_DL_shadow->randb_closure = randb_closure_sd_DL_shadow;
256
257         sd->ray_P = ray_P_sd;
258         sd_DL_shadow->ray_P = ray_P_sd_DL_shadow;
259
260         sd->ray_dP = ray_dP_sd;
261         sd_DL_shadow->ray_dP = ray_dP_sd_DL_shadow;
262
263         int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
264
265 #ifdef __WORK_STEALING__
266         int lid = get_local_id(1) * get_local_size(0) + get_local_id(0);
267         /* Initialize work_pool_wgs */
268         if(lid == 0) {
269                 int group_index = get_group_id(1) * get_num_groups(0) + get_group_id(0);
270                 work_pool_wgs[group_index] = 0;
271         }
272         barrier(CLK_LOCAL_MEM_FENCE);
273 #endif  /* __WORK_STEALING__ */
274
275         /* Initialize queue data and queue index. */
276         if(thread_index < queuesize) {
277                 /* Initialize active ray queue. */
278                 Queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
279                 /* Initialize background and buffer update queue. */
280                 Queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
281                 /* Initialize shadow ray cast of AO queue. */
282                 Queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
283                 /* Initialize shadow ray cast of direct lighting queue. */
284                 Queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
285         }
286
287         if(thread_index == 0) {
288                 Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
289                 Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
290                 Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
291                 Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
292                 /* The scene-intersect kernel should not use the queues very first time.
293                  * since the queue would be empty.
294                  */
295                 use_queues_flag[0] = 0;
296         }
297
298         int x = get_global_id(0);
299         int y = get_global_id(1);
300
301         if(x < (sw * parallel_samples) && y < sh) {
302                 int ray_index = x + y * (sw * parallel_samples);
303
304                 /* This is the first assignment to ray_state;
305                  * So we dont use ASSIGN_RAY_STATE macro.
306                  */
307                 ray_state[ray_index] = RAY_ACTIVE;
308
309                 unsigned int my_sample;
310                 unsigned int pixel_x;
311                 unsigned int pixel_y;
312                 unsigned int tile_x;
313                 unsigned int tile_y;
314                 unsigned int my_sample_tile;
315
316 #ifdef __WORK_STEALING__
317                 unsigned int my_work = 0;
318                 /* Get work. */
319                 get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
320                 /* Get the sample associated with the work. */
321                 my_sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
322
323                 my_sample_tile = 0;
324
325                 /* Get pixel and tile position associated with the work. */
326                 get_pixel_tile_position(&pixel_x, &pixel_y,
327                                         &tile_x, &tile_y,
328                                         my_work,
329                                         sw, sh, sx, sy,
330                                         parallel_samples,
331                                         ray_index);
332                 work_array[ray_index] = my_work;
333 #else  /* __WORK_STEALING__ */
334                 unsigned int tile_index = ray_index / parallel_samples;
335                 tile_x = tile_index % sw;
336                 tile_y = tile_index / sw;
337                 my_sample_tile = ray_index - (tile_index * parallel_samples);
338                 my_sample = my_sample_tile + start_sample;
339
340                 /* Initialize work array. */
341                 work_array[ray_index] = my_sample ;
342
343                 /* Calculate pixel position of this ray. */
344                 pixel_x = sx + tile_x;
345                 pixel_y = sy + tile_y;
346 #endif  /* __WORK_STEALING__ */
347
348                 rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
349
350                 /* Initialise per_sample_output_buffers to all zeros. */
351                 per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + (my_sample_tile)) * kernel_data.film.pass_stride;
352                 int per_sample_output_buffers_iterator = 0;
353                 for(per_sample_output_buffers_iterator = 0;
354                     per_sample_output_buffers_iterator < kernel_data.film.pass_stride;
355                     per_sample_output_buffers_iterator++)
356                 {
357                         per_sample_output_buffers[per_sample_output_buffers_iterator] = 0.0f;
358                 }
359
360                 /* Initialize random numbers and ray. */
361                 kernel_path_trace_setup(kg,
362                                         rng_state,
363                                         my_sample,
364                                         pixel_x, pixel_y,
365                                         &rng_coop[ray_index],
366                                         &Ray_coop[ray_index]);
367
368                 if(Ray_coop[ray_index].t != 0.0f) {
369                         /* Initialize throughput, L_transparent, Ray, PathState;
370                          * These rays proceed with path-iteration.
371                          */
372                         throughput_coop[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
373                         L_transparent_coop[ray_index] = 0.0f;
374                         path_radiance_init(&PathRadiance_coop[ray_index], kernel_data.film.use_light_pass);
375                         path_state_init(kg,
376                                         &PathState_coop[ray_index],
377                                         &rng_coop[ray_index],
378                                         my_sample,
379                                         &Ray_coop[ray_index]);
380 #ifdef __KERNEL_DEBUG__
381                         debug_data_init(&debugdata_coop[ray_index]);
382 #endif
383                 } else {
384                         /* These rays do not participate in path-iteration. */
385                         float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
386                         /* Accumulate result in output buffer. */
387                         kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad);
388                         path_rng_end(kg, rng_state, rng_coop[ray_index]);
389                         ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
390                 }
391         }
392
393         /* Mark rest of the ray-state indices as RAY_INACTIVE. */
394         if(thread_index < (get_global_size(0) * get_global_size(1)) - (sh * (sw * parallel_samples))) {
395                 /* First assignment, hence we dont use ASSIGN_RAY_STATE macro */
396                 ray_state[((sw * parallel_samples) * sh) + thread_index] = RAY_INACTIVE;
397         }
398 }