Cycles: Make shadow catcher an optional feature for OpenCL
[blender.git] / intern / cycles / device / device_split_kernel.cpp
1 /*
2  * Copyright 2011-2016 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "device_split_kernel.h"
18
19 #include "kernel_types.h"
20 #include "kernel_split_data_types.h"
21
22 #include "util_time.h"
23
24 CCL_NAMESPACE_BEGIN
25
26 static const double alpha = 0.1; /* weight given to the newest time-per-sample measurement in the rolling average */
27
28 DeviceSplitKernel::DeviceSplitKernel(Device *device) : device(device)
29 {
30         current_max_closure = -1;
31         first_tile = true;
32
33         avg_time_per_sample = 0.0;
34
35         kernel_path_init = NULL;
36         kernel_scene_intersect = NULL;
37         kernel_lamp_emission = NULL;
38         kernel_do_volume = NULL;
39         kernel_queue_enqueue = NULL;
40         kernel_indirect_background = NULL;
41         kernel_shader_eval = NULL;
42         kernel_holdout_emission_blurring_pathtermination_ao = NULL;
43         kernel_subsurface_scatter = NULL;
44         kernel_direct_lighting = NULL;
45         kernel_shadow_blocked_ao = NULL;
46         kernel_shadow_blocked_dl = NULL;
47         kernel_next_iteration_setup = NULL;
48         kernel_indirect_subsurface = NULL;
49         kernel_buffer_update = NULL;
50 }
51
52 DeviceSplitKernel::~DeviceSplitKernel()
53 {
54         device->mem_free(split_data);
55         device->mem_free(ray_state);
56         device->mem_free(use_queues_flag);
57         device->mem_free(queue_index);
58         device->mem_free(work_pool_wgs);
59
60         delete kernel_path_init;
61         delete kernel_scene_intersect;
62         delete kernel_lamp_emission;
63         delete kernel_do_volume;
64         delete kernel_queue_enqueue;
65         delete kernel_indirect_background;
66         delete kernel_shader_eval;
67         delete kernel_holdout_emission_blurring_pathtermination_ao;
68         delete kernel_subsurface_scatter;
69         delete kernel_direct_lighting;
70         delete kernel_shadow_blocked_ao;
71         delete kernel_shadow_blocked_dl;
72         delete kernel_next_iteration_setup;
73         delete kernel_indirect_subsurface;
74         delete kernel_buffer_update;
75 }
76
77 bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_features)
78 {
79 #define LOAD_KERNEL(name) \
80                 kernel_##name = get_split_kernel_function(#name, requested_features); \
81                 if(!kernel_##name) { \
82                         return false; \
83                 }
84
85         LOAD_KERNEL(path_init);
86         LOAD_KERNEL(scene_intersect);
87         LOAD_KERNEL(lamp_emission);
88         LOAD_KERNEL(do_volume);
89         LOAD_KERNEL(queue_enqueue);
90         LOAD_KERNEL(indirect_background);
91         LOAD_KERNEL(shader_eval);
92         LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
93         LOAD_KERNEL(subsurface_scatter);
94         LOAD_KERNEL(direct_lighting);
95         LOAD_KERNEL(shadow_blocked_ao);
96         LOAD_KERNEL(shadow_blocked_dl);
97         LOAD_KERNEL(next_iteration_setup);
98         LOAD_KERNEL(indirect_subsurface);
99         LOAD_KERNEL(buffer_update);
100
101 #undef LOAD_KERNEL
102
103         current_max_closure = requested_features.max_closure;
104
105         return true;
106 }
107
108 size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size)
109 {
110         uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024;
111         return max_buffer_size / size_per_element;
112 }
113
114 bool DeviceSplitKernel::path_trace(DeviceTask *task,
115                                    RenderTile& tile,
116                                    device_memory& kgbuffer,
117                                    device_memory& kernel_data)
118 {
119         if(device->have_error()) {
120                 return false;
121         }
122
123         /* Get local size */
124         size_t local_size[2];
125         {
126                 int2 lsize = split_kernel_local_size();
127                 local_size[0] = lsize[0];
128                 local_size[1] = lsize[1];
129         }
130
131         /* Set gloabl size */
132         size_t global_size[2];
133         {
134                 int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task);
135
136                 /* Make sure that set work size is a multiple of local
137                  * work size dimensions.
138                  */
139                 global_size[0] = round_up(gsize[0], local_size[0]);
140                 global_size[1] = round_up(gsize[1], local_size[1]);
141         }
142
143         /* Number of elements in the global state buffer */
144         int num_global_elements = global_size[0] * global_size[1];
145         assert(num_global_elements % WORK_POOL_SIZE == 0);
146
147         /* Allocate all required global memory once. */
148         if(first_tile) {
149                 first_tile = false;
150
151                 /* Calculate max groups */
152
153                 /* Denotes the maximum work groups possible w.r.t. current requested tile size. */
154                 unsigned int max_work_groups = num_global_elements / WORK_POOL_SIZE + 1;
155
156                 /* Allocate work_pool_wgs memory. */
157                 work_pool_wgs.resize(max_work_groups * sizeof(unsigned int));
158                 device->mem_alloc("work_pool_wgs", work_pool_wgs, MEM_READ_WRITE);
159
160                 queue_index.resize(NUM_QUEUES * sizeof(int));
161                 device->mem_alloc("queue_index", queue_index, MEM_READ_WRITE);
162
163                 use_queues_flag.resize(sizeof(char));
164                 device->mem_alloc("use_queues_flag", use_queues_flag, MEM_READ_WRITE);
165
166                 ray_state.resize(num_global_elements);
167                 device->mem_alloc("ray_state", ray_state, MEM_READ_WRITE);
168
169                 split_data.resize(state_buffer_size(kgbuffer, kernel_data, num_global_elements));
170                 device->mem_alloc("split_data", split_data, MEM_READ_WRITE);
171         }
172
173 #define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \
174                 if(device->have_error()) { \
175                         return false; \
176                 } \
177                 if(!kernel_##name->enqueue(KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \
178                         return false; \
179                 }
180
181         tile.sample = tile.start_sample;
182
183         /* for exponential increase between tile updates */
184         int time_multiplier = 1;
185
186         while(tile.sample < tile.start_sample + tile.num_samples) {
187                 /* to keep track of how long it takes to run a number of samples */
188                 double start_time = time_dt();
189
190                 /* initial guess to start rolling average */
191                 const int initial_num_samples = 1;
192                 /* approx number of samples per second */
193                 int samples_per_second = (avg_time_per_sample > 0.0) ?
194                                          int(double(time_multiplier) / avg_time_per_sample) + 1 : initial_num_samples;
195
196                 RenderTile subtile = tile;
197                 subtile.start_sample = tile.sample;
198                 subtile.num_samples = min(samples_per_second, tile.start_sample + tile.num_samples - tile.sample);
199
200                 if(device->have_error()) {
201                         return false;
202                 }
203
204                 /* reset state memory here as global size for data_init
205                  * kernel might not be large enough to do in kernel
206                  */
207                 device->mem_zero(work_pool_wgs);
208                 device->mem_zero(split_data);
209                 device->mem_zero(ray_state);
210
211                 if(!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
212                                                    subtile,
213                                                    num_global_elements,
214                                                    kgbuffer,
215                                                    kernel_data,
216                                                    split_data,
217                                                    ray_state,
218                                                    queue_index,
219                                                    use_queues_flag,
220                                                    work_pool_wgs))
221                 {
222                         return false;
223                 }
224
225                 ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size);
226
227                 bool activeRaysAvailable = true;
228
229                 while(activeRaysAvailable) {
230                         /* Do path-iteration in host [Enqueue Path-iteration kernels. */
231                         for(int PathIter = 0; PathIter < 16; PathIter++) {
232                                 ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
233                                 ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
234                                 ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size);
235                                 ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
236                                 ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size);
237                                 ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
238                                 ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
239                                 ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size);
240                                 ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
241                                 ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size);
242                                 ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size);
243                                 ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
244                                 ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size);
245                                 ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
246                                 ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size);
247
248                                 if(task->get_cancel()) {
249                                         return true;
250                                 }
251                         }
252
253                         /* Decide if we should exit path-iteration in host. */
254                         device->mem_copy_from(ray_state, 0, global_size[0] * global_size[1] * sizeof(char), 1, 1);
255
256                         activeRaysAvailable = false;
257
258                         for(int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) {
259                                 int8_t state = ray_state.get_data()[rayStateIter];
260
261                                 if(state != RAY_INACTIVE) {
262                                         if(state == RAY_INVALID) {
263                                                 /* Something went wrong, abort to avoid looping endlessly. */
264                                                 device->set_error("Split kernel error: invalid ray state");
265                                                 return false;
266                                         }
267
268                                         /* Not all rays are RAY_INACTIVE. */
269                                         activeRaysAvailable = true;
270                                         break;
271                                 }
272                         }
273
274                         if(task->get_cancel()) {
275                                 return true;
276                         }
277                 }
278
279                 double time_per_sample = ((time_dt()-start_time) / subtile.num_samples);
280
281                 if(avg_time_per_sample == 0.0) {
282                         /* start rolling average */
283                         avg_time_per_sample = time_per_sample;
284                 }
285                 else {
286                         avg_time_per_sample = alpha*time_per_sample + (1.0-alpha)*avg_time_per_sample;
287                 }
288
289 #undef ENQUEUE_SPLIT_KERNEL
290
291                 tile.sample += subtile.num_samples;
292                 task->update_progress(&tile, tile.w*tile.h*subtile.num_samples);
293
294                 time_multiplier = min(time_multiplier << 1, 10);
295
296                 if(task->get_cancel()) {
297                         return true;
298                 }
299         }
300
301         return true;
302 }
303
304 CCL_NAMESPACE_END
305
306