// Commit 5283bd60bd5420d465662a8bd1c73ebd6ea1c9f6
// [blender-staging.git] / intern / cycles / device / device_split_kernel.cpp
/*
 * Copyright 2011-2016 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "device/device_split_kernel.h"

#include "kernel/kernel_types.h"
#include "kernel/split/kernel_split_data_types.h"

#include "util/util_logging.h"
#include "util/util_time.h"

CCL_NAMESPACE_BEGIN

27 static const double alpha = 0.1; /* alpha for rolling average */
29 DeviceSplitKernel::DeviceSplitKernel(Device *device)
30 : device(device),
31   split_data(device, "split_data", MEM_READ_WRITE),
32   ray_state(device, "ray_state", MEM_READ_WRITE),
33   queue_index(device, "queue_index"),
34   use_queues_flag(device, "use_queues_flag"),
35   work_pool_wgs(device, "work_pool_wgs")
36 {
37         current_max_closure = -1;
38         first_tile = true;
39
40         avg_time_per_sample = 0.0;
41
42         kernel_path_init = NULL;
43         kernel_scene_intersect = NULL;
44         kernel_lamp_emission = NULL;
45         kernel_do_volume = NULL;
46         kernel_queue_enqueue = NULL;
47         kernel_indirect_background = NULL;
48         kernel_shader_setup = NULL;
49         kernel_shader_sort = NULL;
50         kernel_shader_eval = NULL;
51         kernel_holdout_emission_blurring_pathtermination_ao = NULL;
52         kernel_subsurface_scatter = NULL;
53         kernel_direct_lighting = NULL;
54         kernel_shadow_blocked_ao = NULL;
55         kernel_shadow_blocked_dl = NULL;
56         kernel_enqueue_inactive = NULL;
57         kernel_next_iteration_setup = NULL;
58         kernel_indirect_subsurface = NULL;
59         kernel_buffer_update = NULL;
60 }
62 DeviceSplitKernel::~DeviceSplitKernel()
63 {
64         device->mem_free(split_data);
65         device->mem_free(ray_state);
66         device->mem_free(use_queues_flag);
67         device->mem_free(queue_index);
68         device->mem_free(work_pool_wgs);
69
70         delete kernel_path_init;
71         delete kernel_scene_intersect;
72         delete kernel_lamp_emission;
73         delete kernel_do_volume;
74         delete kernel_queue_enqueue;
75         delete kernel_indirect_background;
76         delete kernel_shader_setup;
77         delete kernel_shader_sort;
78         delete kernel_shader_eval;
79         delete kernel_holdout_emission_blurring_pathtermination_ao;
80         delete kernel_subsurface_scatter;
81         delete kernel_direct_lighting;
82         delete kernel_shadow_blocked_ao;
83         delete kernel_shadow_blocked_dl;
84         delete kernel_enqueue_inactive;
85         delete kernel_next_iteration_setup;
86         delete kernel_indirect_subsurface;
87         delete kernel_buffer_update;
88 }
90 bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_features)
91 {
92 #define LOAD_KERNEL(name) \
93                 kernel_##name = get_split_kernel_function(#name, requested_features); \
94                 if(!kernel_##name) { \
95                         return false; \
96                 }
97
98         LOAD_KERNEL(path_init);
99         LOAD_KERNEL(scene_intersect);
100         LOAD_KERNEL(lamp_emission);
101         LOAD_KERNEL(do_volume);
102         LOAD_KERNEL(queue_enqueue);
103         LOAD_KERNEL(indirect_background);
104         LOAD_KERNEL(shader_setup);
105         LOAD_KERNEL(shader_sort);
106         LOAD_KERNEL(shader_eval);
107         LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
108         LOAD_KERNEL(subsurface_scatter);
109         LOAD_KERNEL(direct_lighting);
110         LOAD_KERNEL(shadow_blocked_ao);
111         LOAD_KERNEL(shadow_blocked_dl);
112         LOAD_KERNEL(enqueue_inactive);
113         LOAD_KERNEL(next_iteration_setup);
114         LOAD_KERNEL(indirect_subsurface);
115         LOAD_KERNEL(buffer_update);
116
117 #undef LOAD_KERNEL
118
119         current_max_closure = requested_features.max_closure;
120
121         return true;
122 }
124 size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size)
125 {
126         uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024;
127         VLOG(1) << "Split state element size: "
128                 << string_human_readable_number(size_per_element) << " bytes. ("
129                 << string_human_readable_size(size_per_element) << ").";
130         return max_buffer_size / size_per_element;
131 }
133 bool DeviceSplitKernel::path_trace(DeviceTask *task,
134                                    RenderTile& tile,
135                                    device_memory& kgbuffer,
136                                    device_memory& kernel_data)
137 {
138         if(device->have_error()) {
139                 return false;
140         }
141
142         /* Get local size */
143         size_t local_size[2];
144         {
145                 int2 lsize = split_kernel_local_size();
146                 local_size[0] = lsize[0];
147                 local_size[1] = lsize[1];
148         }
149
150         /* Number of elements in the global state buffer */
151         int num_global_elements = global_size[0] * global_size[1];
152
153         /* Allocate all required global memory once. */
154         if(first_tile) {
155                 first_tile = false;
156
157                 /* Set gloabl size */
158                 {
159                         int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task);
160
161                         /* Make sure that set work size is a multiple of local
162                          * work size dimensions.
163                          */
164                         global_size[0] = round_up(gsize[0], local_size[0]);
165                         global_size[1] = round_up(gsize[1], local_size[1]);
166                 }
167
168                 num_global_elements = global_size[0] * global_size[1];
169                 assert(num_global_elements % WORK_POOL_SIZE == 0);
170
171                 /* Calculate max groups */
172
173                 /* Denotes the maximum work groups possible w.r.t. current requested tile size. */
174                 unsigned int work_pool_size = (device->info.type == DEVICE_CPU) ? WORK_POOL_SIZE_CPU : WORK_POOL_SIZE_GPU;
175                 unsigned int max_work_groups = num_global_elements / work_pool_size + 1;
176
177                 /* Allocate work_pool_wgs memory. */
178                 work_pool_wgs.resize(max_work_groups);
179                 device->mem_alloc(work_pool_wgs);
180
181                 queue_index.resize(NUM_QUEUES);
182                 device->mem_alloc(queue_index);
183
184                 use_queues_flag.resize(1);
185                 device->mem_alloc(use_queues_flag);
186
187                 ray_state.resize(num_global_elements);
188                 device->mem_alloc(ray_state);
189
190                 split_data.resize(state_buffer_size(kgbuffer, kernel_data, num_global_elements));
191                 device->mem_alloc(split_data);
192         }
193
194 #define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \
195                 if(device->have_error()) { \
196                         return false; \
197                 } \
198                 if(!kernel_##name->enqueue(KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \
199                         return false; \
200                 }
201
202         tile.sample = tile.start_sample;
203
204         /* for exponential increase between tile updates */
205         int time_multiplier = 1;
206
207         while(tile.sample < tile.start_sample + tile.num_samples) {
208                 /* to keep track of how long it takes to run a number of samples */
209                 double start_time = time_dt();
210
211                 /* initial guess to start rolling average */
212                 const int initial_num_samples = 1;
213                 /* approx number of samples per second */
214                 int samples_per_second = (avg_time_per_sample > 0.0) ?
215                                          int(double(time_multiplier) / avg_time_per_sample) + 1 : initial_num_samples;
216
217                 RenderTile subtile = tile;
218                 subtile.start_sample = tile.sample;
219                 subtile.num_samples = min(samples_per_second, tile.start_sample + tile.num_samples - tile.sample);
220
221                 if(device->have_error()) {
222                         return false;
223                 }
224
225                 /* reset state memory here as global size for data_init
226                  * kernel might not be large enough to do in kernel
227                  */
228                 device->mem_zero(work_pool_wgs);
229                 device->mem_zero(split_data);
230                 device->mem_zero(ray_state);
231
232                 if(!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
233                                                    subtile,
234                                                    num_global_elements,
235                                                    kgbuffer,
236                                                    kernel_data,
237                                                    split_data,
238                                                    ray_state,
239                                                    queue_index,
240                                                    use_queues_flag,
241                                                    work_pool_wgs))
242                 {
243                         return false;
244                 }
245
246                 ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size);
247
248                 bool activeRaysAvailable = true;
249                 double cancel_time = DBL_MAX;
250
251                 while(activeRaysAvailable) {
252                         /* Do path-iteration in host [Enqueue Path-iteration kernels. */
253                         for(int PathIter = 0; PathIter < 16; PathIter++) {
254                                 ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
255                                 ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
256                                 ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size);
257                                 ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
258                                 ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size);
259                                 ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, local_size);
260                                 ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, local_size);
261                                 ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
262                                 ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
263                                 ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size);
264                                 ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
265                                 ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
266                                 ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size);
267                                 ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size);
268                                 ENQUEUE_SPLIT_KERNEL(enqueue_inactive, global_size, local_size);
269                                 ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
270                                 ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size);
271                                 ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
272                                 ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size);
273
274                                 if(task->get_cancel() && cancel_time == DBL_MAX) {
275                                         /* Wait up to twice as many seconds for current samples to finish 
276                                          * to avoid artifacts in render result from ending too soon.
277                                          */
278                                         cancel_time = time_dt() + 2.0 * time_multiplier;
279                                 }
280
281                                 if(time_dt() > cancel_time) {
282                                         return true;
283                                 }
284                         }
285
286                         /* Decide if we should exit path-iteration in host. */
287                         device->mem_copy_from(ray_state, 0, global_size[0] * global_size[1] * sizeof(char), 1, 1);
288
289                         activeRaysAvailable = false;
290
291                         for(int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) {
292                                 if(!IS_STATE(ray_state.get_data(), rayStateIter, RAY_INACTIVE)) {
293                                         if(IS_STATE(ray_state.get_data(), rayStateIter, RAY_INVALID)) {
294                                                 /* Something went wrong, abort to avoid looping endlessly. */
295                                                 device->set_error("Split kernel error: invalid ray state");
296                                                 return false;
297                                         }
298
299                                         /* Not all rays are RAY_INACTIVE. */
300                                         activeRaysAvailable = true;
301                                         break;
302                                 }
303                         }
304
305                         if(time_dt() > cancel_time) {
306                                 return true;
307                         }
308                 }
309
310                 double time_per_sample = ((time_dt()-start_time) / subtile.num_samples);
311
312                 if(avg_time_per_sample == 0.0) {
313                         /* start rolling average */
314                         avg_time_per_sample = time_per_sample;
315                 }
316                 else {
317                         avg_time_per_sample = alpha*time_per_sample + (1.0-alpha)*avg_time_per_sample;
318                 }
319
320 #undef ENQUEUE_SPLIT_KERNEL
321
322                 tile.sample += subtile.num_samples;
323                 task->update_progress(&tile, tile.w*tile.h*subtile.num_samples);
324
325                 time_multiplier = min(time_multiplier << 1, 10);
326
327                 if(task->get_cancel()) {
328                         return true;
329                 }
330         }
331
332         return true;
333 }

CCL_NAMESPACE_END